All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/28] ext4: xattr-in-inode support
@ 2017-05-31  8:14 Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 02/28] ext4: fix lockdep warning about recursive inode locking Tahsin Erdogan
                   ` (27 more replies)
  0 siblings, 28 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Andreas Dilger, Kalpak Shah, James Simmons,
	Tahsin Erdogan

From: Andreas Dilger <andreas.dilger@intel.com>

Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE.

If the size of an xattr value is larger than will fit in a single
external block, then the xattr value will be saved into the body
of an external xattr inode.

This also helps support a larger number of xattrs, since only the headers
will be stored in the in-inode space or the single external block.

The inode is referenced from the xattr header via "e_value_inum",
which was formerly "e_value_block", but that field was never used.
The e_value_size still contains the xattr size so that listing
xattrs does not need to look up the inode if the data is not accessed.

struct ext4_xattr_entry {
        __u8    e_name_len;     /* length of name */
        __u8    e_name_index;   /* attribute name index */
        __le16  e_value_offs;   /* offset in disk block of value */
        __le32  e_value_inum;   /* inode in which value is stored */
        __le32  e_value_size;   /* size of attribute value */
        __le32  e_hash;         /* hash value of name and value */
        char    e_name[0];      /* attribute name */
};

The xattr inode is marked with the EXT4_EA_INODE_FL flag and also
holds a back-reference to the owning inode in its i_mtime field,
allowing ext4 and e2fsck to verify that the correct inode is accessed.

Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80
Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424
Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
Signed-off-by: James Simmons <uja.ornl@gmail.com>
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h   |  12 ++
 fs/ext4/ialloc.c |   1 -
 fs/ext4/inline.c |   2 +-
 fs/ext4/inode.c  |  49 ++++-
 fs/ext4/xattr.c  | 565 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/xattr.h  |  33 +++-
 6 files changed, 606 insertions(+), 56 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32191548abed..24ef56b4572f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1797,6 +1797,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_EA_INODE| \
 					 EXT4_FEATURE_INCOMPAT_MMP | \
 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
@@ -2220,6 +2221,12 @@ struct mmpd_data {
 #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
 
 /*
+ * Maximum size of an xattr value for FEATURE_INCOMPAT_EA_INODE: 1MB.
+ * This limit is arbitrary, but is reasonable for the xattr API.
+ */
+#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
+
+/*
  * Function prototypes
  */
 
@@ -2231,6 +2238,10 @@ struct mmpd_data {
 # define ATTRIB_NORET	__attribute__((noreturn))
 # define NORET_AND	noreturn,
 
+struct ext4_xattr_ino_array {
+	unsigned int xia_count;		/* # of used items in the array */
+	unsigned int xia_inodes[0];
+};
 /* bitmap.c */
 extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
 void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
@@ -2478,6 +2489,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 98ac2f1f23b3..e2eb3cc06820 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	 * as writing the quota to disk may need the lock as well.
 	 */
 	dquot_initialize(inode);
-	ext4_xattr_delete_inode(handle, inode);
 	dquot_free_inode(inode);
 	dquot_drop(inode);
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8d141c0c8ff9..28c5c3abddb3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 
 	/* Compute min_offs. */
 	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_block && entry->e_value_size) {
+		if (!entry->e_value_inum && entry->e_value_size) {
 			size_t offs = le16_to_cpu(entry->e_value_offs);
 			if (offs < min_offs)
 				min_offs = offs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cf82d03968c..e5535e5b3dc5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
 				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
-				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode)
 {
 	handle_t *handle;
 	int err;
+	int extra_credits = 3;
+	struct ext4_xattr_ino_array *lea_ino_array = NULL;
 
 	trace_ext4_evict_inode(inode);
 
@@ -238,8 +238,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-				    ext4_blocks_for_truncate(inode)+3);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -251,9 +251,36 @@ void ext4_evict_inode(struct inode *inode)
 		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
-
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
+
+	/*
+	 * Delete xattr inode before deleting the main inode.
+	 */
+	err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
+	if (err) {
+		ext4_warning(inode->i_sb,
+			     "couldn't delete inode's xattr (err %d)", err);
+		goto stop_handle;
+	}
+
+	if (!IS_NOQUOTA(inode))
+		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
+
+	if (!ext4_handle_has_enough_credits(handle,
+			ext4_blocks_for_truncate(inode) + extra_credits)) {
+		err = ext4_journal_extend(handle,
+			ext4_blocks_for_truncate(inode) + extra_credits);
+		if (err > 0)
+			err = ext4_journal_restart(handle,
+			ext4_blocks_for_truncate(inode) + extra_credits);
+		if (err != 0) {
+			ext4_warning(inode->i_sb,
+				     "couldn't extend journal (err %d)", err);
+			goto stop_handle;
+		}
+	}
+
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -277,10 +304,10 @@ void ext4_evict_inode(struct inode *inode)
 	 * enough credits left in the handle to remove the inode from
 	 * the orphan list and set the dtime field.
 	 */
-	if (!ext4_handle_has_enough_credits(handle, 3)) {
-		err = ext4_journal_extend(handle, 3);
+	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
+		err = ext4_journal_extend(handle, extra_credits);
 		if (err > 0)
-			err = ext4_journal_restart(handle, 3);
+			err = ext4_journal_restart(handle, extra_credits);
 		if (err != 0) {
 			ext4_warning(inode->i_sb,
 				     "couldn't extend journal (err %d)", err);
@@ -315,8 +342,12 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
+
 	ext4_journal_stop(handle);
 	sb_end_intwrite(inode->i_sb);
+
+	if (lea_ino_array != NULL)
+		ext4_xattr_inode_array_free(inode, lea_ino_array);
 	return;
 no_delete:
 	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
@@ -5504,7 +5535,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
 				  int pextents)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5d3c2536641c..444be5c7a1d5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -177,9 +177,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
 
 	/* Check the values */
 	while (!IS_LAST_ENTRY(entry)) {
-		if (entry->e_value_block != 0)
-			return -EFSCORRUPTED;
-		if (entry->e_value_size != 0) {
+		if (entry->e_value_size != 0 &&
+		    entry->e_value_inum == 0) {
 			u16 offs = le16_to_cpu(entry->e_value_offs);
 			u32 size = le32_to_cpu(entry->e_value_size);
 			void *value;
@@ -269,6 +268,99 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 	return cmp ? -ENODATA : 0;
 }
 
+/*
+ * Read the EA value from an inode.
+ */
+static int
+ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
+{
+	unsigned long block = 0;
+	struct buffer_head *bh = NULL;
+	int blocksize;
+	size_t csize, ret_size = 0;
+
+	if (*size == 0)
+		return 0;
+
+	blocksize = ea_inode->i_sb->s_blocksize;
+
+	while (ret_size < *size) {
+		csize = (*size - ret_size) > blocksize ? blocksize :
+							*size - ret_size;
+		bh = ext4_bread(NULL, ea_inode, block, 0);
+		if (IS_ERR(bh)) {
+			*size = ret_size;
+			return PTR_ERR(bh);
+		}
+		memcpy(buf, bh->b_data, csize);
+		brelse(bh);
+
+		buf += csize;
+		block += 1;
+		ret_size += csize;
+	}
+
+	*size = ret_size;
+
+	return 0;
+}
+
+struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
+{
+	struct inode *ea_inode = NULL;
+
+	ea_inode = ext4_iget(parent->i_sb, ea_ino);
+	if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
+		int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
+		ext4_error(parent->i_sb, "error while reading EA inode %lu "
+			   "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode));
+		*err = rc != 0 ? rc : -EIO;
+		return NULL;
+	}
+
+	if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
+	    ea_inode->i_generation != parent->i_generation) {
+		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
+			   "to parent invalid.", ea_ino);
+		*err = -EINVAL;
+		goto error;
+	}
+
+	if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
+		ext4_error(parent->i_sb, "EA inode %lu does not have "
+			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
+		*err = -EINVAL;
+		goto error;
+	}
+
+	*err = 0;
+	return ea_inode;
+
+error:
+	iput(ea_inode);
+	return NULL;
+}
+
+/*
+ * Read the value from the EA inode.
+ */
+static int
+ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
+		     size_t *size)
+{
+	struct inode *ea_inode = NULL;
+	int err;
+
+	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
+	if (err)
+		return err;
+
+	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	iput(ea_inode);
+
+	return err;
+}
+
 static int
 ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		     void *buffer, size_t buffer_size)
@@ -308,8 +400,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		error = -ERANGE;
 		if (size > buffer_size)
 			goto cleanup;
-		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
-		       size);
+		if (entry->e_value_inum) {
+			error = ext4_xattr_inode_get(inode,
+					     le32_to_cpu(entry->e_value_inum),
+					     buffer, &size);
+			if (error)
+				goto cleanup;
+		} else {
+			memcpy(buffer, bh->b_data +
+			       le16_to_cpu(entry->e_value_offs), size);
+		}
 	}
 	error = size;
 
@@ -350,8 +450,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 		error = -ERANGE;
 		if (size > buffer_size)
 			goto cleanup;
-		memcpy(buffer, (void *)IFIRST(header) +
-		       le16_to_cpu(entry->e_value_offs), size);
+		if (entry->e_value_inum) {
+			error = ext4_xattr_inode_get(inode,
+					     le32_to_cpu(entry->e_value_inum),
+					     buffer, &size);
+			if (error)
+				goto cleanup;
+		} else {
+			memcpy(buffer, (void *)IFIRST(header) +
+			       le16_to_cpu(entry->e_value_offs), size);
+		}
 	}
 	error = size;
 
@@ -620,7 +728,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
 				    size_t *min_offs, void *base, int *total)
 {
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		if (last->e_value_size) {
+		if (!last->e_value_inum && last->e_value_size) {
 			size_t offs = le16_to_cpu(last->e_value_offs);
 			if (offs < *min_offs)
 				*min_offs = offs;
@@ -631,16 +739,173 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
 	return (*min_offs - ((void *)last - base) - sizeof(__u32));
 }
 
-static int
-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+/*
+ * Write the value of the EA in an inode.
+ */
+static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
+				  const void *buf, int bufsize)
+{
+	struct buffer_head *bh = NULL;
+	unsigned long block = 0;
+	unsigned blocksize = ea_inode->i_sb->s_blocksize;
+	unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
+	int csize, wsize = 0;
+	int ret = 0;
+	int retries = 0;
+
+retry:
+	while (ret >= 0 && ret < max_blocks) {
+		struct ext4_map_blocks map;
+		map.m_lblk = block += ret;
+		map.m_len = max_blocks -= ret;
+
+		ret = ext4_map_blocks(handle, ea_inode, &map,
+				      EXT4_GET_BLOCKS_CREATE);
+		if (ret <= 0) {
+			ext4_mark_inode_dirty(handle, ea_inode);
+			if (ret == -ENOSPC &&
+			    ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
+				ret = 0;
+				goto retry;
+			}
+			break;
+		}
+	}
+
+	if (ret < 0)
+		return ret;
+
+	block = 0;
+	while (wsize < bufsize) {
+		if (bh != NULL)
+			brelse(bh);
+		csize = (bufsize - wsize) > blocksize ? blocksize :
+								bufsize - wsize;
+		bh = ext4_getblk(handle, ea_inode, block, 0);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
+			goto out;
+		}
+		ret = ext4_journal_get_write_access(handle, bh);
+		if (ret)
+			goto out;
+
+		memcpy(bh->b_data, buf, csize);
+		set_buffer_uptodate(bh);
+		ext4_handle_dirty_metadata(handle, ea_inode, bh);
+
+		buf += csize;
+		wsize += csize;
+		block += 1;
+	}
+
+	inode_lock(ea_inode);
+	i_size_write(ea_inode, wsize);
+	ext4_update_i_disksize(ea_inode, wsize);
+	inode_unlock(ea_inode);
+
+	ext4_mark_inode_dirty(handle, ea_inode);
+
+out:
+	brelse(bh);
+
+	return ret;
+}
+
+/*
+ * Create an inode to store the value of a large EA.
+ */
+static struct inode *ext4_xattr_inode_create(handle_t *handle,
+					     struct inode *inode)
+{
+	struct inode *ea_inode = NULL;
+
+	/*
+	 * Let the next inode be the goal, so we try and allocate the EA inode
+	 * in the same group, or nearby one.
+	 */
+	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
+	if (!IS_ERR(ea_inode)) {
+		ea_inode->i_op = &ext4_file_inode_operations;
+		ea_inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(ea_inode);
+		ea_inode->i_generation = inode->i_generation;
+		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
+
+		/*
+		 * A back-pointer from EA inode to parent inode will be useful
+		 * for e2fsck.
+		 */
+		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
+		unlock_new_inode(ea_inode);
+	}
+
+	return ea_inode;
+}
+
+/*
+ * Unlink the inode storing the value of the EA.
+ */
+int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+{
+	struct inode *ea_inode = NULL;
+	int err;
+
+	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
+	if (err)
+		return err;
+
+	clear_nlink(ea_inode);
+	iput(ea_inode);
+
+	return 0;
+}
+
+/*
+ * Add value of the EA in an inode.
+ */
+static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
+				unsigned long *ea_ino, const void *value,
+				size_t value_len)
+{
+	struct inode *ea_inode;
+	int err;
+
+	/* Create an inode for the EA value */
+	ea_inode = ext4_xattr_inode_create(handle, inode);
+	if (IS_ERR(ea_inode))
+		return PTR_ERR(ea_inode);
+
+	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
+	if (err)
+		clear_nlink(ea_inode);
+	else
+		*ea_ino = ea_inode->i_ino;
+
+	iput(ea_inode);
+
+	return err;
+}
+
+static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+				struct ext4_xattr_search *s,
+				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
 	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+	int in_inode = i->in_inode;
+	int rc;
+
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    (EXT4_XATTR_SIZE(i->value_len) >
+	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
+		in_inode = 1;
 
 	/* Compute min_offs and last. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		if (last->e_value_size) {
+		if (!last->e_value_inum && last->e_value_size) {
 			size_t offs = le16_to_cpu(last->e_value_offs);
 			if (offs < min_offs)
 				min_offs = offs;
@@ -648,15 +913,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 	}
 	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
 	if (!s->not_found) {
-		if (s->here->e_value_size) {
+		if (!in_inode &&
+		    !s->here->e_value_inum && s->here->e_value_size) {
 			size_t size = le32_to_cpu(s->here->e_value_size);
 			free += EXT4_XATTR_SIZE(size);
 		}
 		free += EXT4_XATTR_LEN(name_len);
 	}
 	if (i->value) {
-		if (free < EXT4_XATTR_LEN(name_len) +
-			   EXT4_XATTR_SIZE(i->value_len))
+		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+
+		if (in_inode)
+			value_len = 0;
+
+		if (free < EXT4_XATTR_LEN(name_len) + value_len)
 			return -ENOSPC;
 	}
 
@@ -670,7 +940,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 		s->here->e_name_len = name_len;
 		memcpy(s->here->e_name, i->name, name_len);
 	} else {
-		if (s->here->e_value_size) {
+		if (!s->here->e_value_inum && s->here->e_value_size &&
+		    s->here->e_value_offs > 0) {
 			void *first_val = s->base + min_offs;
 			size_t offs = le16_to_cpu(s->here->e_value_offs);
 			void *val = s->base + offs;
@@ -704,12 +975,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 			last = s->first;
 			while (!IS_LAST_ENTRY(last)) {
 				size_t o = le16_to_cpu(last->e_value_offs);
-				if (last->e_value_size && o < offs)
+				if (!last->e_value_inum &&
+				    last->e_value_size && o < offs)
 					last->e_value_offs =
 						cpu_to_le16(o + size);
 				last = EXT4_XATTR_NEXT(last);
 			}
 		}
+		if (s->here->e_value_inum) {
+			ext4_xattr_inode_unlink(inode,
+					    le32_to_cpu(s->here->e_value_inum));
+			s->here->e_value_inum = 0;
+		}
 		if (!i->value) {
 			/* Remove the old name. */
 			size_t size = EXT4_XATTR_LEN(name_len);
@@ -722,11 +999,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 
 	if (i->value) {
 		/* Insert the new value. */
-		s->here->e_value_size = cpu_to_le32(i->value_len);
-		if (i->value_len) {
+		if (in_inode) {
+			unsigned long ea_ino =
+				le32_to_cpu(s->here->e_value_inum);
+			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
+						  i->value, i->value_len);
+			if (rc)
+				goto out;
+			s->here->e_value_inum = cpu_to_le32(ea_ino);
+			s->here->e_value_offs = 0;
+		} else if (i->value_len) {
 			size_t size = EXT4_XATTR_SIZE(i->value_len);
 			void *val = s->base + min_offs - size;
 			s->here->e_value_offs = cpu_to_le16(min_offs - size);
+			s->here->e_value_inum = 0;
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
 				memset(val, 0, size);
 			} else {
@@ -736,8 +1022,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 				memcpy(val, i->value, i->value_len);
 			}
 		}
+		s->here->e_value_size = cpu_to_le32(i->value_len);
 	}
-	return 0;
+
+out:
+	return rc;
 }
 
 struct ext4_xattr_block_find {
@@ -801,8 +1090,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
-	if (i->value && i->value_len > sb->s_blocksize)
-		return -ENOSPC;
 	if (s->base) {
 		BUFFER_TRACE(bs->bh, "get_write_access");
 		error = ext4_journal_get_write_access(handle, bs->bh);
@@ -821,7 +1108,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			mb_cache_entry_delete_block(ext4_mb_cache, hash,
 						    bs->bh->b_blocknr);
 			ea_bdebug(bs->bh, "modifying in-place");
-			error = ext4_xattr_set_entry(i, s);
+			error = ext4_xattr_set_entry(i, s, handle, inode);
 			if (!error) {
 				if (!IS_LAST_ENTRY(s->first))
 					ext4_xattr_rehash(header(s->base),
@@ -870,7 +1157,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		s->end = s->base + sb->s_blocksize;
 	}
 
-	error = ext4_xattr_set_entry(i, s);
+	error = ext4_xattr_set_entry(i, s, handle, inode);
 	if (error == -EFSCORRUPTED)
 		goto bad_block;
 	if (error)
@@ -1070,7 +1357,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 
 	if (EXT4_I(inode)->i_extra_isize == 0)
 		return -ENOSPC;
-	error = ext4_xattr_set_entry(i, s);
+	error = ext4_xattr_set_entry(i, s, handle, inode);
 	if (error) {
 		if (error == -ENOSPC &&
 		    ext4_has_inline_data(inode)) {
@@ -1082,7 +1369,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 			error = ext4_xattr_ibody_find(inode, i, is);
 			if (error)
 				return error;
-			error = ext4_xattr_set_entry(i, s);
+			error = ext4_xattr_set_entry(i, s, handle, inode);
 		}
 		if (error)
 			return error;
@@ -1098,7 +1385,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
-static int ext4_xattr_ibody_set(struct inode *inode,
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 				struct ext4_xattr_info *i,
 				struct ext4_xattr_ibody_find *is)
 {
@@ -1108,7 +1395,7 @@ static int ext4_xattr_ibody_set(struct inode *inode,
 
 	if (EXT4_I(inode)->i_extra_isize == 0)
 		return -ENOSPC;
-	error = ext4_xattr_set_entry(i, s);
+	error = ext4_xattr_set_entry(i, s, handle, inode);
 	if (error)
 		return error;
 	header = IHDR(inode, ext4_raw_inode(&is->iloc));
@@ -1155,7 +1442,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		.name = name,
 		.value = value,
 		.value_len = value_len,
-
+		.in_inode = 0,
 	};
 	struct ext4_xattr_ibody_find is = {
 		.s = { .not_found = -ENODATA, },
@@ -1204,7 +1491,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	}
 	if (!value) {
 		if (!is.s.not_found)
-			error = ext4_xattr_ibody_set(inode, &i, &is);
+			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
 		else if (!bs.s.not_found)
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
 	} else {
@@ -1215,7 +1502,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
 			goto cleanup;
 
-		error = ext4_xattr_ibody_set(inode, &i, &is);
+		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
 		if (!error && !bs.s.not_found) {
 			i.value = NULL;
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
@@ -1226,11 +1513,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 					goto cleanup;
 			}
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+			if (ext4_has_feature_ea_inode(inode->i_sb) &&
+			    error == -ENOSPC) {
+				/* xattr does not fit in the block; store the
+				 * value in an external inode */
+				i.in_inode = 1;
+				error = ext4_xattr_ibody_set(handle, inode,
+							     &i, &is);
+			}
 			if (error)
 				goto cleanup;
 			if (!is.s.not_found) {
 				i.value = NULL;
-				error = ext4_xattr_ibody_set(inode, &i, &is);
+				error = ext4_xattr_ibody_set(handle, inode, &i,
+							     &is);
 			}
 		}
 	}
@@ -1269,12 +1565,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 	       const void *value, size_t value_len, int flags)
 {
 	handle_t *handle;
+	struct super_block *sb = inode->i_sb;
 	int error, retries = 0;
 	int credits = ext4_jbd2_credits_xattr(inode);
 
 	error = dquot_initialize(inode);
 	if (error)
 		return error;
+
+	if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
+	    ext4_has_feature_ea_inode(sb)) {
+		int nrblocks = (value_len + sb->s_blocksize - 1) >>
+					sb->s_blocksize_bits;
+
+		/* For new inode */
+		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+
+		/* For data blocks of EA inode */
+		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	}
+
 retry:
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
@@ -1286,7 +1596,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 					      value, value_len, flags);
 		error2 = ext4_journal_stop(handle);
 		if (error == -ENOSPC &&
-		    ext4_should_retry_alloc(inode->i_sb, &retries))
+		    ext4_should_retry_alloc(sb, &retries))
 			goto retry;
 		if (error == 0)
 			error = error2;
@@ -1311,7 +1621,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
 
 	/* Adjust the value offsets of the entries */
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		if (last->e_value_size) {
+		if (!last->e_value_inum && last->e_value_size) {
 			new_offs = le16_to_cpu(last->e_value_offs) +
 							value_offs_shift;
 			last->e_value_offs = cpu_to_le16(new_offs);
@@ -1372,7 +1682,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
 		goto out;
 
 	/* Remove the chosen entry from the inode */
-	error = ext4_xattr_ibody_set(inode, &i, is);
+	error = ext4_xattr_ibody_set(handle, inode, &i, is);
 	if (error)
 		goto out;
 
@@ -1572,21 +1882,135 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 }
 
 
+#define EIA_INCR 16 /* must be 2^n */
+#define EIA_MASK (EIA_INCR - 1)
+/* Add the large xattr @ino into @lea_ino_array for later deletion.
+ * If @lea_ino_array is new or full it will be grown and the old
+ * contents copied over.
+ */
+static int
+ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
+{
+	if (*lea_ino_array == NULL) {
+		/*
+		 * Start with 15 inodes, so it fits into a power-of-two size.
+		 * If *lea_ino_array is NULL, this is essentially offsetof()
+		 */
+		(*lea_ino_array) =
+			kmalloc(offsetof(struct ext4_xattr_ino_array,
+					 xia_inodes[EIA_MASK]),
+				GFP_NOFS);
+		if (*lea_ino_array == NULL)
+			return -ENOMEM;
+		(*lea_ino_array)->xia_count = 0;
+	} else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
+		/* expand the array once all 15 + n * 16 slots are full */
+		struct ext4_xattr_ino_array *new_array = NULL;
+		int count = (*lea_ino_array)->xia_count;
+
+		/* if new_array is NULL, this is essentially offsetof() */
+		new_array = kmalloc(
+				offsetof(struct ext4_xattr_ino_array,
+					 xia_inodes[count + EIA_INCR]),
+				GFP_NOFS);
+		if (new_array == NULL)
+			return -ENOMEM;
+		memcpy(new_array, *lea_ino_array,
+		       offsetof(struct ext4_xattr_ino_array,
+				xia_inodes[count]));
+		kfree(*lea_ino_array);
+		*lea_ino_array = new_array;
+	}
+	(*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
+	return 0;
+}
+
+/**
+ * Add xattr inode to orphan list
+ */
+static int
+ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
+			int credits, struct ext4_xattr_ino_array *lea_ino_array)
+{
+	struct inode *ea_inode = NULL;
+	int idx = 0, error = 0;
+
+	if (lea_ino_array == NULL)
+		return 0;
+
+	for (; idx < lea_ino_array->xia_count; ++idx) {
+		if (!ext4_handle_has_enough_credits(handle, credits)) {
+			error = ext4_journal_extend(handle, credits);
+			if (error > 0)
+				error = ext4_journal_restart(handle, credits);
+
+			if (error != 0) {
+				ext4_warning(inode->i_sb,
+					"couldn't extend journal "
+					"(err %d)", error);
+				return error;
+			}
+		}
+		ea_inode = ext4_xattr_inode_iget(inode,
+				lea_ino_array->xia_inodes[idx], &error);
+		if (error)
+			continue;
+		ext4_orphan_add(handle, ea_inode);
+		/* the inode's i_count will be released by caller */
+	}
+
+	return 0;
+}
 
 /*
  * ext4_xattr_delete_inode()
  *
- * Free extended attribute resources associated with this inode. This
+ * Free extended attribute resources associated with this inode. Traverse
+ * all entries and unlink any xattr inodes associated with this inode. This
  * is called immediately before an inode is freed. We have exclusive
- * access to the inode.
+ * access to the inode. If an orphan inode is deleted it will also delete any
+ * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
+ * to ensure they belong to the parent inode and were not deleted already.
  */
-void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+int
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			struct ext4_xattr_ino_array **lea_ino_array)
 {
 	struct buffer_head *bh = NULL;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	struct ext4_xattr_entry *entry;
+	int credits = 3, error = 0;
 
-	if (!EXT4_I(inode)->i_file_acl)
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		goto delete_external_ea;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		goto cleanup;
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		if (ext4_expand_ino_array(lea_ino_array,
+					  entry->e_value_inum) != 0) {
+			brelse(iloc.bh);
+			goto cleanup;
+		}
+		entry->e_value_inum = 0;
+	}
+	brelse(iloc.bh);
+
+delete_external_ea:
+	if (!EXT4_I(inode)->i_file_acl) {
+		/* add xattr inode to orphan list */
+		ext4_xattr_inode_orphan_add(handle, inode, credits,
+						*lea_ino_array);
 		goto cleanup;
+	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
 		EXT4_ERROR_INODE(inode, "block %llu read error",
@@ -1599,11 +2023,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
+
+	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		if (ext4_expand_ino_array(lea_ino_array,
+					  entry->e_value_inum) != 0)
+			goto cleanup;
+		entry->e_value_inum = 0;
+	}
+
+	/* add xattr inode to orphan list */
+	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
+					*lea_ino_array);
+	if (error != 0)
+		goto cleanup;
+
+	if (!IS_NOQUOTA(inode))
+		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
+
+	if (!ext4_handle_has_enough_credits(handle, credits)) {
+		error = ext4_journal_extend(handle, credits);
+		if (error > 0)
+			error = ext4_journal_restart(handle, credits);
+		if (error != 0) {
+			ext4_warning(inode->i_sb,
+				"couldn't extend journal (err %d)", error);
+			goto cleanup;
+		}
+	}
+
 	ext4_xattr_release_block(handle, inode, bh);
 	EXT4_I(inode)->i_file_acl = 0;
 
 cleanup:
 	brelse(bh);
+
+	return error;
+}
+
+void
+ext4_xattr_inode_array_free(struct inode *inode,
+			    struct ext4_xattr_ino_array *lea_ino_array)
+{
+	struct inode	*ea_inode = NULL;
+	int		idx = 0;
+	int		err;
+
+	if (lea_ino_array == NULL)
+		return;
+
+	for (; idx < lea_ino_array->xia_count; ++idx) {
+		ea_inode = ext4_xattr_inode_iget(inode,
+				lea_ino_array->xia_inodes[idx], &err);
+		if (err)
+			continue;
+		/* drop i_count ref taken via ext4_xattr_delete_inode() */
+		if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
+			iput(ea_inode);
+		clear_nlink(ea_inode);
+		iput(ea_inode);
+	}
+	kfree(lea_ino_array);
 }
 
 /*
@@ -1655,10 +2137,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 		    entry1->e_name_index != entry2->e_name_index ||
 		    entry1->e_name_len != entry2->e_name_len ||
 		    entry1->e_value_size != entry2->e_value_size ||
+		    entry1->e_value_inum != entry2->e_value_inum ||
 		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
 			return 1;
-		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-			return -EFSCORRUPTED;
 		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
 			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
 			   le32_to_cpu(entry1->e_value_size)))
@@ -1730,7 +2211,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
 		       *name++;
 	}
 
-	if (entry->e_value_size != 0) {
+	if (!entry->e_value_inum && entry->e_value_size) {
 		__le32 *value = (__le32 *)((char *)header +
 			le16_to_cpu(entry->e_value_offs));
 		for (n = (le32_to_cpu(entry->e_value_size) +
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 099c8b670ef5..6e10ff9393d4 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -44,7 +44,7 @@ struct ext4_xattr_entry {
 	__u8	e_name_len;	/* length of name */
 	__u8	e_name_index;	/* attribute name index */
 	__le16	e_value_offs;	/* offset in disk block of value */
-	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
+	__le32	e_value_inum;	/* inode in which the value is stored */
 	__le32	e_value_size;	/* size of attribute value */
 	__le32	e_hash;		/* hash value of name and value */
 	char	e_name[0];	/* attribute name */
@@ -69,6 +69,26 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
+/*
+ * Link EA inode back to parent one using i_mtime field.
+ * Extra integer type conversion added to ignore higher
+ * bits in i_mtime.tv_sec which might be set by ext4_get()
+ */
+#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
+do {                                                  \
+      (inode)->i_mtime.tv_sec = inum;                 \
+} while(0)
+
+#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
+((__u32)(inode)->i_mtime.tv_sec)
+
+/*
+ * The minimum size of EA value when you start storing it in an external inode
+ * size of block - size of header - size of 1 entry - 4 null bytes
+*/
+#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)					\
+	((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
+
 #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
 #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
 #define BFIRST(bh) ENTRY(BHDR(bh)+1)
@@ -77,10 +97,11 @@ struct ext4_xattr_entry {
 #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
 
 struct ext4_xattr_info {
-	int name_index;
 	const char *name;
 	const void *value;
 	size_t value_len;
+	int name_index;
+	int in_inode;
 };
 
 struct ext4_xattr_search {
@@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
 
-extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
+					   int *err);
+extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
+extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+				   struct ext4_xattr_ino_array **array);
+extern void ext4_xattr_inode_array_free(struct inode *inode,
+					struct ext4_xattr_ino_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 02/28] ext4: fix lockdep warning about recursive inode locking
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 03/28] ext4: lock inode before calling ext4_orphan_add() Tahsin Erdogan
                   ` (26 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Setting a large xattr value may require writing the attribute contents
to an external inode. In this case we may need to lock the xattr inode
along with the parent inode. This doesn't pose a deadlock risk because
xattr inodes are not directly visible to the user and their access is
restricted.

Assign a lockdep subclass to xattr inode's lock.

 ============================================
 WARNING: possible recursive locking detected
 4.12.0-rc1+ #740 Not tainted
 --------------------------------------------
 python/1822 is trying to acquire lock:
  (&sb->s_type->i_mutex_key#15){+.+...}, at: [<ffffffff804912ca>] ext4_xattr_set_entry+0x65a/0x7b0

 but task is already holding lock:
  (&sb->s_type->i_mutex_key#15){+.+...}, at: [<ffffffff803d6687>] vfs_setxattr+0x57/0xb0

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&sb->s_type->i_mutex_key#15);
   lock(&sb->s_type->i_mutex_key#15);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 4 locks held by python/1822:
  #0:  (sb_writers#10){.+.+.+}, at: [<ffffffff803d0eef>] mnt_want_write+0x1f/0x50
  #1:  (&sb->s_type->i_mutex_key#15){+.+...}, at: [<ffffffff803d6687>] vfs_setxattr+0x57/0xb0
  #2:  (jbd2_handle){.+.+..}, at: [<ffffffff80493f40>] start_this_handle+0xf0/0x420
  #3:  (&ei->xattr_sem){++++..}, at: [<ffffffff804920ba>] ext4_xattr_set_handle+0x9a/0x4f0

 stack backtrace:
 CPU: 0 PID: 1822 Comm: python Not tainted 4.12.0-rc1+ #740
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 Call Trace:
  dump_stack+0x67/0x9e
  __lock_acquire+0x5f3/0x1750
  lock_acquire+0xb5/0x1d0
  down_write+0x2c/0x60
  ext4_xattr_set_entry+0x65a/0x7b0
  ext4_xattr_block_set+0x1b2/0x9b0
  ext4_xattr_set_handle+0x322/0x4f0
  ext4_xattr_set+0x144/0x1a0
  ext4_xattr_user_set+0x34/0x40
  __vfs_setxattr+0x66/0x80
  __vfs_setxattr_noperm+0x69/0x1c0
  vfs_setxattr+0xa2/0xb0
  setxattr+0x12e/0x150
  path_setxattr+0x87/0xb0
  SyS_setxattr+0xf/0x20
  entry_SYSCALL_64_fastpath+0x18/0xad

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/inode.c | 2 ++
 fs/ext4/xattr.c | 8 ++++++++
 fs/ext4/xattr.h | 6 ++++++
 3 files changed, 16 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e5535e5b3dc5..d095bf7ad390 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4877,6 +4877,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
+	if (ei->i_flags & EXT4_EA_INODE_FL)
+		ext4_xattr_inode_set_class(inode);
 	unlock_new_inode(inode);
 	return inode;
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 444be5c7a1d5..26d2705950a5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -107,6 +107,13 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+#ifdef CONFIG_LOCKDEP
+void ext4_xattr_inode_set_class(struct inode *ea_inode)
+{
+	lockdep_set_subclass(&ea_inode->i_rwsem, 1);
+}
+#endif
+
 static __le32 ext4_xattr_block_csum(struct inode *inode,
 				    sector_t block_nr,
 				    struct ext4_xattr_header *hdr)
@@ -830,6 +837,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		ea_inode->i_op = &ext4_file_inode_operations;
 		ea_inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(ea_inode);
+		ext4_xattr_inode_set_class(ea_inode);
 		ea_inode->i_generation = inode->i_generation;
 		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 6e10ff9393d4..e8bef79bdc38 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -196,3 +196,9 @@ static inline int ext4_init_security(handle_t *handle, struct inode *inode,
 	return 0;
 }
 #endif
+
+#ifdef CONFIG_LOCKDEP
+extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
+#else
+static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
+#endif
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 03/28] ext4: lock inode before calling ext4_orphan_add()
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 02/28] ext4: fix lockdep warning about recursive inode locking Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 04/28] ext4: do not set posix acls on xattr inodes Tahsin Erdogan
                   ` (25 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

ext4_orphan_add() requires caller to be holding the inode lock.
Add missing lock statements.

 WARNING: CPU: 3 PID: 1806 at fs/ext4/namei.c:2731 ext4_orphan_add+0x4e/0x240
 CPU: 3 PID: 1806 Comm: python Not tainted 4.12.0-rc1+ #746
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 task: ffff880135d466c0 task.stack: ffffc900014b0000
 RIP: 0010:ext4_orphan_add+0x4e/0x240
 RSP: 0018:ffffc900014b3d50 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff8801348fe1f0 RCX: ffffc900014b3c64
 RDX: 0000000000000000 RSI: ffff8801348fe1f0 RDI: ffff8801348fe1f0
 RBP: ffffc900014b3da0 R08: 0000000000000000 R09: ffffffff80e82025
 R10: 0000000000004692 R11: 000000000000468d R12: ffff880137598000
 R13: ffff880137217000 R14: ffff880134ac58d0 R15: 0000000000000000
 FS:  00007fc50f09e740(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00000000008bc2e0 CR3: 00000001375ac000 CR4: 00000000000006e0
 Call Trace:
  ext4_xattr_inode_orphan_add.constprop.19+0x9d/0xf0
  ext4_xattr_delete_inode+0x1c4/0x2f0
  ext4_evict_inode+0x15a/0x7f0
  evict+0xc0/0x1a0
  iput+0x16a/0x270
  do_unlinkat+0x172/0x290
  SyS_unlink+0x11/0x20
  entry_SYSCALL_64_fastpath+0x18/0xad

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 26d2705950a5..09ba0137d529 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1963,7 +1963,9 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
 				lea_ino_array->xia_inodes[idx], &error);
 		if (error)
 			continue;
+		inode_lock(ea_inode);
 		ext4_orphan_add(handle, ea_inode);
+		inode_unlock(ea_inode);
 		/* the inode's i_count will be released by caller */
 	}
 
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 04/28] ext4: do not set posix acls on xattr inodes
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 02/28] ext4: fix lockdep warning about recursive inode locking Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 03/28] ext4: lock inode before calling ext4_orphan_add() Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 05/28] ext4: attach jinode after creation of xattr inode Tahsin Erdogan
                   ` (24 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

We don't need acls on xattr inodes because they are not directly
accessible from user mode.

Besides, lockdep complains about recursive locking of xattr_sem, as seen
below.

  =============================================
  [ INFO: possible recursive locking detected ]
  4.11.0-rc8+ #402 Not tainted
  ---------------------------------------------
  python/1894 is trying to acquire lock:
   (&ei->xattr_sem){++++..}, at: [<ffffffff804878a6>] ext4_xattr_get+0x66/0x270

  but task is already holding lock:
   (&ei->xattr_sem){++++..}, at: [<ffffffff80489500>] ext4_xattr_set_handle+0xa0/0x5d0

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(&ei->xattr_sem);
    lock(&ei->xattr_sem);

   *** DEADLOCK ***

   May be due to missing lock nesting notation

  3 locks held by python/1894:
   #0:  (sb_writers#10){.+.+.+}, at: [<ffffffff803d829f>] mnt_want_write+0x1f/0x50
   #1:  (&sb->s_type->i_mutex_key#15){+.+...}, at: [<ffffffff803dda27>] vfs_setxattr+0x57/0xb0
   #2:  (&ei->xattr_sem){++++..}, at: [<ffffffff80489500>] ext4_xattr_set_handle+0xa0/0x5d0

  stack backtrace:
  CPU: 0 PID: 1894 Comm: python Not tainted 4.11.0-rc8+ #402
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  Call Trace:
   dump_stack+0x67/0x99
   __lock_acquire+0x5f3/0x1830
   lock_acquire+0xb5/0x1d0
   down_read+0x2f/0x60
   ext4_xattr_get+0x66/0x270
   ext4_get_acl+0x43/0x1e0
   get_acl+0x72/0xf0
   posix_acl_create+0x5e/0x170
   ext4_init_acl+0x21/0xc0
   __ext4_new_inode+0xffd/0x16b0
   ext4_xattr_set_entry+0x5ea/0xb70
   ext4_xattr_block_set+0x1b5/0x970
   ext4_xattr_set_handle+0x351/0x5d0
   ext4_xattr_set+0x124/0x180
   ext4_xattr_user_set+0x34/0x40
   __vfs_setxattr+0x66/0x80
   __vfs_setxattr_noperm+0x69/0x1c0
   vfs_setxattr+0xa2/0xb0
   setxattr+0x129/0x160
   path_setxattr+0x87/0xb0
   SyS_setxattr+0xf/0x20
   entry_SYSCALL_64_fastpath+0x18/0xad

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h    | 11 ++++++-----
 fs/ext4/ialloc.c  | 14 +++++++++-----
 fs/ext4/migrate.c |  2 +-
 fs/ext4/xattr.c   |  3 ++-
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 24ef56b4572f..5d5fc0d0e2bc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2400,16 +2400,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct
 /* ialloc.c */
 extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
 				      const struct qstr *qstr, __u32 goal,
-				      uid_t *owner, int handle_type,
-				      unsigned int line_no, int nblocks);
+				      uid_t *owner, __u32 i_flags,
+				      int handle_type, unsigned int line_no,
+				      int nblocks);
 
-#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
 	__ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
-			 0, 0, 0)
+			 i_flags, 0, 0, 0)
 #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
 				    type, nblocks)		    \
 	__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
-			 (type), __LINE__, (nblocks))
+			 0, (type), __LINE__, (nblocks))
 
 
 extern void ext4_free_inode(handle_t *, struct inode *);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e2eb3cc06820..fb1b3df17f6e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -742,8 +742,9 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
  */
 struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 			       umode_t mode, const struct qstr *qstr,
-			       __u32 goal, uid_t *owner, int handle_type,
-			       unsigned int line_no, int nblocks)
+			       __u32 goal, uid_t *owner, __u32 i_flags,
+			       int handle_type, unsigned int line_no,
+			       int nblocks)
 {
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
@@ -1052,6 +1053,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	/* Don't inherit extent flag from directory, amongst others. */
 	ei->i_flags =
 		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
+	ei->i_flags |= i_flags;
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
 	ei->i_block_group = group;
@@ -1108,9 +1110,11 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 			goto fail_free_drop;
 	}
 
-	err = ext4_init_acl(handle, inode, dir);
-	if (err)
-		goto fail_free_drop;
+	if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
+		err = ext4_init_acl(handle, inode, dir);
+		if (err)
+			goto fail_free_drop;
+	}
 
 	err = ext4_init_security(handle, inode, dir, qstr);
 	if (err)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 364ea4d4a943..cf5181b62df1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
 	owner[0] = i_uid_read(inode);
 	owner[1] = i_gid_read(inode);
 	tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root),
-				   S_IFREG, NULL, goal, owner);
+				   S_IFREG, NULL, goal, owner, 0);
 	if (IS_ERR(tmp_inode)) {
 		retval = PTR_ERR(tmp_inode);
 		ext4_journal_stop(handle);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 09ba0137d529..12210fe87ea3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -832,7 +832,8 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 	 * in the same group, or nearby one.
 	 */
 	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
+				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL,
+				  EXT4_EA_INODE_FL);
 	if (!IS_ERR(ea_inode)) {
 		ea_inode->i_op = &ext4_file_inode_operations;
 		ea_inode->i_fop = &ext4_file_operations;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 05/28] ext4: attach jinode after creation of xattr inode
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (2 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 04/28] ext4: do not set posix acls on xattr inodes Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 06/28] ext4: ea_inode owner should be the same as the inode owner Tahsin Erdogan
                   ` (23 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

In data=ordered mode jinode needs to be attached to the xattr inode when
writing data to it. Attachment normally occurs during file open for regular
files. Since we are not using file interface to write to the xattr inode,
the jinode attach needs to be done manually.

Otherwise the following crash occurs in data=ordered mode.

 BUG: unable to handle kernel NULL pointer dereference at           (null)
 IP: jbd2_journal_file_inode+0x37/0x110
 PGD 13b3c0067
 P4D 13b3c0067
 PUD 137660067
 PMD 0

 Oops: 0000 [#1] SMP
 CPU: 3 PID: 1877 Comm: python Not tainted 4.12.0-rc1+ #749
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 task: ffff88010e368980 task.stack: ffffc90000374000
 RIP: 0010:jbd2_journal_file_inode+0x37/0x110
 RSP: 0018:ffffc90000377980 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff880123b06230 RCX: 0000000000280000
 RDX: 0000000000000006 RSI: 0000000000000000 RDI: ffff88012c8585d0
 RBP: ffffc900003779b0 R08: 0000000000000202 R09: 0000000000000001
 R10: 0000000000000000 R11: 0000000000000400 R12: ffff8801111f81c0
 R13: ffff88013b2b6800 R14: ffffc90000377ab0 R15: 0000000000000001
 FS:  00007f0c99b77740(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 0000000136d91000 CR4: 00000000000006e0
 Call Trace:
  jbd2_journal_inode_add_write+0xe/0x10
  ext4_map_blocks+0x59e/0x620
  ext4_xattr_set_entry+0x501/0x7d0
  ext4_xattr_block_set+0x1b2/0x9b0
  ext4_xattr_set_handle+0x322/0x4f0
  ext4_xattr_set+0x144/0x1a0
  ext4_xattr_user_set+0x34/0x40
  __vfs_setxattr+0x66/0x80
  __vfs_setxattr_noperm+0x69/0x1c0
  vfs_setxattr+0xa2/0xb0
  setxattr+0x12e/0x150
  path_setxattr+0x87/0xb0
  SyS_setxattr+0xf/0x20
  entry_SYSCALL_64_fastpath+0x18/0xad

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 12210fe87ea3..8e123533315f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -826,6 +826,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 					     struct inode *inode)
 {
 	struct inode *ea_inode = NULL;
+	int err;
 
 	/*
 	 * Let the next inode be the goal, so we try and allocate the EA inode
@@ -848,6 +849,11 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		 */
 		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
 		unlock_new_inode(ea_inode);
+		err = ext4_inode_attach_jinode(ea_inode);
+		if (err) {
+			iput(ea_inode);
+			return ERR_PTR(err);
+		}
 	}
 
 	return ea_inode;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 06/28] ext4: ea_inode owner should be the same as the inode owner
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (3 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 05/28] ext4: attach jinode after creation of xattr inode Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks Tahsin Erdogan
                   ` (22 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Quota charging is based on the ownership of the inode. Currently, the
xattr inode owner is set to the caller which may be different from the
parent inode owner. This is inconsistent with how quota is charged for
xattr block and regular data block writes.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8e123533315f..32ad2f2870e9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -826,6 +826,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 					     struct inode *inode)
 {
 	struct inode *ea_inode = NULL;
+	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
 	int err;
 
 	/*
@@ -833,7 +834,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 	 * in the same group, or nearby one.
 	 */
 	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL,
+				  S_IFREG | 0600, NULL, inode->i_ino + 1, owner,
 				  EXT4_EA_INODE_FL);
 	if (!IS_ERR(ea_inode)) {
 		ea_inode->i_op = &ext4_file_inode_operations;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (4 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 06/28] ext4: ea_inode owner should be the same as the inode owner Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31 16:12     ` Darrick J. Wong
  2017-05-31  8:14 ` [PATCH 08/28] ext4: fix ref counting for ea_inode Tahsin Erdogan
                   ` (21 subsequent siblings)
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

ea_inode contents are treated as metadata, which is why they are journaled
during initial writes. Failing to call revoke when freeing the blocks could
cause user data to be overwritten with the original ea_inode contents during
journal replay.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/extents.c  | 3 ++-
 fs/ext4/indirect.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3e36508610b7..e0a8425ff74d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 
 static inline int get_default_free_blocks_flags(struct inode *inode)
 {
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
 		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
 	else if (ext4_should_journal_data(inode))
 		return EXT4_FREE_BLOCKS_FORGET;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bc15c2c17633..7ffa290cbb8e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
 		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
 	else if (ext4_should_journal_data(inode))
 		flags |= EXT4_FREE_BLOCKS_FORGET;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 08/28] ext4: fix ref counting for ea_inode
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (5 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs Tahsin Erdogan
                   ` (20 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

The ref count on ea_inode is incremented by
ext4_xattr_inode_orphan_add() which is supposed to be decremented by
ext4_xattr_inode_array_free(). The decrement is conditioned on whether
the ea_inode is currently on the orphan list. However, the orphan list
addition only happens when journaling is enabled. In non-journaled case,
we fail to release the ref count causing an error message like below.

"VFS: Busy inodes after unmount of sdb. Self-destruct in 5 seconds.
Have a nice day..."

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 32ad2f2870e9..13daf634244b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2098,8 +2098,7 @@ ext4_xattr_inode_array_free(struct inode *inode,
 		if (err)
 			continue;
 		/* for inode's i_count get from ext4_xattr_delete_inode */
-		if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
-			iput(ea_inode);
+		iput(ea_inode);
 		clear_nlink(ea_inode);
 		iput(ea_inode);
 	}
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (6 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 08/28] ext4: fix ref counting for ea_inode Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31 16:03     ` Darrick J. Wong
  2017-05-31  8:14 ` [PATCH 10/28] ext4: change ext4_xattr_inode_iget() signature Tahsin Erdogan
                   ` (19 subsequent siblings)
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

EXT4_XATTR_MAX_LARGE_EA_SIZE definition in ext4 is currently unused.
Besides, vfs enforces its own 64k limit which makes the 1MB limit in
ext4 redundant. Remove it.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5d5fc0d0e2bc..2cdd6070e348 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2221,12 +2221,6 @@ struct mmpd_data {
 #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
 
 /*
- * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
- * This limit is arbitrary, but is reasonable for the xattr API.
- */
-#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
-
-/*
  * Function prototypes
  */
 
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 10/28] ext4: change ext4_xattr_inode_iget() signature
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (7 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs Tahsin Erdogan
@ 2017-05-31  8:14 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 11/28] ext4: clean up ext4_xattr_inode_get() Tahsin Erdogan
                   ` (18 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:14 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

In general, kernel functions indicate success/failure through their return
values. This function returns the status as an output parameter and reserves
the return value for the inode. Make it follow the general convention.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 73 +++++++++++++++++++++++++++++++--------------------------
 fs/ext4/xattr.h |  2 --
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 13daf634244b..d9477d01be9b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -312,40 +312,47 @@ ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
 	return 0;
 }
 
-struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
+static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
+				 struct inode **ea_inode)
 {
-	struct inode *ea_inode = NULL;
+	struct inode *inode;
+	int err;
+
+	inode = ext4_iget(parent->i_sb, ea_ino);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ext4_error(parent->i_sb, "error while reading EA inode %lu "
+			   "err=%d", ea_ino, err);
+		return err;
+	}
 
-	ea_inode = ext4_iget(parent->i_sb, ea_ino);
-	if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
-		int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
+	if (is_bad_inode(inode)) {
 		ext4_error(parent->i_sb, "error while reading EA inode %lu "
-			   "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode));
-		*err = rc != 0 ? rc : -EIO;
-		return NULL;
+			   "is_bad_inode", ea_ino);
+		err = -EIO;
+		goto error;
 	}
 
-	if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
-	    ea_inode->i_generation != parent->i_generation) {
+	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
+	    inode->i_generation != parent->i_generation) {
 		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
-			   "to parent invalid.", ea_ino);
-		*err = -EINVAL;
+			   "to parent is invalid.", ea_ino);
+		err = -EINVAL;
 		goto error;
 	}
 
-	if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
+	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
 		ext4_error(parent->i_sb, "EA inode %lu does not have "
 			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
-		*err = -EINVAL;
+		err = -EINVAL;
 		goto error;
 	}
 
-	*err = 0;
-	return ea_inode;
-
+	*ea_inode = inode;
+	return 0;
 error:
-	iput(ea_inode);
-	return NULL;
+	iput(inode);
+	return err;
 }
 
 /*
@@ -355,17 +362,17 @@ static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
 		     size_t *size)
 {
-	struct inode *ea_inode = NULL;
-	int err;
+	struct inode *ea_inode;
+	int ret;
 
-	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
-	if (err)
-		return err;
+	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+	if (ret)
+		return ret;
 
-	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
 	iput(ea_inode);
 
-	return err;
+	return ret;
 }
 
 static int
@@ -868,7 +875,7 @@ int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
 	struct inode *ea_inode = NULL;
 	int err;
 
-	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
+	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
 	if (err)
 		return err;
 
@@ -1948,7 +1955,7 @@ static int
 ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
 			int credits, struct ext4_xattr_ino_array *lea_ino_array)
 {
-	struct inode *ea_inode = NULL;
+	struct inode *ea_inode;
 	int idx = 0, error = 0;
 
 	if (lea_ino_array == NULL)
@@ -1967,8 +1974,8 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
 				return error;
 			}
 		}
-		ea_inode = ext4_xattr_inode_iget(inode,
-				lea_ino_array->xia_inodes[idx], &error);
+		error = ext4_xattr_inode_iget(inode,
+				lea_ino_array->xia_inodes[idx], &ea_inode);
 		if (error)
 			continue;
 		inode_lock(ea_inode);
@@ -2085,7 +2092,7 @@ void
 ext4_xattr_inode_array_free(struct inode *inode,
 			    struct ext4_xattr_ino_array *lea_ino_array)
 {
-	struct inode	*ea_inode = NULL;
+	struct inode	*ea_inode;
 	int		idx = 0;
 	int		err;
 
@@ -2093,8 +2100,8 @@ ext4_xattr_inode_array_free(struct inode *inode,
 		return;
 
 	for (; idx < lea_ino_array->xia_count; ++idx) {
-		ea_inode = ext4_xattr_inode_iget(inode,
-				lea_ino_array->xia_inodes[idx], &err);
+		err = ext4_xattr_inode_iget(inode,
+				lea_ino_array->xia_inodes[idx], &ea_inode);
 		if (err)
 			continue;
 		/* for inode's i_count get from ext4_xattr_delete_inode */
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index e8bef79bdc38..b6ef99d1a061 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -161,8 +161,6 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
 
-extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
-					   int *err);
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_ino_array **array);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 11/28] ext4: clean up ext4_xattr_inode_get()
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (8 preceding siblings ...)
  2017-05-31  8:14 ` [PATCH 10/28] ext4: change ext4_xattr_inode_iget() signature Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 12/28] ext4: add missing le32_to_cpu(e_value_inum) conversions Tahsin Erdogan
                   ` (17 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

The input and output values of the *size parameter are equal on successful
return from ext4_xattr_inode_get(). On error return, the callers ignore
the output value so there is no need to update it.

Also check for NULL return from ext4_bread(). If the actual xattr inode
size happens to be smaller than the expected size, ext4_bread() may
return NULL which would indicate data corruption.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d9477d01be9b..8e855fc2eb03 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -278,37 +278,28 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 /*
  * Read the EA value from an inode.
  */
-static int
-ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
+static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 {
 	unsigned long block = 0;
 	struct buffer_head *bh = NULL;
-	int blocksize;
-	size_t csize, ret_size = 0;
-
-	if (*size == 0)
-		return 0;
+	int blocksize = ea_inode->i_sb->s_blocksize;
+	size_t csize, copied = 0;
 
-	blocksize = ea_inode->i_sb->s_blocksize;
-
-	while (ret_size < *size) {
-		csize = (*size - ret_size) > blocksize ? blocksize :
-							*size - ret_size;
+	while (copied < size) {
+		csize = (size - copied) > blocksize ? blocksize : size - copied;
 		bh = ext4_bread(NULL, ea_inode, block, 0);
-		if (IS_ERR(bh)) {
-			*size = ret_size;
+		if (IS_ERR(bh))
 			return PTR_ERR(bh);
-		}
+		if (!bh)
+			return -EFSCORRUPTED;
+
 		memcpy(buf, bh->b_data, csize);
 		brelse(bh);
 
 		buf += csize;
 		block += 1;
-		ret_size += csize;
+		copied += csize;
 	}
-
-	*size = ret_size;
-
 	return 0;
 }
 
@@ -360,7 +351,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
  */
 static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
-		     size_t *size)
+		     size_t size)
 {
 	struct inode *ea_inode;
 	int ret;
@@ -417,7 +408,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		if (entry->e_value_inum) {
 			error = ext4_xattr_inode_get(inode,
 					     le32_to_cpu(entry->e_value_inum),
-					     buffer, &size);
+					     buffer, size);
 			if (error)
 				goto cleanup;
 		} else {
@@ -467,7 +458,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 		if (entry->e_value_inum) {
 			error = ext4_xattr_inode_get(inode,
 					     le32_to_cpu(entry->e_value_inum),
-					     buffer, &size);
+					     buffer, size);
 			if (error)
 				goto cleanup;
 		} else {
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 12/28] ext4: add missing le32_to_cpu(e_value_inum) conversions
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (9 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 11/28] ext4: clean up ext4_xattr_inode_get() Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 13/28] ext4: ext4_xattr_value_same() should return false for external data Tahsin Erdogan
                   ` (16 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Two places in code missed converting xattr inode number using
le32_to_cpu().

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8e855fc2eb03..4dd8be16d175 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1997,6 +1997,7 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc;
 	struct ext4_xattr_entry *entry;
+	unsigned int ea_ino;
 	int credits = 3, error = 0;
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@@ -2011,8 +2012,8 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	     entry = EXT4_XATTR_NEXT(entry)) {
 		if (!entry->e_value_inum)
 			continue;
-		if (ext4_expand_ino_array(lea_ino_array,
-					  entry->e_value_inum) != 0) {
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0) {
 			brelse(iloc.bh);
 			goto cleanup;
 		}
@@ -2044,8 +2045,8 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	     entry = EXT4_XATTR_NEXT(entry)) {
 		if (!entry->e_value_inum)
 			continue;
-		if (ext4_expand_ino_array(lea_ino_array,
-					  entry->e_value_inum) != 0)
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0)
 			goto cleanup;
 		entry->e_value_inum = 0;
 	}
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 13/28] ext4: ext4_xattr_value_same() should return false for external data
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (10 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 12/28] ext4: add missing le32_to_cpu(e_value_inum) conversions Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 14/28] ext4: fix ext4_xattr_make_inode_space() value size calculation Tahsin Erdogan
                   ` (15 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

ext4_xattr_value_same() is used as a quick optimization in case the new
xattr value is identical to the previous value. When the xattr value is
stored in an xattr inode, the check becomes expensive, so it is better to
just assume that the values are not equal.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 4dd8be16d175..681a9b5eefd8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1428,6 +1428,9 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 {
 	void *value;
 
+	/* When e_value_inum is set the value is stored externally. */
+	if (s->here->e_value_inum)
+		return 0;
 	if (le32_to_cpu(s->here->e_value_size) != i->value_len)
 		return 0;
 	value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 14/28] ext4: fix ext4_xattr_make_inode_space() value size calculation
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (11 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 13/28] ext4: ext4_xattr_value_same() should return false for external data Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 15/28] ext4: fix ext4_xattr_move_to_block() Tahsin Erdogan
                   ` (14 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

ext4_xattr_make_inode_space() is interested in calculating the inline
space used in an inode. When a xattr entry refers to an external inode
the value size indicates the external inode size, not the value size in
the inline area. Change the function to take this into account.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 681a9b5eefd8..6a6bee246873 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1747,9 +1747,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
 		last = IFIRST(header);
 		/* Find the entry best suited to be pushed into EA block */
 		for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-			total_size =
-			EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
-					EXT4_XATTR_LEN(last->e_name_len);
+			total_size = EXT4_XATTR_LEN(last->e_name_len);
+			if (!last->e_value_inum)
+				total_size += EXT4_XATTR_SIZE(
+					       le32_to_cpu(last->e_value_size));
 			if (total_size <= bfree &&
 			    total_size < min_total_size) {
 				if (total_size + ifree < isize_diff) {
@@ -1768,8 +1769,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
 		}
 
 		entry_size = EXT4_XATTR_LEN(entry->e_name_len);
-		total_size = entry_size +
-			EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+		total_size = entry_size;
+		if (!entry->e_value_inum)
+			total_size += EXT4_XATTR_SIZE(
+					      le32_to_cpu(entry->e_value_size));
 		error = ext4_xattr_move_to_block(handle, inode, raw_inode,
 						 entry);
 		if (error)
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 15/28] ext4: fix ext4_xattr_move_to_block()
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (12 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 14/28] ext4: fix ext4_xattr_make_inode_space() value size calculation Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 16/28] ext4: fix ext4_xattr_cmp() Tahsin Erdogan
                   ` (13 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

When moving xattr entries from inline area to a xattr block, entries
that refer to external xattr inodes need special handling because
value data is not available in the inline area but rather should be
read from its external inode.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6a6bee246873..9c243b3510b7 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1658,18 +1658,16 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
 	struct ext4_xattr_ibody_find *is = NULL;
 	struct ext4_xattr_block_find *bs = NULL;
 	char *buffer = NULL, *b_entry_name = NULL;
-	size_t value_offs, value_size;
+	size_t value_size = le32_to_cpu(entry->e_value_size);
 	struct ext4_xattr_info i = {
 		.value = NULL,
 		.value_len = 0,
 		.name_index = entry->e_name_index,
+		.in_inode = !!entry->e_value_inum,
 	};
 	struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
 	int error;
 
-	value_offs = le16_to_cpu(entry->e_value_offs);
-	value_size = le32_to_cpu(entry->e_value_size);
-
 	is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
 	bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
 	buffer = kmalloc(value_size, GFP_NOFS);
@@ -1685,7 +1683,17 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
 	bs->bh = NULL;
 
 	/* Save the entry name and the entry value */
-	memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+	if (entry->e_value_inum) {
+		error = ext4_xattr_inode_get(inode,
+					     le32_to_cpu(entry->e_value_inum),
+					     buffer, value_size);
+		if (error)
+			goto out;
+	} else {
+		size_t value_offs = le16_to_cpu(entry->e_value_offs);
+		memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+	}
+
 	memcpy(b_entry_name, entry->e_name, entry->e_name_len);
 	b_entry_name[entry->e_name_len] = '\0';
 	i.name = b_entry_name;
@@ -1703,7 +1711,6 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
 	if (error)
 		goto out;
 
-	i.name = b_entry_name;
 	i.value = buffer;
 	i.value_len = value_size;
 	error = ext4_xattr_block_find(inode, &i, bs);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 16/28] ext4: fix ext4_xattr_cmp()
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (13 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 15/28] ext4: fix ext4_xattr_move_to_block() Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 17/28] ext4: fix credits calculation for xattr inode Tahsin Erdogan
                   ` (12 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

When a xattr entry refers to an external inode, the value data is not
available in the inline area so we should not attempt to read it using
value offset.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9c243b3510b7..739f73a5a345 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2169,7 +2169,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 		    entry1->e_value_inum != entry2->e_value_inum ||
 		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
 			return 1;
-		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+		if (!entry1->e_value_inum &&
+		    memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
 			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
 			   le32_to_cpu(entry1->e_value_size)))
 			return 1;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 17/28] ext4: fix credits calculation for xattr inode
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (14 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 16/28] ext4: fix ext4_xattr_cmp() Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 18/28] ext4: retry storing value in external inode with xattr block too Tahsin Erdogan
                   ` (11 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

When there is no space for a value in xattr block, it may be stored
in an xattr inode even if the value length is less than
EXT4_XATTR_MIN_LARGE_EA_SIZE(). So the current assumption in credits
calculation is wrong.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 739f73a5a345..dcf7ec98f138 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1590,8 +1590,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 	if (error)
 		return error;
 
-	if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
-	    ext4_has_feature_ea_inode(sb)) {
+	if (ext4_has_feature_ea_inode(sb)) {
 		int nrblocks = (value_len + sb->s_blocksize - 1) >>
 					sb->s_blocksize_bits;
 
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 18/28] ext4: retry storing value in external inode with xattr block too
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (15 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 17/28] ext4: fix credits calculation for xattr inode Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-06-20  8:56   ` [PATCH v2 18/31] " Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 19/28] ext4: ext4_xattr_delete_inode() should return accurate errors Tahsin Erdogan
                   ` (10 subsequent siblings)
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

When the value size is <= EXT4_XATTR_MIN_LARGE_EA_SIZE() and the value
doesn't fit in either the inline area or the xattr block, a second try is
made to store it in an external inode while storing the entry itself in
the inline area. There should also be an attempt to store the entry in
the xattr block.

This patch adds a retry loop to do that. It also makes the caller the
sole decider on whether to store a value in an external inode.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index dcf7ec98f138..1d354e447842 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -911,11 +911,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	int in_inode = i->in_inode;
 	int rc;
 
-	if (ext4_has_feature_ea_inode(inode->i_sb) &&
-	    (EXT4_XATTR_SIZE(i->value_len) >
-	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
-		in_inode = 1;
-
 	/* Compute min_offs and last. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
@@ -1097,7 +1092,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 {
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
-	struct ext4_xattr_search *s = &bs->s;
+	struct ext4_xattr_search s_copy = bs->s;
+	struct ext4_xattr_search *s = &s_copy;
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
@@ -1519,6 +1515,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
 			goto cleanup;
 
+		if (ext4_has_feature_ea_inode(inode->i_sb) &&
+		    (EXT4_XATTR_SIZE(i.value_len) >
+			EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
+			i.in_inode = 1;
+	retry_inode:
 		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
 		if (!error && !bs.s.not_found) {
 			i.value = NULL;
@@ -1530,20 +1531,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 					goto cleanup;
 			}
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
-			if (ext4_has_feature_ea_inode(inode->i_sb) &&
-			    error == -ENOSPC) {
-				/* xattr not fit to block, store at external
-				 * inode */
-				i.in_inode = 1;
-				error = ext4_xattr_ibody_set(handle, inode,
-							     &i, &is);
-			}
-			if (error)
-				goto cleanup;
-			if (!is.s.not_found) {
+			if (!error && !is.s.not_found) {
 				i.value = NULL;
 				error = ext4_xattr_ibody_set(handle, inode, &i,
 							     &is);
+			} else if (error == -ENOSPC) {
+				/*
+				 * Xattr does not fit in the block, store at
+				 * external inode if possible.
+				 */
+				if (ext4_has_feature_ea_inode(inode->i_sb) &&
+				    !i.in_inode) {
+					i.in_inode = 1;
+					goto retry_inode;
+				}
 			}
 		}
 	}
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 19/28] ext4: ext4_xattr_delete_inode() should return accurate errors
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (16 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 18/28] ext4: retry storing value in external inode with xattr block too Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 20/28] ext4: improve journal credit handling in set xattr paths Tahsin Erdogan
                   ` (9 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

In a few places the function returns without trying to pass the actual
error code to the caller. Fix those.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1d354e447842..230e0aa76777 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2026,7 +2026,8 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0) {
+		error = ext4_expand_ino_array(lea_ino_array, ea_ino);
+		if (error) {
 			brelse(iloc.bh);
 			goto cleanup;
 		}
@@ -2037,20 +2038,22 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 delete_external_ea:
 	if (!EXT4_I(inode)->i_file_acl) {
 		/* add xattr inode to orphan list */
-		ext4_xattr_inode_orphan_add(handle, inode, credits,
-						*lea_ino_array);
+		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
+						    *lea_ino_array);
 		goto cleanup;
 	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
 		EXT4_ERROR_INODE(inode, "block %llu read error",
 				 EXT4_I(inode)->i_file_acl);
+		error = -EIO;
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		EXT4_ERROR_INODE(inode, "bad block %llu",
 				 EXT4_I(inode)->i_file_acl);
+		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
 
@@ -2059,7 +2062,8 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		if (ext4_expand_ino_array(lea_ino_array, ea_ino) != 0)
+		error = ext4_expand_ino_array(lea_ino_array, ea_ino);
+		if (error)
 			goto cleanup;
 		entry->e_value_inum = 0;
 	}
@@ -2067,7 +2071,7 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	/* add xattr inode to orphan list */
 	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
 					*lea_ino_array);
-	if (error != 0)
+	if (error)
 		goto cleanup;
 
 	if (!IS_NOQUOTA(inode))
@@ -2077,7 +2081,7 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		error = ext4_journal_extend(handle, credits);
 		if (error > 0)
 			error = ext4_journal_restart(handle, credits);
-		if (error != 0) {
+		if (error) {
 			ext4_warning(inode->i_sb,
 				"couldn't extend journal (err %d)", error);
 			goto cleanup;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 20/28] ext4: improve journal credit handling in set xattr paths
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (17 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 19/28] ext4: ext4_xattr_delete_inode() should return accurate errors Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-06-20  8:59   ` [PATCH v2 20/31] " Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 21/28] ext4: modify ext4_xattr_ino_array to hold struct inode * Tahsin Erdogan
                   ` (8 subsequent siblings)
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Both ext4_set_acl() and ext4_set_context() need to be made aware of
ea_inode feature when it comes to credits calculation.

Also add a sufficient credits check in ext4_xattr_set_handle() right
after xattr write lock is grabbed. Original credits calculation is done
outside the lock so there is a possibility that the initially calculated
credits are not sufficient anymore.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/acl.c       |  7 ++++---
 fs/ext4/ext4_jbd2.h | 14 --------------
 fs/ext4/super.c     |  6 +++---
 fs/ext4/xattr.c     | 55 +++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/xattr.h     |  1 +
 5 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 3ec0e46de95f..74f7ac539e00 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -231,14 +231,15 @@ int
 ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	handle_t *handle;
-	int error, retries = 0;
+	int error, credits, retries = 0;
+	size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
 
 	error = dquot_initialize(inode);
 	if (error)
 		return error;
 retry:
-	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
-				    ext4_jbd2_credits_xattr(inode));
+	credits = ext4_xattr_set_credits(inode, acl_size);
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f97611171023..a5bda70feed5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,20 +104,6 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
-static inline int ext4_jbd2_credits_xattr(struct inode *inode)
-{
-	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
-
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-		credits += ext4_writepage_trans_blocks(inode) + 1;
-	return credits;
-}
-
-
 /*
  * Ext4 handle operation types -- for logging purposes
  */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d37c81f327e7..b02a23ec92ca 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1143,7 +1143,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 							void *fs_data)
 {
 	handle_t *handle = fs_data;
-	int res, res2, retries = 0;
+	int res, res2, credits, retries = 0;
 
 	res = ext4_convert_inline_data(inode);
 	if (res)
@@ -1178,8 +1178,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	handle = ext4_journal_start(inode, EXT4_HT_MISC,
-			ext4_jbd2_credits_xattr(inode));
+	credits = ext4_xattr_set_credits(inode, len);
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 230e0aa76777..5bce73b43b2a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1473,6 +1473,17 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	ext4_write_lock_xattr(inode, &no_expand);
 
+	/* Check journal credits under write lock. */
+	if (ext4_handle_valid(handle)) {
+		int credits;
+
+		credits = ext4_xattr_set_credits(inode, value_len);
+		if (!ext4_handle_has_enough_credits(handle, credits)) {
+			error = -ENOSPC;
+			goto cleanup;
+		}
+	}
+
 	error = ext4_reserve_inode_write(handle, inode, &is.iloc);
 	if (error)
 		goto cleanup;
@@ -1570,6 +1581,36 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+{
+	struct super_block *sb = inode->i_sb;
+	int credits;
+
+	if (!EXT4_SB(sb)->s_journal)
+		return 0;
+
+	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+	/*
+	 * In case of inline data, we may push out the data to a block,
+	 * so we need to reserve credits for this eventuality
+	 */
+	if (ext4_has_inline_data(inode))
+	        credits += ext4_writepage_trans_blocks(inode) + 1;
+
+	if (ext4_has_feature_ea_inode(sb)) {
+		int nrblocks = (value_len + sb->s_blocksize - 1) >>
+					sb->s_blocksize_bits;
+
+		/* For new inode */
+		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+
+		/* For data blocks of EA inode */
+		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	}
+	return credits;
+}
+
 /*
  * ext4_xattr_set()
  *
@@ -1585,24 +1626,14 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 	handle_t *handle;
 	struct super_block *sb = inode->i_sb;
 	int error, retries = 0;
-	int credits = ext4_jbd2_credits_xattr(inode);
+	int credits;
 
 	error = dquot_initialize(inode);
 	if (error)
 		return error;
 
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
-
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
-
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
-	}
-
 retry:
+	credits = ext4_xattr_set_credits(inode, value_len);
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b6ef99d1a061..e82c5fe36a26 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -160,6 +160,7 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
 
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 21/28] ext4: modify ext4_xattr_ino_array to hold struct inode *
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (18 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 20/28] ext4: improve journal credit handling in set xattr paths Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 22/28] ext4: move struct ext4_xattr_inode_array to xattr.h Tahsin Erdogan
                   ` (7 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Tracking struct inode * rather than the inode number eliminates the
repeated ext4_xattr_inode_iget() call later. The second call cannot
fail in practice, but ignoring its return value would still require an
explanation. Avoid the trouble and keep things simple.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h  |  6 ++--
 fs/ext4/inode.c |  8 ++---
 fs/ext4/xattr.c | 93 ++++++++++++++++++++++++++++-----------------------------
 fs/ext4/xattr.h |  5 ++--
 4 files changed, 53 insertions(+), 59 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2cdd6070e348..603edb5ff304 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2232,9 +2232,9 @@ struct mmpd_data {
 # define ATTRIB_NORET	__attribute__((noreturn))
 # define NORET_AND	noreturn,
 
-struct ext4_xattr_ino_array {
-	unsigned int xia_count;		/* # of used item in the array */
-	unsigned int xia_inodes[0];
+struct ext4_xattr_inode_array {
+	unsigned int count;		/* # of used items in the array */
+	struct inode *inodes[0];
 };
 /* bitmap.c */
 extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d095bf7ad390..8ee20b586567 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -188,7 +188,7 @@ void ext4_evict_inode(struct inode *inode)
 	handle_t *handle;
 	int err;
 	int extra_credits = 3;
-	struct ext4_xattr_ino_array *lea_ino_array = NULL;
+	struct ext4_xattr_inode_array *ea_inode_array = NULL;
 
 	trace_ext4_evict_inode(inode);
 
@@ -257,7 +257,7 @@ void ext4_evict_inode(struct inode *inode)
 	/*
 	 * Delete xattr inode before deleting the main inode.
 	 */
-	err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
+	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
 	if (err) {
 		ext4_warning(inode->i_sb,
 			     "couldn't delete inode's xattr (err %d)", err);
@@ -345,9 +345,7 @@ void ext4_evict_inode(struct inode *inode)
 
 	ext4_journal_stop(handle);
 	sb_end_intwrite(inode->i_sb);
-
-	if (lea_ino_array != NULL)
-		ext4_xattr_inode_array_free(inode, lea_ino_array);
+	ext4_xattr_inode_array_free(ea_inode_array);
 	return;
 no_delete:
 	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5bce73b43b2a..886d06e409b6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1942,44 +1942,44 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 
 #define EIA_INCR 16 /* must be 2^n */
 #define EIA_MASK (EIA_INCR - 1)
-/* Add the large xattr @ino into @lea_ino_array for later deletion.
- * If @lea_ino_array is new or full it will be grown and the old
+/* Add the large xattr @inode into @ea_inode_array for later deletion.
+ * If @ea_inode_array is new or full it will be grown and the old
  * contents copied over.
  */
 static int
-ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
+ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
+			struct inode *inode)
 {
-	if (*lea_ino_array == NULL) {
+	if (*ea_inode_array == NULL) {
 		/*
 		 * Start with 15 inodes, so it fits into a power-of-two size.
-		 * If *lea_ino_array is NULL, this is essentially offsetof()
+		 * If *ea_inode_array is NULL, this is essentially offsetof()
 		 */
-		(*lea_ino_array) =
-			kmalloc(offsetof(struct ext4_xattr_ino_array,
-					 xia_inodes[EIA_MASK]),
+		(*ea_inode_array) =
+			kmalloc(offsetof(struct ext4_xattr_inode_array,
+					 inodes[EIA_MASK]),
 				GFP_NOFS);
-		if (*lea_ino_array == NULL)
+		if (*ea_inode_array == NULL)
 			return -ENOMEM;
-		(*lea_ino_array)->xia_count = 0;
-	} else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
+		(*ea_inode_array)->count = 0;
+	} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
 		/* expand the array once all 15 + n * 16 slots are full */
-		struct ext4_xattr_ino_array *new_array = NULL;
-		int count = (*lea_ino_array)->xia_count;
+		struct ext4_xattr_inode_array *new_array = NULL;
+		int count = (*ea_inode_array)->count;
 
 		/* if new_array is NULL, this is essentially offsetof() */
 		new_array = kmalloc(
-				offsetof(struct ext4_xattr_ino_array,
-					 xia_inodes[count + EIA_INCR]),
+				offsetof(struct ext4_xattr_inode_array,
+					 inodes[count + EIA_INCR]),
 				GFP_NOFS);
 		if (new_array == NULL)
 			return -ENOMEM;
-		memcpy(new_array, *lea_ino_array,
-		       offsetof(struct ext4_xattr_ino_array,
-				xia_inodes[count]));
-		kfree(*lea_ino_array);
-		*lea_ino_array = new_array;
+		memcpy(new_array, *ea_inode_array,
+		       offsetof(struct ext4_xattr_inode_array, inodes[count]));
+		kfree(*ea_inode_array);
+		*ea_inode_array = new_array;
 	}
-	(*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
+	(*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode;
 	return 0;
 }
 
@@ -1987,16 +1987,16 @@ ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
  * Add xattr inode to orphan list
  */
 static int
-ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
-			int credits, struct ext4_xattr_ino_array *lea_ino_array)
+ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
+			    struct ext4_xattr_inode_array *ea_inode_array)
 {
-	struct inode *ea_inode;
 	int idx = 0, error = 0;
+	struct inode *ea_inode;
 
-	if (lea_ino_array == NULL)
+	if (ea_inode_array == NULL)
 		return 0;
 
-	for (; idx < lea_ino_array->xia_count; ++idx) {
+	for (; idx < ea_inode_array->count; ++idx) {
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
 			error = ext4_journal_extend(handle, credits);
 			if (error > 0)
@@ -2009,10 +2009,7 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
 				return error;
 			}
 		}
-		error = ext4_xattr_inode_iget(inode,
-				lea_ino_array->xia_inodes[idx], &ea_inode);
-		if (error)
-			continue;
+		ea_inode = ea_inode_array->inodes[idx];
 		inode_lock(ea_inode);
 		ext4_orphan_add(handle, ea_inode);
 		inode_unlock(ea_inode);
@@ -2034,13 +2031,14 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
  */
 int
 ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_ino_array **lea_ino_array)
+			struct ext4_xattr_inode_array **ea_inode_array)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc;
 	struct ext4_xattr_entry *entry;
+	struct inode *ea_inode;
 	unsigned int ea_ino;
 	int credits = 3, error = 0;
 
@@ -2057,8 +2055,12 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_expand_ino_array(lea_ino_array, ea_ino);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error)
+			continue;
+		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
 		if (error) {
+			iput(ea_inode);
 			brelse(iloc.bh);
 			goto cleanup;
 		}
@@ -2070,7 +2072,7 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	if (!EXT4_I(inode)->i_file_acl) {
 		/* add xattr inode to orphan list */
 		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-						    *lea_ino_array);
+						    *ea_inode_array);
 		goto cleanup;
 	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
@@ -2093,7 +2095,10 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_expand_ino_array(lea_ino_array, ea_ino);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error)
+			continue;
+		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
 		if (error)
 			goto cleanup;
 		entry->e_value_inum = 0;
@@ -2101,7 +2106,7 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 	/* add xattr inode to orphan list */
 	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-					*lea_ino_array);
+					*ea_inode_array);
 	if (error)
 		goto cleanup;
 
@@ -2128,28 +2133,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	return error;
 }
 
-void
-ext4_xattr_inode_array_free(struct inode *inode,
-			    struct ext4_xattr_ino_array *lea_ino_array)
+void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 {
 	struct inode	*ea_inode;
 	int		idx = 0;
-	int		err;
 
-	if (lea_ino_array == NULL)
+	if (ea_inode_array == NULL)
 		return;
 
-	for (; idx < lea_ino_array->xia_count; ++idx) {
-		err = ext4_xattr_inode_iget(inode,
-				lea_ino_array->xia_inodes[idx], &ea_inode);
-		if (err)
-			continue;
-		/* for inode's i_count get from ext4_xattr_delete_inode */
-		iput(ea_inode);
+	for (; idx < ea_inode_array->count; ++idx) {
+		ea_inode = ea_inode_array->inodes[idx];
 		clear_nlink(ea_inode);
 		iput(ea_inode);
 	}
-	kfree(lea_ino_array);
+	kfree(ea_inode_array);
 }
 
 /*
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index e82c5fe36a26..323eba54f72f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -164,9 +164,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
 
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_ino_array **array);
-extern void ext4_xattr_inode_array_free(struct inode *inode,
-					struct ext4_xattr_ino_array *array);
+				   struct ext4_xattr_inode_array **array);
+extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 22/28] ext4: move struct ext4_xattr_inode_array to xattr.h
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (19 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 21/28] ext4: modify ext4_xattr_ino_array to hold struct inode * Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 23/28] mbcache: make mbcache more generic Tahsin Erdogan
                   ` (6 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Since this is an xattr-specific data structure, it is cleaner to keep it
in the xattr header file.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h  | 4 ----
 fs/ext4/xattr.h | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 603edb5ff304..580fdb753f29 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2232,10 +2232,6 @@ struct mmpd_data {
 # define ATTRIB_NORET	__attribute__((noreturn))
 # define NORET_AND	noreturn,
 
-struct ext4_xattr_inode_array {
-	unsigned int count;		/* # of used items in the array */
-	struct inode *inodes[0];
-};
 /* bitmap.c */
 extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
 void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 323eba54f72f..adf761518a73 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -117,6 +117,11 @@ struct ext4_xattr_ibody_find {
 	struct ext4_iloc iloc;
 };
 
+struct ext4_xattr_inode_array {
+	unsigned int count;		/* # of used items in the array */
+	struct inode *inodes[0];
+};
+
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 23/28] mbcache: make mbcache more generic
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (20 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 22/28] ext4: move struct ext4_xattr_inode_array to xattr.h Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-06-15  7:41     ` [Ocfs2-devel] " Jan Kara
  2017-05-31  8:15 ` [PATCH 24/28] ext4: rename mb block cache functions Tahsin Erdogan
                   ` (5 subsequent siblings)
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

The large xattr feature would like to use mbcache for xattr value
deduplication. The current implementation is geared towards xattr block
deduplication. Make it more generic so that it can be used by both.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext2/xattr.c         | 18 +++++++++---------
 fs/ext4/xattr.c         | 10 +++++-----
 fs/mbcache.c            | 43 +++++++++++++++++++++----------------------
 include/linux/mbcache.h | 14 ++++++++------
 4 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index fbdb8f171893..1e5f76070580 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -493,8 +493,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
 			 * This must happen under buffer lock for
 			 * ext2_xattr_set2() to reliably detect modified block
 			 */
-			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
-						    hash, bh->b_blocknr);
+			mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash,
+					      bh->b_blocknr);
 
 			/* keep the buffer locked while modifying it. */
 		} else {
@@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			 * This must happen under buffer lock for
 			 * ext2_xattr_set2() to reliably detect freed block
 			 */
-			mb_cache_entry_delete_block(ext2_mb_cache,
-						    hash, old_bh->b_blocknr);
+			mb_cache_entry_delete(ext2_mb_cache, hash,
+					      old_bh->b_blocknr);
 			/* Free the old block. */
 			ea_bdebug(old_bh, "freeing");
 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
@@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode)
 		 * This must happen under buffer lock for ext2_xattr_set2() to
 		 * reliably detect freed block
 		 */
-		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
-					    hash, bh->b_blocknr);
+		mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash,
+				      bh->b_blocknr);
 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
 		get_bh(bh);
 		bforget(bh);
@@ -907,11 +907,11 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 	while (ce) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, ce->e_block);
+		bh = sb_bread(inode->i_sb, ce->e_value);
 		if (!bh) {
 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
 				"inode %ld: block %ld read error",
-				inode->i_ino, (unsigned long) ce->e_block);
+				inode->i_ino, (unsigned long) ce->e_value);
 		} else {
 			lock_buffer(bh);
 			/*
@@ -931,7 +931,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
 				   EXT2_XATTR_REFCOUNT_MAX) {
 				ea_idebug(inode, "block %ld refcount %d>%d",
-					  (unsigned long) ce->e_block,
+					  (unsigned long) ce->e_value,
 					  le32_to_cpu(HDR(bh)->h_refcount),
 					  EXT2_XATTR_REFCOUNT_MAX);
 			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 886d06e409b6..772948f168c3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		 * This must happen under buffer lock for
 		 * ext4_xattr_block_set() to reliably detect freed block
 		 */
-		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
+		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
@@ -1115,8 +1115,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			 * ext4_xattr_block_set() to reliably detect modified
 			 * block
 			 */
-			mb_cache_entry_delete_block(ext4_mb_cache, hash,
-						    bs->bh->b_blocknr);
+			mb_cache_entry_delete(ext4_mb_cache, hash,
+					      bs->bh->b_blocknr);
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s, handle, inode);
 			if (!error) {
@@ -2238,10 +2238,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 	while (ce) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, ce->e_block);
+		bh = sb_bread(inode->i_sb, ce->e_value);
 		if (!bh) {
 			EXT4_ERROR_INODE(inode, "block %lu read error",
-					 (unsigned long) ce->e_block);
+					 (unsigned long) ce->e_value);
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
 			*pce = ce;
 			return bh;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index b19be429d655..77a5b99d8f92 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -10,7 +10,7 @@
 /*
  * Mbcache is a simple key-value store. Keys need not be unique, however
  * key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete_block()).
+ * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  * They use hash of a block contents as a key and block number as a value.
@@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
  * @cache - cache where the entry should be created
  * @mask - gfp mask with which the entry should be allocated
  * @key - key of the entry
- * @block - block that contains data
- * @reusable - is the block reusable by other inodes?
+ * @value - value of the entry
+ * @reusable - is the entry reusable by others?
  *
- * Creates entry in @cache with key @key and records that data is stored in
- * block @block. The function returns -EBUSY if entry with the same key
- * and for the same block already exists in cache. Otherwise 0 is returned.
+ * Creates entry in @cache with key @key and value @value. The function returns
+ * -EBUSY if entry with the same key and value already exists in cache.
+ * Otherwise 0 is returned.
  */
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-			  sector_t block, bool reusable)
+			  cache_value_t value, bool reusable)
 {
 	struct mb_cache_entry *entry, *dup;
 	struct hlist_bl_node *dup_node;
@@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 	/* One ref for hash, one ref returned */
 	atomic_set(&entry->e_refcnt, 1);
 	entry->e_key = key;
-	entry->e_block = block;
+	entry->e_value = value;
 	entry->e_reusable = reusable;
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
-		if (dup->e_key == key && dup->e_block == block) {
+		if (dup->e_key == key && dup->e_value == value) {
 			hlist_bl_unlock(head);
 			kmem_cache_free(mb_entry_cache, entry);
 			return -EBUSY;
@@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 
 /*
- * mb_cache_entry_get - get a cache entry by block number (and key)
+ * mb_cache_entry_get - get a cache entry by value (and key)
  * @cache - cache we work with
- * @key - key of block number @block
- * @block - block number
+ * @key - key
+ * @value - value
  */
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
-					  sector_t block)
+					  cache_value_t value)
 {
 	struct hlist_bl_node *node;
 	struct hlist_bl_head *head;
@@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_block == block) {
+		if (entry->e_key == key && entry->e_value == value) {
 			atomic_inc(&entry->e_refcnt);
 			goto out;
 		}
@@ -214,15 +214,14 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete_block - remove information about block from cache
+/* mb_cache_entry_delete - remove a cache entry
  * @cache - cache we work with
- * @key - key of block @block
- * @block - block number
+ * @key - key
+ * @value - value
  *
- * Remove entry from cache @cache with key @key with data stored in @block.
+ * Remove entry from cache @cache with key @key and value @value.
  */
-void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
-				 sector_t block)
+void mb_cache_entry_delete(struct mb_cache *cache, u32 key, cache_value_t value)
 {
 	struct hlist_bl_node *node;
 	struct hlist_bl_head *head;
@@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_block == block) {
+		if (entry->e_key == key && entry->e_value == value) {
 			/* We keep hash list reference to keep entry alive */
 			hlist_bl_del_init(&entry->e_hash_list);
 			hlist_bl_unlock(head);
@@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
 	}
 	hlist_bl_unlock(head);
 }
-EXPORT_SYMBOL(mb_cache_entry_delete_block);
+EXPORT_SYMBOL(mb_cache_entry_delete);
 
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 86c9a8b480c5..e2d9f2f926a4 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -9,6 +9,8 @@
 
 struct mb_cache;
 
+typedef sector_t cache_value_t;
+
 struct mb_cache_entry {
 	/* List of entries in cache - protected by cache->c_list_lock */
 	struct list_head	e_list;
@@ -19,15 +21,15 @@ struct mb_cache_entry {
 	u32			e_key;
 	u32			e_referenced:1;
 	u32			e_reusable:1;
-	/* Block number of hashed block - stable during lifetime of the entry */
-	sector_t		e_block;
+	/* User provided value - stable during lifetime of the entry */
+	cache_value_t		e_value;
 };
 
 struct mb_cache *mb_cache_create(int bucket_bits);
 void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-			  sector_t block, bool reusable);
+			  cache_value_t value, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
@@ -38,10 +40,10 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
 	return 1;
 }
 
-void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
-				  sector_t block);
+void mb_cache_entry_delete(struct mb_cache *cache, u32 key,
+			   cache_value_t value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
-					  sector_t block);
+					  cache_value_t value);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 						 u32 key);
 struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 24/28] ext4: rename mb block cache functions
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (21 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 23/28] mbcache: make mbcache more generic Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 25/28] ext4: add ext4_is_quota_file() Tahsin Erdogan
                   ` (4 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

There will be a second cache instance that tracks ea_inodes. Make the
existing names more explicit so that it is clear that they refer to the
xattr block cache.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/xattr.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 772948f168c3..3ee7e2f68476 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -72,10 +72,11 @@
 # define ea_bdebug(bh, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
-static struct buffer_head *ext4_xattr_cache_find(struct inode *,
-						 struct ext4_xattr_header *,
-						 struct mb_cache_entry **);
+static void ext4_xattr_block_cache_insert(struct mb_cache *,
+					  struct buffer_head *);
+static struct buffer_head *
+ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
+			    struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
 			      struct ext4_xattr_entry *);
 
@@ -395,7 +396,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(ext4_mb_cache, bh);
+	ext4_xattr_block_cache_insert(ext4_mb_cache, bh);
 	entry = BFIRST(bh);
 	error = ext4_xattr_find_entry(&entry, name_index, name, 1);
 	if (error)
@@ -563,7 +564,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(ext4_mb_cache, bh);
+	ext4_xattr_block_cache_insert(ext4_mb_cache, bh);
 	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
@@ -1123,8 +1124,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 				if (!IS_LAST_ENTRY(s->first))
 					ext4_xattr_rehash(header(s->base),
 							  s->here);
-				ext4_xattr_cache_insert(ext4_mb_cache,
-					bs->bh);
+				ext4_xattr_block_cache_insert(ext4_mb_cache,
+							      bs->bh);
 			}
 			ext4_xattr_block_csum_set(inode, bs->bh);
 			unlock_buffer(bs->bh);
@@ -1177,7 +1178,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 inserted:
 	if (!IS_LAST_ENTRY(s->first)) {
-		new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
+		new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
+						     &ce);
 		if (new_bh) {
 			/* We found an identical block in the cache. */
 			if (new_bh == bs->bh)
@@ -1292,7 +1294,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			ext4_xattr_block_csum_set(inode, new_bh);
 			set_buffer_uptodate(new_bh);
 			unlock_buffer(new_bh);
-			ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+			ext4_xattr_block_cache_insert(ext4_mb_cache, new_bh);
 			error = ext4_handle_dirty_metadata(handle, inode,
 							   new_bh);
 			if (error)
@@ -2150,15 +2152,16 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 }
 
 /*
- * ext4_xattr_cache_insert()
+ * ext4_xattr_block_cache_insert()
  *
- * Create a new entry in the extended attribute cache, and insert
+ * Create a new entry in the extended attribute block cache, and insert
  * it unless such an entry is already in the cache.
  *
  * Returns 0, or a negative error number on failure.
  */
 static void
-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
+ext4_xattr_block_cache_insert(struct mb_cache *ext4_mb_cache,
+			      struct buffer_head *bh)
 {
 	struct ext4_xattr_header *header = BHDR(bh);
 	__u32 hash = le32_to_cpu(header->h_hash);
@@ -2216,7 +2219,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 }
 
 /*
- * ext4_xattr_cache_find()
+ * ext4_xattr_block_cache_find()
  *
  * Find an identical extended attribute block.
  *
@@ -2224,8 +2227,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
  * not found or an error occurred.
  */
 static struct buffer_head *
-ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
-		      struct mb_cache_entry **pce)
+ext4_xattr_block_cache_find(struct inode *inode,
+			    struct ext4_xattr_header *header,
+			    struct mb_cache_entry **pce)
 {
 	__u32 hash = le32_to_cpu(header->h_hash);
 	struct mb_cache_entry *ce;
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 25/28] ext4: add ext4_is_quota_file()
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (22 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 24/28] ext4: rename mb block cache functions Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 26/28] ext4: cleanup transaction restarts during inode deletion Tahsin Erdogan
                   ` (3 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

IS_NOQUOTA() indicates whether quota is disabled for an inode. Ext4
also uses it to check whether an inode is for a quota file. The
distinction currently doesn't matter because quota is disabled only
for the quota files. When we start disabling quota for other inodes
in the future, we will want to make the distinction clear.

Replace IS_NOQUOTA() call with ext4_is_quota_file() at places where
we are checking for quota files.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h        | 2 ++
 fs/ext4/inode.c       | 2 +-
 fs/ext4/ioctl.c       | 4 ++--
 fs/ext4/mballoc.c     | 2 +-
 fs/ext4/move_extent.c | 2 +-
 5 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 580fdb753f29..d79d8d7bee88 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2099,6 +2099,8 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
 	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
 }
 
+#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
+
 /*
  * This structure is stuffed into the struct file's private_data field
  * for directories.  It is where we put information so that we can do
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8ee20b586567..cf91532765a4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -739,7 +739,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		if (map->m_flags & EXT4_MAP_NEW &&
 		    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
 		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
-		    !IS_NOQUOTA(inode) &&
+		    !ext4_is_quota_file(inode) &&
 		    ext4_should_order_data(inode)) {
 			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
 				ret = ext4_jbd2_inode_add_wait(handle, inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0c21e22acd74..dde8deb11e59 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -218,7 +218,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	unsigned int jflag;
 
 	/* Is it quota file? Do not allow user to mess with it */
-	if (IS_NOQUOTA(inode))
+	if (ext4_is_quota_file(inode))
 		goto flags_out;
 
 	oldflags = ei->i_flags;
@@ -342,7 +342,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 	err = -EPERM;
 	inode_lock(inode);
 	/* Is it quota file? Do not allow user to mess with it */
-	if (IS_NOQUOTA(inode))
+	if (ext4_is_quota_file(inode))
 		goto out_unlock;
 
 	err = ext4_get_inode_loc(inode, &iloc);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b7928cddd539..d109a2a2fea0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4464,7 +4464,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	trace_ext4_request_blocks(ar);
 
 	/* Allow to use superuser reservation for quota file */
-	if (IS_NOQUOTA(ar->inode))
+	if (ext4_is_quota_file(ar->inode))
 		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
 
 	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c992ef2c2f94..9bb36909ec92 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -484,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EBUSY;
 	}
 
-	if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
+	if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
 			"not be quota files [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 26/28] ext4: cleanup transaction restarts during inode deletion
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (23 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 25/28] ext4: add ext4_is_quota_file() Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-06-14 14:17   ` [PATCH v2 " Tahsin Erdogan
  2017-05-31  8:15 ` [PATCH 27/28] ext4: xattr inode deduplication Tahsin Erdogan
                   ` (2 subsequent siblings)
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

During inode deletion, the journal credits that will be needed are hard
to determine, which is why we have journal extend/restart calls in
several places. Whenever a transaction is restarted, the filesystem must
be in a consistent state because there is no atomicity guarantee beyond
a restart call.

Add ext4_xattr_ensure_credits() helper function which takes care of
journal extend/restart logic. It also handles getting jbd2 write access
and dirty metadata calls. This function is called at every iteration of
handling an ea_inode reference.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/inode.c |  66 ++++-----------
 fs/ext4/xattr.c | 257 ++++++++++++++++++++++++++++++++++++--------------------
 fs/ext4/xattr.h |   3 +-
 3 files changed, 183 insertions(+), 143 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cf91532765a4..4d6936f0d8a4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode)
 	 */
 	sb_start_intwrite(inode->i_sb);
 
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
+	if (!IS_NOQUOTA(inode))
+		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				 ext4_blocks_for_truncate(inode)+extra_credits);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode)
 		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
+
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
-
-	/*
-	 * Delete xattr inode before deleting the main inode.
-	 */
-	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
-	if (err) {
-		ext4_warning(inode->i_sb,
-			     "couldn't delete inode's xattr (err %d)", err);
-		goto stop_handle;
-	}
-
-	if (!IS_NOQUOTA(inode))
-		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
-
-	if (!ext4_handle_has_enough_credits(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits)) {
-		err = ext4_journal_extend(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits);
-		if (err > 0)
-			err = ext4_journal_restart(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits);
-		if (err != 0) {
-			ext4_warning(inode->i_sb,
-				     "couldn't extend journal (err %d)", err);
-			goto stop_handle;
-		}
-	}
-
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode)
 		}
 	}
 
-	/*
-	 * ext4_ext_truncate() doesn't reserve any slop when it
-	 * restarts journal transactions; therefore there may not be
-	 * enough credits left in the handle to remove the inode from
-	 * the orphan list and set the dtime field.
-	 */
-	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
-		err = ext4_journal_extend(handle, extra_credits);
-		if (err > 0)
-			err = ext4_journal_restart(handle, extra_credits);
-		if (err != 0) {
-			ext4_warning(inode->i_sb,
-				     "couldn't extend journal (err %d)", err);
-		stop_handle:
-			ext4_journal_stop(handle);
-			ext4_orphan_del(NULL, inode);
-			sb_end_intwrite(inode->i_sb);
-			goto no_delete;
-		}
+	/* Remove xattr references. */
+	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
+				      extra_credits);
+	if (err) {
+		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
+	stop_handle:
+		ext4_journal_stop(handle);
+		ext4_orphan_del(NULL, inode);
+		sb_end_intwrite(inode->i_sb);
+		ext4_xattr_inode_array_free(ea_inode_array);
+		goto no_delete;
 	}
 
 	/*
@@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
-
 	ext4_journal_stop(handle);
 	sb_end_intwrite(inode->i_sb);
 	ext4_xattr_inode_array_free(ea_inode_array);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3ee7e2f68476..6acce1f689ab 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+static int
+ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
+			struct inode *inode);
+
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -653,6 +657,127 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
+			      int credits, struct buffer_head *bh,
+			      bool dirty, bool block_csum)
+{
+	int error;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	if (handle->h_buffer_credits >= credits)
+		return 0;
+
+	error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
+	if (!error)
+		return 0;
+	if (error < 0) {
+		ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
+		return error;
+	}
+
+	if (bh && dirty) {
+		if (block_csum)
+			ext4_xattr_block_csum_set(inode, bh);
+		error = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (error) {
+			ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+				     error);
+			return error;
+		}
+	}
+
+	error = ext4_journal_restart(handle, credits);
+	if (error) {
+		ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
+		return error;
+	}
+
+	if (bh) {
+		error = ext4_journal_get_write_access(handle, bh);
+		if (error) {
+			ext4_warning(inode->i_sb,
+				     "Get write access failed (error %d)",
+				     error);
+			return error;
+		}
+	}
+	return 0;
+}
+
+static void
+ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
+			    struct buffer_head *bh,
+			    struct ext4_xattr_entry *first, bool block_csum,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	bool dirty = false;
+	unsigned int ea_ino;
+	int err;
+	int credits;
+
+	/* One credit for dec ref on ea_inode, one for orphan list addition, */
+	credits = 2 + extra_credits;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			continue;
+
+		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode,
+					   "Expand inode array err=%d", err);
+			iput(ea_inode);
+			continue;
+		}
+
+		err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
+						dirty, block_csum);
+		if (err) {
+			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
+					   err);
+			continue;
+		}
+
+		inode_lock(ea_inode);
+		clear_nlink(ea_inode);
+		ext4_orphan_add(handle, ea_inode);
+		inode_unlock(ea_inode);
+
+		/*
+		 * Forget about ea_inode within the same transaction that decrements the ref
+		 * count. This avoids duplicate decrements in case the rest of the work
+		 * spills over to subsequent transactions.
+		 */
+		entry->e_value_inum = 0;
+		entry->e_value_size = 0;
+
+		dirty = true;
+	}
+
+	if (dirty) {
+		/*
+		 * Note that we are deliberately skipping csum calculation for
+		 * the final update because we do not expect any journal
+		 * restarts until xattr block is freed.
+		 */
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			ext4_warning_inode(parent,
+					   "handle dirty metadata err=%d", err);
+	}
+}
+
 /*
  * Release the xattr block BH: If the reference count is > 1, decrement it;
  * otherwise free the block.
@@ -1985,42 +2110,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 	return 0;
 }
 
-/**
- * Add xattr inode to orphan list
- */
-static int
-ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
-			    struct ext4_xattr_inode_array *ea_inode_array)
-{
-	int idx = 0, error = 0;
-	struct inode *ea_inode;
-
-	if (ea_inode_array == NULL)
-		return 0;
-
-	for (; idx < ea_inode_array->count; ++idx) {
-		if (!ext4_handle_has_enough_credits(handle, credits)) {
-			error = ext4_journal_extend(handle, credits);
-			if (error > 0)
-				error = ext4_journal_restart(handle, credits);
-
-			if (error != 0) {
-				ext4_warning(inode->i_sb,
-					"couldn't extend journal "
-					"(err %d)", error);
-				return error;
-			}
-		}
-		ea_inode = ea_inode_array->inodes[idx];
-		inode_lock(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
-		/* the inode's i_count will be released by caller */
-	}
-
-	return 0;
-}
-
 /*
  * ext4_xattr_delete_inode()
  *
@@ -2033,16 +2122,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
  */
 int
 ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array)
+			struct ext4_xattr_inode_array **ea_inode_array,
+			int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
-	struct ext4_iloc iloc;
-	struct ext4_xattr_entry *entry;
-	struct inode *ea_inode;
-	unsigned int ea_ino;
-	int credits = 3, error = 0;
+	struct ext4_iloc iloc = { .bh = NULL };
+	int error;
+
+	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
+					  NULL /* bh */,
+					  false /* dirty */,
+					  false /* block_csum */);
+	if (error) {
+		EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
+		goto cleanup;
+	}
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		goto delete_external_ea;
@@ -2050,31 +2146,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
 		goto cleanup;
+
+	error = ext4_journal_get_write_access(handle, iloc.bh);
+	if (error)
+		goto cleanup;
+
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
-	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
-	     entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_inum)
-			continue;
-		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-		if (error)
-			continue;
-		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (error) {
-			iput(ea_inode);
-			brelse(iloc.bh);
-			goto cleanup;
-		}
-		entry->e_value_inum = 0;
-	}
-	brelse(iloc.bh);
+	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
+				    false /* block_csum */, ea_inode_array,
+				    extra_credits);
 
 delete_external_ea:
 	if (!EXT4_I(inode)->i_file_acl) {
-		/* add xattr inode to orphan list */
-		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-						    *ea_inode_array);
+		error = 0;
 		goto cleanup;
 	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
@@ -2092,46 +2177,32 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
-	     entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_inum)
-			continue;
-		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-		if (error)
-			continue;
-		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (error)
-			goto cleanup;
-		entry->e_value_inum = 0;
-	}
-
-	/* add xattr inode to orphan list */
-	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-					*ea_inode_array);
-	if (error)
-		goto cleanup;
-
-	if (!IS_NOQUOTA(inode))
-		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
-
-	if (!ext4_handle_has_enough_credits(handle, credits)) {
-		error = ext4_journal_extend(handle, credits);
-		if (error > 0)
-			error = ext4_journal_restart(handle, credits);
+	if (ext4_has_feature_ea_inode(inode->i_sb)) {
+		error = ext4_journal_get_write_access(handle, bh);
 		if (error) {
-			ext4_warning(inode->i_sb,
-				"couldn't extend journal (err %d)", error);
+			EXT4_ERROR_INODE(inode, "write access %llu",
+					 EXT4_I(inode)->i_file_acl);
 			goto cleanup;
 		}
+		ext4_xattr_inode_remove_all(handle, inode, bh,
+					    BFIRST(bh),
+					    true /* block_csum */,
+					    ea_inode_array,
+					    extra_credits);
 	}
 
 	ext4_xattr_release_block(handle, inode, bh);
+	/* Update i_file_acl within the same transaction that releases block. */
 	EXT4_I(inode)->i_file_acl = 0;
-
+	error = ext4_mark_inode_dirty(handle, inode);
+	if (error) {
+		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+				 error);
+		goto cleanup;
+	}
 cleanup:
+	brelse(iloc.bh);
 	brelse(bh);
-
 	return error;
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index adf761518a73..b2005a2716d9 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
 
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_inode_array **array);
+				   struct ext4_xattr_inode_array **array,
+				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 27/28] ext4: xattr inode deduplication
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (24 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 26/28] ext4: cleanup transaction restarts during inode deletion Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-05-31 15:40     ` [Ocfs2-devel] " kbuild test robot
                     ` (2 more replies)
  2017-05-31  8:15 ` [PATCH 28/28] quota: add extra inode count to dquot transfer functions Tahsin Erdogan
  2017-05-31 16:42   ` Darrick J. Wong
  27 siblings, 3 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Ext4 now supports xattr values that are up to 64k in size (vfs limit).
Large xattr values are stored in external inodes each one holding a
single value. Once written the data blocks of these inodes are immutable.

The real world use cases are expected to have a lot of value duplication
such as inherited acls etc. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a
best-effort sharing scheme. When an xattr inode is read from disk (i.e.
by a getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal the ref count is decremented and if it reaches zero the inode is
deleted.

The quota charging for such inodes is manually managed. Every reference
holder is charged the full size as if there was no sharing happening.
This is consistent with how xattr blocks are also charged.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/acl.c   |    5 +-
 fs/ext4/ext4.h  |    7 +-
 fs/ext4/inode.c |    9 +-
 fs/ext4/super.c |   22 +-
 fs/ext4/xattr.c | 1073 +++++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/xattr.h |   17 +-
 fs/mbcache.c    |    9 +-
 7 files changed, 881 insertions(+), 261 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 74f7ac539e00..8db03e5c78bc 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (error)
 		return error;
 retry:
-	credits = ext4_xattr_set_credits(inode, acl_size);
+	error = ext4_xattr_set_credits(inode, acl_size, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d79d8d7bee88..79f06290e723 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1517,6 +1517,7 @@ struct ext4_sb_info {
 	long s_es_nr_inode;
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
+	struct mb_cache *s_ea_inode_cache;
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
@@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
 	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
 }
 
-#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+	return IS_NOQUOTA(inode) &&
+	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
 
 /*
  * This structure is stuffed into the struct file's private_data field
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d6936f0d8a4..6f5872197d6c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-	if (ei->i_flags & EXT4_EA_INODE_FL)
+
+	if (ei->i_flags & EXT4_EA_INODE_FL) {
 		ext4_xattr_inode_set_class(inode);
+
+		inode_lock(inode);
+		inode->i_flags |= S_NOQUOTA;
+		inode_unlock(inode);
+	}
+
 	unlock_new_inode(inode);
 	return inode;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b02a23ec92ca..7d2b692d52ea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
@@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	credits = ext4_xattr_set_credits(inode, len);
+	res = ext4_xattr_set_credits(inode, len, &credits);
+	if (res)
+		return res;
+
 	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -4067,6 +4074,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_ea_inode(sb)) {
+		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+		if (!sbi->s_ea_inode_cache) {
+			ext4_msg(sb, KERN_ERR,
+				 "Failed to create an s_ea_inode_cache");
+			goto failed_mount_wq;
+		}
+	}
+
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
@@ -4296,6 +4312,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6acce1f689ab..caddc176a612 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -55,6 +55,7 @@
 #include <linux/slab.h>
 #include <linux/mbcache.h>
 #include <linux/quotaops.h>
+#include <linux/crc32c.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "xattr.h"
@@ -79,6 +80,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
 			    struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
 			      struct ext4_xattr_entry *);
+static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
 
 static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
@@ -105,13 +107,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
+#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
+
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_ea_inode_cache)
+
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
 
+static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
+				 u32 hash);
+static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
+				     u64 *ref_return, u32 *hash);
+
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -329,14 +341,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 		goto error;
 	}
 
-	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
-	    inode->i_generation != parent->i_generation) {
-		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
-			   "to parent is invalid.", ea_ino);
-		err = -EINVAL;
-		goto error;
-	}
-
 	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
 		ext4_error(parent->i_sb, "EA inode %lu does not have "
 			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
@@ -351,6 +355,11 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 	return err;
 }
 
+static u32 ext4_xattr_inode_hash(const void *buffer, size_t size)
+{
+	return crc32c(0, buffer, size);
+}
+
 /*
  * Read the value from the EA inode.
  */
@@ -358,17 +367,52 @@ static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
 		     size_t size)
 {
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
 	struct inode *ea_inode;
-	int ret;
+	u32 hash, calc_hash;
+	struct mb_cache_entry *ce;
+	int err;
 
-	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (ret)
-		return ret;
+	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+	if (err) {
+		ea_inode = NULL;
+		goto out;
+	}
 
-	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
-	iput(ea_inode);
+	if (i_size_read(ea_inode) != size) {
+		ext4_warning_inode(ea_inode,
+				   "ea_inode file size=%llu entry size=%zu",
+				   i_size_read(ea_inode), size);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
 
-	return ret;
+	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	if (!err) {
+		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
+			goto out;
+
+		/* Avoid hash calculation if already cached. */
+		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
+		if (ce) {
+			mb_cache_entry_put(ea_inode_cache, ce);
+			goto out;
+		}
+
+		calc_hash = ext4_xattr_inode_hash(buffer, size);
+		if (hash != calc_hash) {
+			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
+					   "does not match calc_hash=%#x",
+					   hash, calc_hash);
+			goto out;
+		}
+
+		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+				      ea_inode->i_ino, true /* reusable */);
+	}
+out:
+	iput(ea_inode);
+	return err;
 }
 
 static int
@@ -657,6 +701,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+/*
+ * Round @length up to a multiple of the cluster size in bytes, where the
+ * cluster size is 1 << (s_cluster_bits + i_blkbits).
+ */
+static inline size_t round_up_cluster(struct inode *inode, size_t length)
+{
+	size_t bytes_per_cluster = 1 << (EXT4_SB(inode->i_sb)->s_cluster_bits +
+					 inode->i_blkbits);
+
+	return (length + bytes_per_cluster - 1) & ~(bytes_per_cluster - 1);
+}
+
+/*
+ * Charge @inode's quota for one inode plus @len bytes rounded up to a whole
+ * cluster. If the space charge fails, the inode charge is rolled back.
+ * Returns 0 on success or the error from the failing dquot call.
+ */
+static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
+{
+	int ret = dquot_alloc_inode(inode);
+
+	if (ret)
+		return ret;
+
+	ret = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
+	if (!ret)
+		return 0;
+
+	/* Space charge failed: undo the inode charge taken above. */
+	dquot_free_inode(inode);
+	return ret;
+}
+
+/*
+ * Release the quota charged to @inode for an EA inode value of @len bytes:
+ * the cluster-rounded space first, then the inode charge.
+ */
+static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+{
+	size_t charged = round_up_cluster(inode, len);
+
+	dquot_free_space_nodirty(inode, charged);
+	dquot_free_inode(inode);
+}
+
+/*
+ * Estimate the journal credits needed to set an xattr value of @value_len
+ * bytes on filesystem @sb. @block_bh is the inode's current xattr block, or
+ * NULL if it has none.
+ */
+static int __ext4_xattr_set_credits(struct super_block *sb,
+				    struct buffer_head *block_bh,
+				    size_t value_len)
+{
+	struct ext4_xattr_entry *entry;
+	int new_blocks;
+	int old_blocks;
+	int credits;
+
+	/*
+	 * Base cost: owner inode update, ref count update on the old xattr
+	 * block, the new xattr block, and the block bitmap and group
+	 * descriptor updates for that new block.
+	 */
+	credits = 5;
+
+	/* Nothing more to reserve when the ea_inode feature is disabled. */
+	if (!ext4_has_feature_ea_inode(sb))
+		return credits;
+
+	/* Data blocks of the new ea_inode plus one indirection block. */
+	new_blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	new_blocks += 1;
+
+	/* Data blocks of the old ea_inode plus one indirection block. */
+	old_blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
+	old_blocks += 1;
+
+	/* New ea_inode, inode map, block bitmap, group descriptor. */
+	credits += 4;
+
+	/*
+	 * Each new block costs the block itself plus its block bitmap and
+	 * group descriptor updates.
+	 */
+	credits += new_blocks * 3;
+
+	/*
+	 * Dereferencing the ea_inode holding the old value: old ea_inode,
+	 * inode map, block bitmap, group descriptor.
+	 */
+	credits += 4;
+
+	/* Block bitmap and group descriptor updates for each old block. */
+	credits += old_blocks * 2;
+
+	/* Quota updates. */
+	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
+
+	if (!block_bh)
+		return credits;
+
+	/*
+	 * The existing xattr block may have to be cloned, in which case every
+	 * ea_inode it references needs a ref count update.
+	 */
+	for (entry = BFIRST(block_bh); !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (entry->e_value_inum)
+			credits += 1;
+	}
+	return credits;
+}
+
 int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 			      int credits, struct buffer_head *bh,
 			      bool dirty, bool block_csum)
@@ -706,12 +845,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+/*
+ * Adjust the reference count stored in @ea_inode by @ref_change, with the
+ * EA inode's inode lock held for the duration.
+ *
+ * When an increment brings the count to 1, the inode gets i_nlink = 1, is
+ * removed from the orphan list and is added to the EA inode cache under its
+ * stored hash. When a decrement brings the count to 0, i_nlink is cleared,
+ * the inode is added to the orphan list and its cache entry is deleted.
+ *
+ * Returns 0 on success or the error from ext4_reserve_inode_write(),
+ * ext4_xattr_update_ea_info() or ext4_mark_iloc_dirty().
+ */
+static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+				       int ref_change)
+{
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+	struct ext4_iloc iloc;
+	s64 ref_return;
+	u32 hash;
+	int ret;
+
+	inode_lock(ea_inode);
+
+	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
+	if (ret) {
+		/* No buffer was obtained; make the brelse() at out: a no-op. */
+		iloc.bh = NULL;
+		goto out;
+	}
+
+	/* ref_return receives the count after applying ref_change. */
+	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
+					&hash);
+	if (ret)
+		goto out;
+
+	if (ref_change > 0) {
+		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
+			  ea_inode->i_ino, ref_return);
+
+		/* First reference: bring the inode back into use. */
+		if (ref_return == 1) {
+			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			set_nlink(ea_inode, 1);
+			ext4_orphan_del(handle, ea_inode);
+
+			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+					      ea_inode->i_ino,
+					      true /* reusable */);
+		}
+	} else {
+		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
+			  ea_inode->i_ino, ref_return);
+
+		/* Last reference dropped: orphan the inode and forget it. */
+		if (ref_return == 0) {
+			WARN_ONCE(ea_inode->i_nlink != 1,
+				  "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			clear_nlink(ea_inode);
+			ext4_orphan_add(handle, ea_inode);
+
+			mb_cache_entry_delete(ea_inode_cache, hash,
+					      ea_inode->i_ino);
+		}
+	}
+
+	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
+	/*
+	 * NOTE(review): presumably ext4_mark_iloc_dirty() has already released
+	 * iloc.bh, hence the reset before the shared brelse() -- confirm.
+	 */
+	iloc.bh = NULL;
+	if (ret)
+		ext4_warning_inode(ea_inode,
+				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
+out:
+	brelse(iloc.bh);
+	inode_unlock(ea_inode);
+	return ret;
+}
+
+/* Take one additional reference on @ea_inode (a ref_change of +1). */
+static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
+{
+	const int ref_change = 1;
+
+	return ext4_xattr_inode_update_ref(handle, ea_inode, ref_change);
+}
+
+/* Drop one reference on @ea_inode (a ref_change of -1). */
+static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
+{
+	const int ref_change = -1;
+
+	return ext4_xattr_inode_update_ref(handle, ea_inode, ref_change);
+}
+
+/*
+ * Take a reference on every EA inode named by the xattr entries starting at
+ * @first. If one of them fails, the references this call already took are
+ * dropped again (failures while undoing are only logged) and the original
+ * error is returned. Returns 0 when every reference was taken.
+ */
+static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
+					struct ext4_xattr_entry *first)
+{
+	struct ext4_xattr_entry *cur;
+	struct ext4_xattr_entry *stop;
+	struct inode *ea_inode;
+	unsigned int ino;
+	int ret, undo_err;
+
+	for (cur = first; !IS_LAST_ENTRY(cur); cur = EXT4_XATTR_NEXT(cur)) {
+		if (!cur->e_value_inum)
+			continue;
+		ino = le32_to_cpu(cur->e_value_inum);
+		ret = ext4_xattr_inode_iget(parent, ino, &ea_inode);
+		if (ret)
+			goto undo;
+		ret = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (ret)
+			ext4_warning_inode(ea_inode, "inc ref error %d", ret);
+		iput(ea_inode);
+		if (ret)
+			goto undo;
+	}
+	return 0;
+
+undo:
+	/* Release the entries that precede the one that failed. */
+	stop = cur;
+	for (cur = first; cur != stop; cur = EXT4_XATTR_NEXT(cur)) {
+		if (!cur->e_value_inum)
+			continue;
+		ino = le32_to_cpu(cur->e_value_inum);
+		undo_err = ext4_xattr_inode_iget(parent, ino, &ea_inode);
+		if (undo_err) {
+			ext4_warning(parent->i_sb,
+				     "cleanup ea_ino %u iget error %d", ino,
+				     undo_err);
+			continue;
+		}
+		undo_err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (undo_err)
+			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
+					   undo_err);
+		iput(ea_inode);
+	}
+	return ret;
+}
+
 static void
-ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
-			    struct buffer_head *bh,
-			    struct ext4_xattr_entry *first, bool block_csum,
-			    struct ext4_xattr_inode_array **ea_inode_array,
-			    int extra_credits)
+ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
+			     struct buffer_head *bh,
+			     struct ext4_xattr_entry *first, bool block_csum,
+			     struct ext4_xattr_inode_array **ea_inode_array,
+			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
@@ -748,10 +1014,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
 			continue;
 		}
 
-		inode_lock(ea_inode);
-		clear_nlink(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
+					   err);
+			continue;
+		}
+
+		if (!skip_quota)
+			ext4_xattr_inode_free_quota(parent,
+					      le32_to_cpu(entry->e_value_size));
 
 		/*
 		 * Forget about ea_inode within the same transaction that decrements the ref
@@ -784,7 +1056,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
  */
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
-			 struct buffer_head *bh)
+			 struct buffer_head *bh,
+			 struct ext4_xattr_inode_array **ea_inode_array,
+			 int extra_credits)
 {
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 	u32 hash, ref;
@@ -807,6 +1081,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
+
+		if (ext4_has_feature_ea_inode(inode->i_sb))
+			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
+						     BFIRST(bh),
+						     true /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -947,7 +1229,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
  * Create an inode to store the value of a large EA.
  */
 static struct inode *ext4_xattr_inode_create(handle_t *handle,
-					     struct inode *inode)
+					     struct inode *inode, u32 hash)
 {
 	struct inode *ea_inode = NULL;
 	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
@@ -965,67 +1247,118 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		ea_inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(ea_inode);
 		ext4_xattr_inode_set_class(ea_inode);
-		ea_inode->i_generation = inode->i_generation;
-		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
-
-		/*
-		 * A back-pointer from EA inode to parent inode will be useful
-		 * for e2fsck.
-		 */
-		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
 		unlock_new_inode(ea_inode);
 		err = ext4_inode_attach_jinode(ea_inode);
+		if (!err)
+			err = ext4_xattr_inode_init(handle, ea_inode, hash);
 		if (err) {
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
+
+		/*
+		 * Xattr inodes are shared therefore quota charging is performed
+		 * at a higher level.
+		 */
+		dquot_free_inode(ea_inode);
+		dquot_drop(ea_inode);
+		inode_lock(ea_inode);
+		ea_inode->i_flags |= S_NOQUOTA;
+		inode_unlock(ea_inode);
 	}
 
 	return ea_inode;
 }
 
-/*
- * Unlink the inode storing the value of the EA.
- */
-int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+/*
+ * Look for an existing EA inode whose contents equal @value (@value_len
+ * bytes), walking the cache entries stored under @hash.
+ *
+ * Returns the matching inode with a reference held (the caller must iput()
+ * it), or NULL when no candidate matches or the comparison buffer cannot be
+ * allocated.
+ */
+static struct inode *
+ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
+			    size_t value_len, u32 hash)
 {
-	struct inode *ea_inode = NULL;
+	struct inode *ea_inode;
+	struct mb_cache_entry *ce;
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+	void *ea_data = NULL;
 	int err;
 
-	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (err)
-		return err;
+	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
+	while (ce) {
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		if (IS_ERR(ea_inode)) {
+			/* Make the iput() at next: a no-op. */
+			ea_inode = NULL;
+			goto next;
+		}
 
-	clear_nlink(ea_inode);
-	iput(ea_inode);
+		/* Cheap rejections before reading the candidate's data. */
+		if (is_bad_inode(ea_inode) ||
+		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
+		    i_size_read(ea_inode) != value_len)
+			goto next;
 
-	return 0;
+		/* Allocated once and reused for every candidate. */
+		if (!ea_data)
+			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+
+		if (!ea_data) {
+			/* Drop the cache entry reference held by the walk. */
+			mb_cache_entry_put(ea_inode_cache, ce);
+			iput(ea_inode);
+			return NULL;
+		}
+
+		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
+		if (unlikely(err))
+			goto next;
+
+		if (!memcmp(value, ea_data, value_len)) {
+			mb_cache_entry_touch(ea_inode_cache, ce);
+			mb_cache_entry_put(ea_inode_cache, ce);
+			kvfree(ea_data);
+			return ea_inode;
+		}
+	next:
+		iput(ea_inode);
+		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
+	}
+	kvfree(ea_data);
+	return NULL;
 }
 
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
-				unsigned long *ea_ino, const void *value,
-				size_t value_len)
+/*
+ * Find an EA inode that already holds @value, or create one. On success
+ * *@ret_inode is the EA inode, returned with an inode reference that the
+ * caller must iput(), and 0 is returned; otherwise a negative error.
+ */
+static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+					  const void *value, size_t value_len,
+					  struct inode **ret_inode)
 {
+	u32 hash = ext4_xattr_inode_hash(value, value_len);
 	struct inode *ea_inode;
 	int err;
 
+	/* Reuse an EA inode with identical contents if the cache has one. */
+	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
+	if (ea_inode) {
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			iput(ea_inode);
+			return err;
+		}
+
+		*ret_inode = ea_inode;
+		return 0;
+	}
+
 	/* Create an inode for the EA value */
-	ea_inode = ext4_xattr_inode_create(handle, inode);
+	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
 	if (IS_ERR(ea_inode))
 		return PTR_ERR(ea_inode);
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
-	if (err)
-		clear_nlink(ea_inode);
-	else
-		*ea_ino = ea_inode->i_ino;
+	if (err) {
+		/* Best effort: the dec_ref result is not checked here. */
+		ext4_xattr_inode_dec_ref(handle, ea_inode);
+		iput(ea_inode);
+		return err;
+	}
 
-	iput(ea_inode);
+	/* Make the new EA inode findable by later lookups of the same hash. */
+	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+			      ea_inode->i_ino, true /* reusable */);
 
-	return err;
+	*ret_inode = ea_inode;
+	return 0;
 }
 
 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
@@ -1033,11 +1366,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
+	struct ext4_xattr_entry *here = s->here;
 	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
 	int in_inode = i->in_inode;
-	int rc;
+	struct inode *old_ea_inode = NULL;
+	struct inode *new_ea_inode = NULL;
+	int ret;
 
-	/* Compute min_offs and last. */
+	/*
+	 * Optimization for the simple case when old and new values have the
+	 * same padded sizes. Not applicable if the existing value is stored in
+	 * an external inode.
+	 */
+	if (i->value && !s->not_found && !here->e_value_inum &&
+	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
+	    EXT4_XATTR_SIZE(i->value_len)) {
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+		size_t size = EXT4_XATTR_SIZE(i->value_len);
+
+		here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value == EXT4_ZERO_XATTR_VALUE) {
+			memset(val, 0, size);
+		} else {
+			memcpy(val, i->value, i->value_len);
+			/* Clear padding bytes. */
+			memset(val + i->value_len, 0, size - i->value_len);
+		}
+		return 0;
+	}
+
+	/* Find out min_offs and last to calculate the free space. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
 		if (!last->e_value_inum && last->e_value_size) {
@@ -1048,120 +1407,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	}
 	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
 	if (!s->not_found) {
-		if (!in_inode &&
-		    !s->here->e_value_inum && s->here->e_value_size) {
-			size_t size = le32_to_cpu(s->here->e_value_size);
+		if (!here->e_value_inum && here->e_value_size) {
+			size_t size = le32_to_cpu(here->e_value_size);
 			free += EXT4_XATTR_SIZE(size);
 		}
 		free += EXT4_XATTR_LEN(name_len);
 	}
 	if (i->value) {
-		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
 
-		if (in_inode)
-			value_len = 0;
+		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
 
-		if (free < EXT4_XATTR_LEN(name_len) + value_len)
-			return -ENOSPC;
+	/*
+	 * Getting access to old and new ea inodes is subject to failures.
+	 * Finish that work before doing any modifications to the xattr data.
+	 */
+	if (!s->not_found && here->e_value_inum) {
+		ret = ext4_xattr_inode_iget(inode,
+		 			    le32_to_cpu(here->e_value_inum),
+		 			    &old_ea_inode);
+		if (ret) {
+			old_ea_inode = NULL;
+			goto out;
+		}
 	}
+	if (i->value && in_inode) {
+		WARN_ON_ONCE(!i->value_len);
 
-	if (i->value && s->not_found) {
-		/* Insert the new name. */
-		size_t size = EXT4_XATTR_LEN(name_len);
-		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
-		memmove((void *)s->here + size, s->here, rest);
-		memset(s->here, 0, size);
-		s->here->e_name_index = i->name_index;
-		s->here->e_name_len = name_len;
-		memcpy(s->here->e_name, i->name, name_len);
-	} else {
-		if (!s->here->e_value_inum && s->here->e_value_size &&
-		    s->here->e_value_offs > 0) {
-			void *first_val = s->base + min_offs;
-			size_t offs = le16_to_cpu(s->here->e_value_offs);
-			void *val = s->base + offs;
-			size_t size = EXT4_XATTR_SIZE(
-				le32_to_cpu(s->here->e_value_size));
-
-			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
-				/* The old and the new value have the same
-				   size. Just replace. */
-				s->here->e_value_size =
-					cpu_to_le32(i->value_len);
-				if (i->value == EXT4_ZERO_XATTR_VALUE) {
-					memset(val, 0, size);
-				} else {
-					/* Clear pad bytes first. */
-					memset(val + size - EXT4_XATTR_PAD, 0,
-					       EXT4_XATTR_PAD);
-					memcpy(val, i->value, i->value_len);
-				}
-				return 0;
-			}
+		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
+		if (ret)
+			goto out;
 
-			/* Remove the old value. */
-			memmove(first_val + size, first_val, val - first_val);
-			memset(first_val, 0, size);
-			s->here->e_value_size = 0;
-			s->here->e_value_offs = 0;
-			min_offs += size;
-
-			/* Adjust all value offsets. */
-			last = s->first;
-			while (!IS_LAST_ENTRY(last)) {
-				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_inum &&
-				    last->e_value_size && o < offs)
-					last->e_value_offs =
-						cpu_to_le16(o + size);
-				last = EXT4_XATTR_NEXT(last);
-			}
+		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
+						     i->value_len,
+						     &new_ea_inode);
+		if (ret) {
+			new_ea_inode = NULL;
+			ext4_xattr_inode_free_quota(inode, i->value_len);
+			goto out;
 		}
-		if (s->here->e_value_inum) {
-			ext4_xattr_inode_unlink(inode,
-					    le32_to_cpu(s->here->e_value_inum));
-			s->here->e_value_inum = 0;
+	}
+
+	if (old_ea_inode) {
+		/* We are ready to release ref count on the old_ea_inode. */
+		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
+		if (ret) {
+			/* Release newly required ref count on new_ea_inode. */
+			if (new_ea_inode) {
+				int err;
+
+				err = ext4_xattr_inode_dec_ref(handle,
+							       new_ea_inode);
+				if (err)
+					ext4_warning_inode(new_ea_inode,
+						  "dec ref new_ea_inode err=%d",
+						  err);
+				ext4_xattr_inode_free_quota(inode,
+							    i->value_len);
+			}
+			goto out;
 		}
-		if (!i->value) {
-			/* Remove the old name. */
-			size_t size = EXT4_XATTR_LEN(name_len);
-			last = ENTRY((void *)last - size);
-			memmove(s->here, (void *)s->here + size,
-				(void *)last - (void *)s->here + sizeof(__u32));
-			memset(last, 0, size);
+
+		ext4_xattr_inode_free_quota(inode,
+					    le32_to_cpu(here->e_value_size));
+	}
+
+	/* No failures allowed past this point. */
+
+	if (!s->not_found && here->e_value_offs) {
+		/* Remove the old value. */
+		void *first_val = s->base + min_offs;
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+		size_t size = EXT4_XATTR_SIZE(
+			le32_to_cpu(here->e_value_size));
+
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		last = s->first;
+		while (!IS_LAST_ENTRY(last)) {
+			size_t o = le16_to_cpu(last->e_value_offs);
+			if (!last->e_value_inum &&
+			    last->e_value_size && o < offs)
+				last->e_value_offs =
+					cpu_to_le16(o + size);
+			last = EXT4_XATTR_NEXT(last);
 		}
 	}
 
+	if (!s->not_found && !i->value) {
+		/* Remove old name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		last = ENTRY((void *)last - size);
+		memmove(here, (void *)here + size,
+			(void *)last - (void *)here + sizeof(__u32));
+		memset(last, 0, size);
+	} else if (s->not_found && i->value) {
+		/* Insert new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)here + sizeof(__u32);
+		memmove((void *)here + size, here, rest);
+		memset(here, 0, size);
+		here->e_name_index = i->name_index;
+		here->e_name_len = name_len;
+		memcpy(here->e_name, i->name, name_len);
+	} else {
+		WARN_ON_ONCE(s->not_found || !i->value);
+		/* This is an update, reset value info. */
+		here->e_value_inum = 0;
+		here->e_value_offs = 0;
+		here->e_value_size = 0;
+	}
+
 	if (i->value) {
-		/* Insert the new value. */
+		/* Insert new value. */
 		if (in_inode) {
-			unsigned long ea_ino =
-				le32_to_cpu(s->here->e_value_inum);
-			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
-						  i->value, i->value_len);
-			if (rc)
-				goto out;
-			s->here->e_value_inum = cpu_to_le32(ea_ino);
-			s->here->e_value_offs = 0;
+			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
 		} else if (i->value_len) {
 			size_t size = EXT4_XATTR_SIZE(i->value_len);
 			void *val = s->base + min_offs - size;
-			s->here->e_value_offs = cpu_to_le16(min_offs - size);
-			s->here->e_value_inum = 0;
+			here->e_value_offs = cpu_to_le16(min_offs - size);
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
 				memset(val, 0, size);
 			} else {
-				/* Clear the pad bytes first. */
-				memset(val + size - EXT4_XATTR_PAD, 0,
-				       EXT4_XATTR_PAD);
 				memcpy(val, i->value, i->value_len);
+				/* Clear padding bytes. */
+				memset(val + i->value_len, 0,
+				       size - i->value_len);
 			}
 		}
-		s->here->e_value_size = cpu_to_le32(i->value_len);
+		here->e_value_size = cpu_to_le32(i->value_len);
 	}
-
+	ret = 0;
 out:
-	return rc;
+	iput(old_ea_inode);
+	iput(new_ea_inode);
+	return ret;
 }
 
 struct ext4_xattr_block_find {
@@ -1223,6 +1611,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+	struct inode *ea_inode = NULL;
+	size_t old_ea_inode_size = 0;
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
@@ -1277,6 +1667,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			header(s->base)->h_refcount = cpu_to_le32(1);
 			s->here = ENTRY(s->base + offset);
 			s->end = s->base + bs->bh->b_size;
+
+			/*
+			 * If existing entry points to an xattr inode, we need
+			 * to prevent ext4_xattr_set_entry() from decrementing
+			 * ref count on it because the reference belongs to the
+			 * original block. In this case, make the entry look
+			 * like it has an empty value.
+			 */
+			if (!s->not_found && s->here->e_value_inum) {
+				/*
+				 * Defer quota free call for previous inode
+				 * until success is guaranteed.
+				 */
+				old_ea_inode_size = le32_to_cpu(
+							s->here->e_value_size);
+				s->here->e_value_inum = 0;
+				s->here->e_value_size = 0;
+			}
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
@@ -1298,6 +1706,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		goto bad_block;
 	if (error)
 		goto cleanup;
+
+	if (i->value && s->here->e_value_inum) {
+		unsigned int ea_ino;
+
+		/*
+		 * A ref count on ea_inode has been taken as part of the call to
+		 * ext4_xattr_set_entry() above. We would like to drop this
+		 * extra ref but we have to wait until the xattr block is
+		 * initialized and has its own ref count on the ea_inode.
+		 */
+		ea_ino = le32_to_cpu(s->here->e_value_inum);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error) {
+			ea_inode = NULL;
+			goto cleanup;
+		}
+	}
+
 	if (!IS_LAST_ENTRY(s->first))
 		ext4_xattr_rehash(header(s->base), s->here);
 
@@ -1408,6 +1834,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						 EXT4_FREE_BLOCKS_METADATA);
 				goto cleanup;
 			}
+			error = ext4_xattr_inode_inc_ref_all(handle, inode,
+						      ENTRY(header(s->base)+1));
+			if (error)
+				goto getblk_failed;
+			if (ea_inode) {
+				/* Drop the extra ref on ea_inode. */
+				error = ext4_xattr_inode_dec_ref(handle,
+								 ea_inode);
+				if (error)
+					ext4_warning_inode(ea_inode,
+							   "dec ref error=%d",
+							   error);
+				iput(ea_inode);
+				ea_inode = NULL;
+			}
+
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, new_bh);
 			if (error) {
@@ -1427,15 +1869,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if (old_ea_inode_size)
+		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+
 	/* Update the inode. */
 	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 
 	/* Drop the previous xattr block. */
-	if (bs->bh && bs->bh != new_bh)
-		ext4_xattr_release_block(handle, inode, bs->bh);
+	if (bs->bh && bs->bh != new_bh) {
+		struct ext4_xattr_inode_array *ea_inode_array = NULL;
+		ext4_xattr_release_block(handle, inode, bs->bh,
+					 &ea_inode_array,
+					 0 /* extra_credits */);
+		ext4_xattr_inode_array_free(ea_inode_array);
+	}
 	error = 0;
 
 cleanup:
+	if (ea_inode) {
+		int error2;
+		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (error2)
+			ext4_warning_inode(ea_inode, "dec ref error=%d",
+					   error2);
+
+		/* If there was an error, revert the quota charge. */
+		if (error)
+			ext4_xattr_inode_free_quota(inode,
+						    i_size_read(ea_inode));
+		iput(ea_inode);
+	}
 	if (ce)
 		mb_cache_entry_put(ext4_mb_cache, ce);
 	brelse(new_bh);
@@ -1546,6 +2009,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+/*
+ * Value of the EXT4_XATTR_SYSTEM_EA_INFO attribute that
+ * ext4_xattr_inode_init() stores in an EA inode. All fields are
+ * little-endian.
+ */
+struct ext4_xattr_ea_info {
+	__le64 ref_count;	/* number of xattr entry references */
+	__le32 hash;		/* crc32c hash of xattr data */
+	__le32 reserved;	/* reserved, must be 0 */
+};
+
+/*
+ * Store the initial EXT4_XATTR_SYSTEM_EA_INFO attribute in the body of
+ * @ea_inode: a reference count of 1 and @hash.
+ *
+ * Returns 0 on success or a negative error from locating the inode on disk,
+ * searching its in-inode xattrs, or setting the attribute.
+ */
+static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
+				 u32 hash)
+{
+	struct ext4_xattr_ea_info ea_info = {
+		.ref_count = cpu_to_le64(1),
+		.hash = cpu_to_le32(hash),
+		.reserved = 0,
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+		.value = &ea_info,
+		.value_len = sizeof(ea_info),
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (!err)
+		err = ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
+
+	/* Drop the buffer reference taken by ext4_get_inode_loc(). */
+	brelse(is.iloc.bh);
+	return err;
+}
+
+/*
+ * Apply @ref_change to the reference count kept in @ea_inode's
+ * EXT4_XATTR_SYSTEM_EA_INFO attribute and write the attribute back.
+ *
+ * @ref_return receives the updated count; @hash, if non-NULL, receives the
+ * stored hash. Returns 0 on success, -EFSCORRUPTED if the attribute is
+ * missing or has an unexpected size, or another negative error.
+ */
+static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
+				     u64 *ref_return, u32 *hash)
+{
+	struct ext4_xattr_ea_info ea_info;
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+		.value = &ea_info,
+		.value_len = sizeof(ea_info),
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		goto out;
+
+	if (WARN_ON(is.s.not_found) ||
+	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info))) {
+		err = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* Start from the on-disk value so unchanged fields are preserved. */
+	memcpy(&ea_info,
+	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
+	       sizeof(ea_info));
+
+	if (hash)
+		*hash = le32_to_cpu(ea_info.hash);
+
+	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
+	ea_info.ref_count = cpu_to_le64(*ref_return);
+
+	err = ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
+out:
+	/* Drop the buffer reference taken by ext4_get_inode_loc(). */
+	brelse(is.iloc.bh);
+	return err;
+}
+
+/*
+ * Read the hash stored in @ea_inode's EXT4_XATTR_SYSTEM_EA_INFO attribute
+ * into *@hash.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the attribute is missing, has an
+ * unexpected size or a non-zero reserved field, or another negative error.
+ * *@hash is only written on success.
+ */
+static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
+{
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_ea_info *ea_info;
+	void *ptr;
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		goto out;
+
+	if (WARN_ON(is.s.not_found) ||
+	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info))) {
+		err = -EFSCORRUPTED;
+		goto out;
+	}
+
+	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
+	ea_info = (struct ext4_xattr_ea_info *)ptr;
+
+	if (WARN_ON(ea_info->reserved != 0)) {
+		err = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* Copy the hash out before the buffer reference is dropped below. */
+	*hash = le32_to_cpu(ea_info->hash);
+out:
+	/* Drop the buffer reference taken by ext4_get_inode_loc(). */
+	brelse(is.iloc.bh);
+	return err;
+}
+
 static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 				 struct ext4_xattr_info *i)
 {
@@ -1560,6 +2134,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 	return !memcmp(value, i->value, i->value_len);
 }
 
+/*
+ * Read and validate the external xattr block of @inode.
+ *
+ * Returns NULL when the inode has no xattr block (i_file_acl is 0), an
+ * ERR_PTR() when the block cannot be read (-EIO) or fails
+ * ext4_xattr_check_block(), and otherwise the buffer head, which the caller
+ * must release with brelse().
+ */
+struct buffer_head *ext4_xattr_get_block(struct inode *inode)
+{
+	struct buffer_head *bh;
+	int error;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return NULL;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		return ERR_PTR(-EIO);
+	error = ext4_xattr_check_block(inode, bh);
+	if (error) {
+		/* Callers only see the ERR_PTR, so drop the reference here. */
+		brelse(bh);
+		return ERR_PTR(error);
+	}
+	return bh;
+}
+
 /*
  * ext4_xattr_set_handle()
  *
@@ -1602,9 +2192,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	/* Check journal credits under write lock. */
 	if (ext4_handle_valid(handle)) {
+		struct buffer_head *bh;
 		int credits;
 
-		credits = ext4_xattr_set_credits(inode, value_len);
+		bh = ext4_xattr_get_block(inode);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			goto cleanup;
+		}
+
+		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
 			error = -ENOSPC;
 			goto cleanup;
@@ -1640,6 +2239,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
+
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1708,34 +2308,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
-int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+/*
+ * Compute into *@credits the journal credits for setting an xattr value of
+ * @value_len bytes on @inode. *@credits is 0 when the filesystem has no
+ * journal. Returns 0, or the error from ext4_xattr_get_block().
+ */
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
 {
-	struct super_block *sb = inode->i_sb;
-	int credits;
-
-	if (!EXT4_SB(sb)->s_journal)
-		return 0;
-
-	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+	struct buffer_head *bh;
+	int err;
 
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-	        credits += ext4_writepage_trans_blocks(inode) + 1;
+	*credits = 0;
 
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
+	if (!EXT4_SB(inode->i_sb)->s_journal)
+		return 0;
 
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+	/* The xattr block is read and inspected with xattr_sem held for reading. */
+	down_read(&EXT4_I(inode)->xattr_sem);
 
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	bh = ext4_xattr_get_block(inode);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+	} else {
+		/* bh is NULL here when the inode has no xattr block. */
+		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+		err = 0;
 	}
-	return credits;
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return err;
 }
 
 /*
@@ -1760,7 +2355,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 		return error;
 
 retry:
-	credits = ext4_xattr_set_credits(inode, value_len);
+	error = ext4_xattr_set_credits(inode, value_len, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -2066,10 +2664,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-
 #define EIA_INCR 16 /* must be 2^n */
 #define EIA_MASK (EIA_INCR - 1)
-/* Add the large xattr @inode into @ea_inode_array for later deletion.
+
+/* Add the large xattr @inode into @ea_inode_array for deferred iput().
  * If @ea_inode_array is new or full it will be grown and the old
  * contents copied over.
  */
@@ -2114,21 +2712,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * ext4_xattr_delete_inode()
  *
  * Free extended attribute resources associated with this inode. Traverse
- * all entries and unlink any xattr inodes associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode. If an orphan inode is deleted it will also delete any
- * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
- * to ensure they belong to the parent inode and were not deleted already.
+ * all entries and decrement reference on any xattr inodes associated with this
+ * inode. This is called immediately before an inode is freed. We have exclusive
+ * access to the inode. If an orphan inode is deleted it will also release its
+ * references on xattr block and xattr inodes.
  */
-int
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array,
-			int extra_credits)
+int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc = { .bh = NULL };
+	struct ext4_xattr_entry *entry;
 	int error;
 
 	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2140,66 +2736,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
-		goto delete_external_ea;
-
-	error = ext4_get_inode_loc(inode, &iloc);
-	if (error)
-		goto cleanup;
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 
-	error = ext4_journal_get_write_access(handle, iloc.bh);
-	if (error)
-		goto cleanup;
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
+			goto cleanup;
+		}
 
-	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
-				    false /* block_csum */, ea_inode_array,
-				    extra_credits);
+		error = ext4_journal_get_write_access(handle, iloc.bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "write access (error %d)",
+					 error);
+			goto cleanup;
+		}
 
-delete_external_ea:
-	if (!EXT4_I(inode)->i_file_acl) {
-		error = 0;
-		goto cleanup;
-	}
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh) {
-		EXT4_ERROR_INODE(inode, "block %llu read error",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		EXT4_ERROR_INODE(inode, "bad block %llu",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EFSCORRUPTED;
-		goto cleanup;
+		header = IHDR(inode, ext4_raw_inode(&iloc));
+		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
+						     IFIRST(header),
+						     false /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     false /* skip_quota */);
 	}
 
-	if (ext4_has_feature_ea_inode(inode->i_sb)) {
-		error = ext4_journal_get_write_access(handle, bh);
-		if (error) {
-			EXT4_ERROR_INODE(inode, "write access %llu",
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %llu read error",
 					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		error = ext4_xattr_check_block(inode, bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
+					 EXT4_I(inode)->i_file_acl, error);
 			goto cleanup;
 		}
-		ext4_xattr_inode_remove_all(handle, inode, bh,
-					    BFIRST(bh),
-					    true /* block_csum */,
-					    ea_inode_array,
-					    extra_credits);
-	}
 
-	ext4_xattr_release_block(handle, inode, bh);
-	/* Update i_file_acl within the same transaction that releases block. */
-	EXT4_I(inode)->i_file_acl = 0;
-	error = ext4_mark_inode_dirty(handle, inode);
-	if (error) {
-		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
-				 error);
-		goto cleanup;
+		if (ext4_has_feature_ea_inode(inode->i_sb)) {
+			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+			     entry = EXT4_XATTR_NEXT(entry))
+				if (entry->e_value_inum)
+					ext4_xattr_inode_free_quota(inode,
+					      le32_to_cpu(entry->e_value_size));
+
+		}
+
+		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+					 extra_credits);
+		/*
+		 * Update i_file_acl value in the same transaction that releases
+		 * block.
+		 */
+		EXT4_I(inode)->i_file_acl = 0;
+		error = ext4_mark_inode_dirty(handle, inode);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+					 error);
+			goto cleanup;
+		}
 	}
+	error = 0;
 cleanup:
 	brelse(iloc.bh);
 	brelse(bh);
@@ -2208,17 +2809,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 {
-	struct inode	*ea_inode;
-	int		idx = 0;
+	int idx;
 
 	if (ea_inode_array == NULL)
 		return;
 
-	for (; idx < ea_inode_array->count; ++idx) {
-		ea_inode = ea_inode_array->inodes[idx];
-		clear_nlink(ea_inode);
-		iput(ea_inode);
-	}
+	for (idx = 0; idx < ea_inode_array->count; ++idx)
+		iput(ea_inode_array->inodes[idx]);
 	kfree(ea_inode_array);
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b2005a2716d9..67616cb9a059 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -70,19 +70,6 @@ struct ext4_xattr_entry {
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
 /*
- * Link EA inode back to parent one using i_mtime field.
- * Extra integer type conversion added to ignore higher
- * bits in i_mtime.tv_sec which might be set by ext4_get()
- */
-#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
-do {                                                  \
-      (inode)->i_mtime.tv_sec = inum;                 \
-} while(0)
-
-#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
-((__u32)(inode)->i_mtime.tv_sec)
-
-/*
  * The minimum size of EA value when you start storing it in an external inode
  * size of block - size of header - size of 1 entry - 4 null bytes
 */
@@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+				  int *credits);
 
-extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 77a5b99d8f92..7dfdca822ccb 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -13,10 +13,11 @@
  * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
- * They use hash of a block contents as a key and block number as a value.
- * That's why keys need not be unique (different xattr blocks may end up having
- * the same hash). However block number always uniquely identifies a cache
- * entry.
+ * Ext4 also uses it for deduplication of xattr values stored in inodes.
+ * They use hash of data as a key and provide a value that may represent a
+ * block or inode number. That's why keys need not be unique (hash of different
+ * data may be the same). However user provided value always uniquely
+ * identifies a cache entry.
  *
  * We provide functions for creation and removal of entries, search by key,
  * and a special "delete entry with given key-value pair" operation. Fixed
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
                   ` (25 preceding siblings ...)
  2017-05-31  8:15 ` [PATCH 27/28] ext4: xattr inode deduplication Tahsin Erdogan
@ 2017-05-31  8:15 ` Tahsin Erdogan
  2017-06-15  7:57     ` [Ocfs2-devel] " Jan Kara
  2017-05-31 16:42   ` Darrick J. Wong
  27 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31  8:15 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Ext4 ea_inode feature allows storing xattr values in external inodes to
be able to store values that are bigger than a block in size. Ext4 also
has deduplication support for this type of inode. With deduplication,
the actual storage waste is eliminated, but the users of such inodes are
still charged full quota for the inodes as if there was no sharing
happening in the background.

This design requires ext4 to manually charge the users because the
inodes are shared.

An implication of this is that, if someone calls chown on a file that
has such references, we need to transfer the quota for the file and its
xattr inodes. The current dquot_transfer() function implicitly transfers
one inode charge. In our case, we would like to specify additional
inodes to be transferred.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext2/inode.c          |  2 +-
 fs/ext4/inode.c          |  8 ++++++-
 fs/ext4/ioctl.c          | 13 +++++++++++-
 fs/ext4/xattr.c          | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/xattr.h          |  2 ++
 fs/jfs/file.c            |  2 +-
 fs/ocfs2/file.c          |  2 +-
 fs/quota/dquot.c         | 16 +++++++-------
 fs/reiserfs/inode.c      |  2 +-
 include/linux/quotaops.h |  8 ++++---
 10 files changed, 93 insertions(+), 16 deletions(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 2dcbd5698884..a13ba5dcb355 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1656,7 +1656,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
 	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
-		error = dquot_transfer(inode, iattr);
+		error = dquot_transfer(inode, iattr, 0);
 		if (error)
 			return error;
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6f5872197d6c..28abbbdbbb80 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5267,6 +5267,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	int error, rc = 0;
 	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
+	int ea_inode_refs;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
@@ -5293,7 +5294,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = dquot_transfer(inode, attr);
+
+		down_read(&EXT4_I(inode)->xattr_sem);
+		error = ea_inode_refs = ext4_xattr_inode_count(inode);
+		if (ea_inode_refs >= 0)
+			error = dquot_transfer(inode, attr, ea_inode_refs);
+		up_read(&EXT4_I(inode)->xattr_sem);
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dde8deb11e59..9938dc8e24c8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -21,6 +21,7 @@
 #include "ext4.h"
 #include <linux/fsmap.h>
 #include "fsmap.h"
+#include "xattr.h"
 #include <trace/events/ext4.h>
 
 /**
@@ -319,6 +320,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 	struct ext4_iloc iloc;
 	struct ext4_inode *raw_inode;
 	struct dquot *transfer_to[MAXQUOTAS] = { };
+	int ea_inode_refs;
 
 	if (!ext4_has_feature_project(sb)) {
 		if (projid != EXT4_DEF_PROJID)
@@ -371,9 +373,17 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 	if (err)
 		goto out_stop;
 
+	down_read(&EXT4_I(inode)->xattr_sem);
+	ea_inode_refs = ext4_xattr_inode_count(inode);
+	if (ea_inode_refs < 0) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		err = ea_inode_refs;
+		goto out_stop;
+	}
+
 	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
 	if (!IS_ERR(transfer_to[PRJQUOTA])) {
-		err = __dquot_transfer(inode, transfer_to);
+		err = __dquot_transfer(inode, transfer_to, ea_inode_refs);
 		dqput(transfer_to[PRJQUOTA]);
 		if (err)
 			goto out_dirty;
@@ -382,6 +392,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 	EXT4_I(inode)->i_projid = kprojid;
 	inode->i_ctime = current_time(inode);
 out_dirty:
+	up_read(&EXT4_I(inode)->xattr_sem);
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
 		err = rc;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index caddc176a612..1d6fcbb01517 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -701,6 +701,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+int ext4_xattr_inode_count(struct inode *inode)
+{
+	struct ext4_iloc iloc = { .bh = NULL };
+	struct buffer_head *bh = NULL;
+	struct ext4_inode *raw_inode;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	int inode_count = 0;
+	void *end;
+	int ret;
+
+	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+		ret = ext4_get_inode_loc(inode, &iloc);
+		if (ret)
+			goto out;
+		raw_inode = ext4_raw_inode(&iloc);
+		header = IHDR(inode, raw_inode);
+		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+		ret = xattr_check_inode(inode, header, end);
+		if (ret)
+			goto out;
+
+		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
+		     entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				inode_count++;
+	}
+
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			ret = -EIO;
+			goto out;
+		}
+
+		if (ext4_xattr_check_block(inode, bh)) {
+			ret = -EFSCORRUPTED;
+			goto out;
+		}
+
+		for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+		     entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				inode_count++;
+	}
+	ret = inode_count;
+out:
+	brelse(iloc.bh);
+	brelse(bh);
+	return ret;
+}
+
 static inline size_t round_up_cluster(struct inode *inode, size_t length)
 {
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 67616cb9a059..8ef6fe123255 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
 #else
 static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
 #endif
+
+int ext4_xattr_inode_count(struct inode *inode);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 739492c7a3fd..b08e0b0449a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -114,7 +114,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
 	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
-		rc = dquot_transfer(inode, iattr);
+		rc = dquot_transfer(inode, iattr, 0);
 		if (rc)
 			return rc;
 	}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bfeb647459d9..d3cbf6467af6 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1259,7 +1259,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 			mlog_errno(status);
 			goto bail_unlock;
 		}
-		status = __dquot_transfer(inode, transfer_to);
+		status = __dquot_transfer(inode, transfer_to, 0);
 		if (status < 0)
 			goto bail_commit;
 	} else {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 48813aeaab80..16e13d554aaa 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1906,10 +1906,12 @@ EXPORT_SYMBOL(dquot_free_inode);
  * We are holding reference on transfer_from & transfer_to, no need to
  * protect them by srcu_read_lock().
  */
-int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
+int __dquot_transfer(struct inode *inode, struct dquot **transfer_to,
+		     int inodes_extra)
 {
 	qsize_t space, cur_space;
 	qsize_t rsv_space = 0;
+	qsize_t inode_count = 1 + inodes_extra;
 	struct dquot *transfer_from[MAXQUOTAS] = {};
 	int cnt, ret = 0;
 	char is_valid[MAXQUOTAS] = {};
@@ -1946,7 +1948,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			continue;
 		is_valid[cnt] = 1;
 		transfer_from[cnt] = i_dquot(inode)[cnt];
-		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
+		ret = check_idq(transfer_to[cnt], inode_count, &warn_to[cnt]);
 		if (ret)
 			goto over_quota;
 		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
@@ -1963,7 +1965,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
 			int wtype;
-			wtype = info_idq_free(transfer_from[cnt], 1);
+			wtype = info_idq_free(transfer_from[cnt], inode_count);
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_inodes[cnt],
 						transfer_from[cnt], wtype);
@@ -1971,13 +1973,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_space[cnt],
 						transfer_from[cnt], wtype);
-			dquot_decr_inodes(transfer_from[cnt], 1);
+			dquot_decr_inodes(transfer_from[cnt], inode_count);
 			dquot_decr_space(transfer_from[cnt], cur_space);
 			dquot_free_reserved_space(transfer_from[cnt],
 						  rsv_space);
 		}
 
-		dquot_incr_inodes(transfer_to[cnt], 1);
+		dquot_incr_inodes(transfer_to[cnt], inode_count);
 		dquot_incr_space(transfer_to[cnt], cur_space);
 		dquot_resv_space(transfer_to[cnt], rsv_space);
 
@@ -2005,7 +2007,7 @@ EXPORT_SYMBOL(__dquot_transfer);
 /* Wrapper for transferring ownership of an inode for uid/gid only
  * Called from FSXXX_setattr()
  */
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+int dquot_transfer(struct inode *inode, struct iattr *iattr, int inodes_extra)
 {
 	struct dquot *transfer_to[MAXQUOTAS] = {};
 	struct dquot *dquot;
@@ -2037,7 +2039,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		}
 		transfer_to[GRPQUOTA] = dquot;
 	}
-	ret = __dquot_transfer(inode, transfer_to);
+	ret = __dquot_transfer(inode, transfer_to, inodes_extra);
 out_put:
 	dqput_all(transfer_to);
 	return ret;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 873fc04e9403..51586051b5dd 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3370,7 +3370,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 		reiserfs_write_unlock(inode->i_sb);
 		if (error)
 			goto out;
-		error = dquot_transfer(inode, attr);
+		error = dquot_transfer(inode, attr, 0);
 		reiserfs_write_lock(inode->i_sb);
 		if (error) {
 			journal_end(&th);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index dda22f45fc1b..b7bcd9c6db6c 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -106,8 +106,9 @@ int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
 int dquot_set_dqblk(struct super_block *sb, struct kqid id,
 		struct qc_dqblk *di);
 
-int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
-int dquot_transfer(struct inode *inode, struct iattr *iattr);
+int __dquot_transfer(struct inode *inode, struct dquot **transfer_to,
+		int inodes_extra);
+int dquot_transfer(struct inode *inode, struct iattr *iattr, int inodes_extra);
 
 static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
 {
@@ -226,7 +227,8 @@ static inline void dquot_free_inode(struct inode *inode)
 {
 }
 
-static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
+static inline int dquot_transfer(struct inode *inode, struct iattr *iattr,
+		int inodes_extra)
 {
 	return 0;
 }
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH 27/28] ext4: xattr inode deduplication
  2017-05-31  8:15 ` [PATCH 27/28] ext4: xattr inode deduplication Tahsin Erdogan
@ 2017-05-31 15:40     ` kbuild test robot
  2017-05-31 15:50     ` [Ocfs2-devel] " kbuild test robot
  2017-05-31 16:00     ` Darrick J. Wong
  2 siblings, 0 replies; 100+ messages in thread
From: kbuild test robot @ 2017-05-31 15:40 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: kbuild-all, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel, Tahsin Erdogan

[-- Attachment #1: Type: text/plain, Size: 1008 bytes --]

Hi Tahsin,

[auto build test ERROR on ext4/dev]
[also build test ERROR on next-20170531]
[cannot apply to v4.12-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/ext4-xattr-in-inode-support/20170531-214310
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: powerpc-pcm030_defconfig (attached as .config)
compiler: powerpc-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

>> ERROR: "crc32c" [fs/ext4/ext4.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 14696 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 27/28] ext4: xattr inode deduplication
@ 2017-05-31 15:40     ` kbuild test robot
  0 siblings, 0 replies; 100+ messages in thread
From: kbuild test robot @ 2017-05-31 15:40 UTC (permalink / raw)
  To: ocfs2-devel

Hi Tahsin,

[auto build test ERROR on ext4/dev]
[also build test ERROR on next-20170531]
[cannot apply to v4.12-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/ext4-xattr-in-inode-support/20170531-214310
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: powerpc-pcm030_defconfig (attached as .config)
compiler: powerpc-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

>> ERROR: "crc32c" [fs/ext4/ext4.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
-------------- next part --------------
A non-text attachment was scrubbed...
Name: .config.gz
Type: application/gzip
Size: 14696 bytes
Desc: not available
Url : http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20170531/94b6c3ba/attachment.bin 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 27/28] ext4: xattr inode deduplication
  2017-05-31  8:15 ` [PATCH 27/28] ext4: xattr inode deduplication Tahsin Erdogan
@ 2017-05-31 15:50     ` kbuild test robot
  2017-05-31 15:50     ` [Ocfs2-devel] " kbuild test robot
  2017-05-31 16:00     ` Darrick J. Wong
  2 siblings, 0 replies; 100+ messages in thread
From: kbuild test robot @ 2017-05-31 15:50 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: kbuild-all, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel, Tahsin Erdogan

[-- Attachment #1: Type: text/plain, Size: 1013 bytes --]

Hi Tahsin,

[auto build test ERROR on ext4/dev]
[also build test ERROR on next-20170531]
[cannot apply to v4.12-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/ext4-xattr-in-inode-support/20170531-214310
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: i386-defconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   fs/built-in.o: In function `ext4_xattr_inode_get':
>> xattr.c:(.text+0xb8818): undefined reference to `crc32c'
   fs/built-in.o: In function `ext4_xattr_set_entry':
   xattr.c:(.text+0xb9614): undefined reference to `crc32c'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26190 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 27/28] ext4: xattr inode deduplication
@ 2017-05-31 15:50     ` kbuild test robot
  0 siblings, 0 replies; 100+ messages in thread
From: kbuild test robot @ 2017-05-31 15:50 UTC (permalink / raw)
  To: ocfs2-devel

Hi Tahsin,

[auto build test ERROR on ext4/dev]
[also build test ERROR on next-20170531]
[cannot apply to v4.12-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/ext4-xattr-in-inode-support/20170531-214310
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: i386-defconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   fs/built-in.o: In function `ext4_xattr_inode_get':
>> xattr.c:(.text+0xb8818): undefined reference to `crc32c'
   fs/built-in.o: In function `ext4_xattr_set_entry':
   xattr.c:(.text+0xb9614): undefined reference to `crc32c'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
-------------- next part --------------
A non-text attachment was scrubbed...
Name: .config.gz
Type: application/gzip
Size: 26190 bytes
Desc: not available
Url : http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20170531/4cc8d48d/attachment-0001.bin 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 27/28] ext4: xattr inode deduplication
  2017-05-31  8:15 ` [PATCH 27/28] ext4: xattr inode deduplication Tahsin Erdogan
  2017-05-31 15:40     ` [Ocfs2-devel] " kbuild test robot
@ 2017-05-31 16:00     ` Darrick J. Wong
  2017-05-31 16:00     ` Darrick J. Wong
  2 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:00 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed, May 31, 2017 at 01:15:16AM -0700, Tahsin Erdogan wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written, the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a
> best-effort sharing scheme. When an xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |    7 +-
>  fs/ext4/inode.c |    9 +-
>  fs/ext4/super.c |   22 +-
>  fs/ext4/xattr.c | 1073 +++++++++++++++++++++++++++++++++++++++++++------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 881 insertions(+), 261 deletions(-)
> 
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  	if (error)
>  		return error;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, acl_size);
> +	error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..79f06290e723 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>  	long s_es_nr_inode;
>  	struct ext4_es_stats s_es_stats;
>  	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;
>  	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>  
>  	/* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>  	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>  
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +	return IS_NOQUOTA(inode) &&
> +	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>  
>  /*
>   * This structure is stuffed into the struct file's private_data field
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 4d6936f0d8a4..6f5872197d6c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	}
>  	brelse(iloc.bh);
>  	ext4_set_inode_flags(inode);
> -	if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +	if (ei->i_flags & EXT4_EA_INODE_FL) {
>  		ext4_xattr_inode_set_class(inode);
> +
> +		inode_lock(inode);
> +		inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(inode);
> +	}
> +
>  	unlock_new_inode(inode);
>  	return inode;
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..7d2b692d52ea 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>  		invalidate_bdev(sbi->journal_bdev);
>  		ext4_blkdev_remove(sbi);
>  	}
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>  	if (res)
>  		return res;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, len);
> +	res = ext4_xattr_set_credits(inode, len, &credits);
> +	if (res)
> +		return res;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> @@ -4067,6 +4074,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount_wq;
>  	}
>  
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}
> +
>  	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>  	    (blocksize != PAGE_SIZE)) {
>  		ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4312,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (EXT4_SB(sb)->rsv_conversion_wq)
>  		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 6acce1f689ab..caddc176a612 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -55,6 +55,7 @@
>  #include <linux/slab.h>
>  #include <linux/mbcache.h>
>  #include <linux/quotaops.h>
> +#include <linux/crc32c.h>
>  #include "ext4_jbd2.h"
>  #include "ext4.h"
>  #include "xattr.h"
> @@ -79,6 +80,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
>  			    struct mb_cache_entry **);
>  static void ext4_xattr_rehash(struct ext4_xattr_header *,
>  			      struct ext4_xattr_entry *);
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
>  
>  static const struct xattr_handler * const ext4_xattr_handler_map[] = {
>  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
> @@ -105,13 +107,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  	NULL
>  };
>  
> +#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
> +
>  #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
>  				inode->i_sb->s_fs_info)->s_mb_cache)
>  
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>  			struct inode *inode);
>  
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash);
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -329,14 +341,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  		goto error;
>  	}
>  
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}
> -
>  	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>  		ext4_error(parent->i_sb, "EA inode %lu does not have "
>  			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> @@ -351,6 +355,11 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>  
> +static u32 ext4_xattr_inode_hash(const void *buffer, size_t size)
> +{
> +	return crc32c(0, buffer, size);

The metadata checksumming code uses crypto_alloc_shash to dynamically
bind to the crc32c implementation only when ext4 actually needs it.
This patch introduces a static module dependency on libcrc32c even though we
only need it if we're deduplicating xattrs, which is done only when
EXT4_FEATURE_INCOMPAT_EA_INODE is enabled, correct?

Or, looking through the code, maybe not; are we now capable of deduping
for any filesystem?

Anyway, if this dedupe feature is hidden behind INCOMPAT_EA_INODE then
this crc32c call binding should be done dynamically; however, if it
works for any filesystem and is therefore on all the time, the existing
users ought to be converted to use libcrc32c.

--D

> +}
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +367,52 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>  		     size_t size)
>  {
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>  	struct inode *ea_inode;
> -	int ret;
> +	u32 hash, calc_hash;
> +	struct mb_cache_entry *ce;
> +	int err;
>  
> -	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (ret)
> -		return ret;
> +	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +	if (err) {
> +		ea_inode = NULL;
> +		goto out;
> +	}
>  
> -	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -	iput(ea_inode);
> +	if (i_size_read(ea_inode) != size) {
> +		ext4_warning_inode(ea_inode,
> +				   "ea_inode file size=%llu entry size=%zu",
> +				   i_size_read(ea_inode), size);
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
>  
> -	return ret;
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	if (!err) {
> +		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
> +			goto out;
> +
> +		/* Avoid hash calculation if already cached. */
> +		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
> +		if (ce) {
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			goto out;
> +		}
> +
> +		calc_hash = ext4_xattr_inode_hash(buffer, size);
> +		if (hash != calc_hash) {
> +			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
> +					   "does not match calc_hash=%#x",
> +					   hash, calc_hash);
> +			goto out;
> +		}
> +
> +		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +				      ea_inode->i_ino, true /* reusable */);
> +	}
> +out:
> +	iput(ea_inode);
> +	return err;
>  }
>  
>  static int
> @@ -657,6 +701,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +				    inode->i_blkbits);
> +	size_t mask = ~(cluster_size - 1);
> +
> +	return (length + cluster_size - 1) & mask;
> +}
> +
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +	int err;
> +
> +	err = dquot_alloc_inode(inode);
> +	if (err)
> +		return err;
> +	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
> +	if (err)
> +		dquot_free_inode(inode);
> +	return err;
> +}
> +
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
> +	dquot_free_inode(inode);
> +}
> +
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	int credits;
> +	int blocks;
> +
> +	/*
> +	 * 1) Owner inode update
> +	 * 2) Ref count update on old xattr block
> +	 * 3) new xattr block
> +	 * 4) block bitmap update for new xattr block
> +	 * 5) group descriptor for new xattr block
> +	 */
> +	credits = 5;
> +
> +	/* We are done if ea_inode feature is not enabled. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +
> +	/* Data blocks. */
> +	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +	/* Indirection block. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Blocks themselves. */
> +	credits += blocks;
> +
> +	/* Dereference ea_inode holding old xattr value.
> +	 * Old ea_inode, inode map, block bitmap, group descriptor.
> +	 */
> +	credits += 4;
> +
> +	/* Data blocks for old ea_inode. */
> +	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +	/* Indirection block for old ea_inode. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	/* We may need to clone the existing xattr block in which case we need
> +	 * to increment ref counts for existing ea_inodes referenced by it.
> +	 */
> +	if (block_bh) {
> +		struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				/* Ref count update on ea_inode. */
> +				credits += 1;
> +	}
> +	return credits;
> +}
> +
>  int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  			      int credits, struct buffer_head *bh,
>  			      bool dirty, bool block_csum)
> @@ -706,12 +845,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +				       int ref_change)
> +{
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +	struct ext4_iloc iloc;
> +	s64 ref_return;
> +	u32 hash;
> +	int ret;
> +
> +	inode_lock(ea_inode);
> +
> +	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +	if (ret) {
> +		iloc.bh = NULL;
> +		goto out;
> +	}
> +
> +	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
> +					&hash);
> +	if (ret)
> +		goto out;
> +
> +	if (ref_change > 0) {
> +		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 1) {
> +			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			set_nlink(ea_inode, 1);
> +			ext4_orphan_del(handle, ea_inode);
> +
> +			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +					      ea_inode->i_ino,
> +					      true /* reusable */);
> +		}
> +	} else {
> +		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 0) {
> +			WARN_ONCE(ea_inode->i_nlink != 1,
> +				  "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			clear_nlink(ea_inode);
> +			ext4_orphan_add(handle, ea_inode);
> +
> +			mb_cache_entry_delete(ea_inode_cache, hash,
> +					      ea_inode->i_ino);
> +		}
> +	}
> +
> +	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +	iloc.bh = NULL;
> +	if (ret)
> +		ext4_warning_inode(ea_inode,
> +				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +	brelse(iloc.bh);
> +	inode_unlock(ea_inode);
> +	return ret;
> +}
> +
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
> +}
> +
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
> +}
> +
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +					struct ext4_xattr_entry *first)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	struct ext4_xattr_entry *failed_entry;
> +	unsigned int ea_ino;
> +	int err, saved_err;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			goto cleanup;
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "inc ref error %d", err);
> +			iput(ea_inode);
> +			goto cleanup;
> +		}
> +		iput(ea_inode);
> +	}
> +	return 0;
> +
> +cleanup:
> +	saved_err = err;
> +	failed_entry = entry;
> +
> +	for (entry = first; entry != failed_entry;
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err) {
> +			ext4_warning(parent->i_sb,
> +				     "cleanup ea_ino %u iget error %d", ea_ino,
> +				     err);
> +			continue;
> +		}
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err)
> +			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +					   err);
> +		iput(ea_inode);
> +	}
> +	return saved_err;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -			    struct buffer_head *bh,
> -			    struct ext4_xattr_entry *first, bool block_csum,
> -			    struct ext4_xattr_inode_array **ea_inode_array,
> -			    int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +			     struct buffer_head *bh,
> +			     struct ext4_xattr_entry *first, bool block_csum,
> +			     struct ext4_xattr_inode_array **ea_inode_array,
> +			     int extra_credits, bool skip_quota)
>  {
>  	struct inode *ea_inode;
>  	struct ext4_xattr_entry *entry;
> @@ -748,10 +1014,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>  			continue;
>  		}
>  
> -		inode_lock(ea_inode);
> -		clear_nlink(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		if (!skip_quota)
> +			ext4_xattr_inode_free_quota(parent,
> +					      le32_to_cpu(entry->e_value_size));
>  
>  		/*
>  		 * Forget about ea_inode within the same transaction that decrements the ref
> @@ -784,7 +1056,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -			 struct buffer_head *bh)
> +			 struct buffer_head *bh,
> +			 struct ext4_xattr_inode_array **ea_inode_array,
> +			 int extra_credits)
>  {
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>  	u32 hash, ref;
> @@ -807,6 +1081,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
> +
> +		if (ext4_has_feature_ea_inode(inode->i_sb))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +						     BFIRST(bh),
> +						     true /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     true /* skip_quota */);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
>  				 EXT4_FREE_BLOCKS_METADATA |
>  				 EXT4_FREE_BLOCKS_FORGET);
> @@ -947,7 +1229,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -					     struct inode *inode)
> +					     struct inode *inode, u32 hash)
>  {
>  	struct inode *ea_inode = NULL;
>  	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -965,67 +1247,118 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>  		ea_inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(ea_inode);
>  		ext4_xattr_inode_set_class(ea_inode);
> -		ea_inode->i_generation = inode->i_generation;
> -		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -		/*
> -		 * A back-pointer from EA inode to parent inode will be useful
> -		 * for e2fsck.
> -		 */
> -		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>  		unlock_new_inode(ea_inode);
>  		err = ext4_inode_attach_jinode(ea_inode);
> +		if (!err)
> +			err = ext4_xattr_inode_init(handle, ea_inode, hash);
>  		if (err) {
>  			iput(ea_inode);
>  			return ERR_PTR(err);
>  		}
> +
> +		/*
> +		 * Xattr inodes are shared therefore quota charging is performed
> +		 * at a higher level.
> +		 */
> +		dquot_free_inode(ea_inode);
> +		dquot_drop(ea_inode);
> +		inode_lock(ea_inode);
> +		ea_inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(ea_inode);
>  	}
>  
>  	return ea_inode;
>  }
>  
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
>  {
> -	struct inode *ea_inode = NULL;
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
>  	int err;
>  
> -	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (err)
> -		return err;
> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			ea_inode = NULL;
> +			goto next;
> +		}
>  
> -	clear_nlink(ea_inode);
> -	iput(ea_inode);
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
>  
> -	return 0;
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +	next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
>  }
>  
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
>  {
> +	u32 hash = ext4_xattr_inode_hash(value, value_len);
>  	struct inode *ea_inode;
>  	int err;
>  
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
>  	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>  	if (IS_ERR(ea_inode))
>  		return PTR_ERR(ea_inode);
>  
>  	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
>  
> -	iput(ea_inode);
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);
>  
> -	return err;
> +	*ret_inode = ea_inode;
> +	return 0;
>  }
>  
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1033,11 +1366,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
> +	struct ext4_xattr_entry *here = s->here;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>  	int in_inode = i->in_inode;
> -	int rc;
> +	struct inode *old_ea_inode = NULL;
> +	struct inode *new_ea_inode = NULL;
> +	int ret;
>  
> -	/* Compute min_offs and last. */
> +	/*
> +	 * Optimization for the simple case when old and new values have the
> +	 * same padded sizes. Not applicable if the existing value is stored in
> +	 * an external inode.
> +	 */
> +	if (i->value && !s->not_found && !here->e_value_inum &&
> +	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
> +	    EXT4_XATTR_SIZE(i->value_len)) {
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(i->value_len);
> +
> +		here->e_value_size = cpu_to_le32(i->value_len);
> +		if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +			memset(val, 0, size);
> +		} else {
> +			memcpy(val, i->value, i->value_len);
> +			/* Clear padding bytes. */
> +			memset(val + i->value_len, 0, size - i->value_len);
> +		}
> +		return 0;
> +	}
> +
> +	/* Find out min_offs and last to calculate the free space. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
>  		if (!last->e_value_inum && last->e_value_size) {
> @@ -1048,120 +1407,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (!in_inode &&
> -		    !s->here->e_value_inum && s->here->e_value_size) {
> -			size_t size = le32_to_cpu(s->here->e_value_size);
> +		if (!here->e_value_inum && here->e_value_size) {
> +			size_t size = le32_to_cpu(here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
>  
> -		if (in_inode)
> -			value_len = 0;
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +	}
>  
> -		if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -			return -ENOSPC;
> +	/*
> +	 * Getting access to old and new ea inodes is subject to failures.
> +	 * Finish that work before doing any modifications to the xattr data.
> +	 */
> +	if (!s->not_found && here->e_value_inum) {
> +		ret = ext4_xattr_inode_iget(inode,
> +		 			    le32_to_cpu(here->e_value_inum),
> +		 			    &old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
>  	}
> +	if (i->value && in_inode) {
> +		WARN_ON_ONCE(!i->value_len);
>  
> -	if (i->value && s->not_found) {
> -		/* Insert the new name. */
> -		size_t size = EXT4_XATTR_LEN(name_len);
> -		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -		memmove((void *)s->here + size, s->here, rest);
> -		memset(s->here, 0, size);
> -		s->here->e_name_index = i->name_index;
> -		s->here->e_name_len = name_len;
> -		memcpy(s->here->e_name, i->name, name_len);
> -	} else {
> -		if (!s->here->e_value_inum && s->here->e_value_size &&
> -		    s->here->e_value_offs > 0) {
> -			void *first_val = s->base + min_offs;
> -			size_t offs = le16_to_cpu(s->here->e_value_offs);
> -			void *val = s->base + offs;
> -			size_t size = EXT4_XATTR_SIZE(
> -				le32_to_cpu(s->here->e_value_size));
> -
> -			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -				/* The old and the new value have the same
> -				   size. Just replace. */
> -				s->here->e_value_size =
> -					cpu_to_le32(i->value_len);
> -				if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -					memset(val, 0, size);
> -				} else {
> -					/* Clear pad bytes first. */
> -					memset(val + size - EXT4_XATTR_PAD, 0,
> -					       EXT4_XATTR_PAD);
> -					memcpy(val, i->value, i->value_len);
> -				}
> -				return 0;
> -			}
> +		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +		if (ret)
> +			goto out;
>  
> -			/* Remove the old value. */
> -			memmove(first_val + size, first_val, val - first_val);
> -			memset(first_val, 0, size);
> -			s->here->e_value_size = 0;
> -			s->here->e_value_offs = 0;
> -			min_offs += size;
> -
> -			/* Adjust all value offsets. */
> -			last = s->first;
> -			while (!IS_LAST_ENTRY(last)) {
> -				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (!last->e_value_inum &&
> -				    last->e_value_size && o < offs)
> -					last->e_value_offs =
> -						cpu_to_le16(o + size);
> -				last = EXT4_XATTR_NEXT(last);
> -			}
> +		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +						     i->value_len,
> +						     &new_ea_inode);
> +		if (ret) {
> +			new_ea_inode = NULL;
> +			ext4_xattr_inode_free_quota(inode, i->value_len);
> +			goto out;
>  		}
> -		if (s->here->e_value_inum) {
> -			ext4_xattr_inode_unlink(inode,
> -					    le32_to_cpu(s->here->e_value_inum));
> -			s->here->e_value_inum = 0;
> +	}
> +
> +	if (old_ea_inode) {
> +		/* We are ready to release ref count on the old_ea_inode. */
> +		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +		if (ret) {
> +			/* Release newly required ref count on new_ea_inode. */
> +			if (new_ea_inode) {
> +				int err;
> +
> +				err = ext4_xattr_inode_dec_ref(handle,
> +							       new_ea_inode);
> +				if (err)
> +					ext4_warning_inode(new_ea_inode,
> +						  "dec ref new_ea_inode err=%d",
> +						  err);
> +				ext4_xattr_inode_free_quota(inode,
> +							    i->value_len);
> +			}
> +			goto out;
>  		}
> -		if (!i->value) {
> -			/* Remove the old name. */
> -			size_t size = EXT4_XATTR_LEN(name_len);
> -			last = ENTRY((void *)last - size);
> -			memmove(s->here, (void *)s->here + size,
> -				(void *)last - (void *)s->here + sizeof(__u32));
> -			memset(last, 0, size);
> +
> +		ext4_xattr_inode_free_quota(inode,
> +					    le32_to_cpu(here->e_value_size));
> +	}
> +
> +	/* No failures allowed past this point. */
> +
> +	if (!s->not_found && here->e_value_offs) {
> +		/* Remove the old value. */
> +		void *first_val = s->base + min_offs;
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(
> +			le32_to_cpu(here->e_value_size));
> +
> +		memmove(first_val + size, first_val, val - first_val);
> +		memset(first_val, 0, size);
> +		min_offs += size;
> +
> +		/* Adjust all value offsets. */
> +		last = s->first;
> +		while (!IS_LAST_ENTRY(last)) {
> +			size_t o = le16_to_cpu(last->e_value_offs);
> +			if (!last->e_value_inum &&
> +			    last->e_value_size && o < offs)
> +				last->e_value_offs =
> +					cpu_to_le16(o + size);
> +			last = EXT4_XATTR_NEXT(last);
>  		}
>  	}
>  
> +	if (!s->not_found && !i->value) {
> +		/* Remove old name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		last = ENTRY((void *)last - size);
> +		memmove(here, (void *)here + size,
> +			(void *)last - (void *)here + sizeof(__u32));
> +		memset(last, 0, size);
> +	} else if (s->not_found && i->value) {
> +		/* Insert new name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +		memmove((void *)here + size, here, rest);
> +		memset(here, 0, size);
> +		here->e_name_index = i->name_index;
> +		here->e_name_len = name_len;
> +		memcpy(here->e_name, i->name, name_len);
> +	} else {
> +		WARN_ON_ONCE(s->not_found || !i->value);
> +		/* This is an update, reset value info. */
> +		here->e_value_inum = 0;
> +		here->e_value_offs = 0;
> +		here->e_value_size = 0;
> +	}
> +
>  	if (i->value) {
> -		/* Insert the new value. */
> +		/* Insert new value. */
>  		if (in_inode) {
> -			unsigned long ea_ino =
> -				le32_to_cpu(s->here->e_value_inum);
> -			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -						  i->value, i->value_len);
> -			if (rc)
> -				goto out;
> -			s->here->e_value_inum = cpu_to_le32(ea_ino);
> -			s->here->e_value_offs = 0;
> +			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>  		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
> -			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -			s->here->e_value_inum = 0;
> +			here->e_value_offs = cpu_to_le16(min_offs - size);
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> -				/* Clear the pad bytes first. */
> -				memset(val + size - EXT4_XATTR_PAD, 0,
> -				       EXT4_XATTR_PAD);
>  				memcpy(val, i->value, i->value_len);
> +				/* Clear padding bytes. */
> +				memset(val + i->value_len, 0,
> +				       size - i->value_len);
>  			}
>  		}
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> +		here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -
> +	ret = 0;
>  out:
> -	return rc;
> +	iput(old_ea_inode);
> +	iput(new_ea_inode);
> +	return ret;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -1223,6 +1611,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  	struct mb_cache_entry *ce = NULL;
>  	int error = 0;
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +	struct inode *ea_inode = NULL;
> +	size_t old_ea_inode_size = 0;
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> @@ -1277,6 +1667,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			header(s->base)->h_refcount = cpu_to_le32(1);
>  			s->here = ENTRY(s->base + offset);
>  			s->end = s->base + bs->bh->b_size;
> +
> +			/*
> +			 * If existing entry points to an xattr inode, we need
> +			 * to prevent ext4_xattr_set_entry() from decrementing
> +			 * ref count on it because the reference belongs to the
> +			 * original block. In this case, make the entry look
> +			 * like it has an empty value.
> +			 */
> +			if (!s->not_found && s->here->e_value_inum) {
> +				/*
> +				 * Defer quota free call for previous inode
> +				 * until success is guaranteed.
> +				 */
> +				old_ea_inode_size = le32_to_cpu(
> +							s->here->e_value_size);
> +				s->here->e_value_inum = 0;
> +				s->here->e_value_size = 0;
> +			}
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> @@ -1298,6 +1706,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		goto bad_block;
>  	if (error)
>  		goto cleanup;
> +
> +	if (i->value && s->here->e_value_inum) {
> +		unsigned int ea_ino;
> +
> +		/*
> +		 * A ref count on ea_inode has been taken as part of the call to
> +		 * ext4_xattr_set_entry() above. We would like to drop this
> +		 * extra ref but we have to wait until the xattr block is
> +		 * initialized and has its own ref count on the ea_inode.
> +		 */
> +		ea_ino = le32_to_cpu(s->here->e_value_inum);
> +		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +		if (error) {
> +			ea_inode = NULL;
> +			goto cleanup;
> +		}
> +	}
> +
>  	if (!IS_LAST_ENTRY(s->first))
>  		ext4_xattr_rehash(header(s->base), s->here);
>  
> @@ -1408,6 +1834,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  						 EXT4_FREE_BLOCKS_METADATA);
>  				goto cleanup;
>  			}
> +			error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +						      ENTRY(header(s->base)+1));
> +			if (error)
> +				goto getblk_failed;
> +			if (ea_inode) {
> +				/* Drop the extra ref on ea_inode. */
> +				error = ext4_xattr_inode_dec_ref(handle,
> +								 ea_inode);
> +				if (error)
> +					ext4_warning_inode(ea_inode,
> +							   "dec ref error=%d",
> +							   error);
> +				iput(ea_inode);
> +				ea_inode = NULL;
> +			}
> +
>  			lock_buffer(new_bh);
>  			error = ext4_journal_get_create_access(handle, new_bh);
>  			if (error) {
> @@ -1427,15 +1869,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		}
>  	}
>  
> +	if (old_ea_inode_size)
> +		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>  	/* Update the inode. */
>  	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>  
>  	/* Drop the previous xattr block. */
> -	if (bs->bh && bs->bh != new_bh)
> -		ext4_xattr_release_block(handle, inode, bs->bh);
> +	if (bs->bh && bs->bh != new_bh) {
> +		struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +		ext4_xattr_release_block(handle, inode, bs->bh,
> +					 &ea_inode_array,
> +					 0 /* extra_credits */);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +	}
>  	error = 0;
>  
>  cleanup:
> +	if (ea_inode) {
> +		int error2;
> +		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (error2)
> +			ext4_warning_inode(ea_inode, "dec ref error=%d",
> +					   error2);
> +
> +		/* If there was an error, revert the quota charge. */
> +		if (error)
> +			ext4_xattr_inode_free_quota(inode,
> +						    i_size_read(ea_inode));
> +		iput(ea_inode);
> +	}
>  	if (ce)
>  		mb_cache_entry_put(ext4_mb_cache, ce);
>  	brelse(new_bh);
> @@ -1546,6 +2009,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +struct ext4_xattr_ea_info {
> +	__le64 ref_count;	/* number of xattr entry references */
> +	__le32 hash;		/* crc32c hash of xattr data */
> +	__le32 reserved;	/* reserved, must be 0 */
> +};
> +
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash)
> +{
> +	struct ext4_xattr_ea_info ea_info = {
> +		.ref_count = cpu_to_le64(1),
> +		.hash = cpu_to_le32(hash),
> +		.reserved = 0,
> +	};
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
> +}
> +
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash)
> +{
> +	struct ext4_xattr_ea_info ea_info;
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	memcpy(&ea_info,
> +	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
> +	       sizeof(ea_info));
> +
> +	if (hash)
> +		*hash = le32_to_cpu(ea_info.hash);
> +
> +	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
> +	ea_info.ref_count = cpu_to_le64(*ref_return);
> +
> +	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
> +}
> +
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
> +{
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	struct ext4_xattr_ea_info *ea_info;
> +	void *ptr;
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
> +	ea_info = (struct ext4_xattr_ea_info *)ptr;
> +
> +	if (WARN_ON(ea_info->reserved != 0))
> +		return -EFSCORRUPTED;
> +
> +	*hash = le32_to_cpu(ea_info->hash);
> +	return 0;
> +}
> +
>  static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  				 struct ext4_xattr_info *i)
>  {
> @@ -1560,6 +2134,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  	return !memcmp(value, i->value, i->value_len);
>  }
>  
> +struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +	struct buffer_head *bh;
> +	int error;
> +
> +	if (!EXT4_I(inode)->i_file_acl)
> +		return NULL;
> +	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +	if (!bh)
> +		return ERR_PTR(-EIO);
> +	error = ext4_xattr_check_block(inode, bh);
> +	if (error)
> +		return ERR_PTR(error);
> +	return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1602,9 +2192,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  
>  	/* Check journal credits under write lock. */
>  	if (ext4_handle_valid(handle)) {
> +		struct buffer_head *bh;
>  		int credits;
>  
> -		credits = ext4_xattr_set_credits(inode, value_len);
> +		bh = ext4_xattr_get_block(inode);
> +		if (IS_ERR(bh)) {
> +			error = PTR_ERR(bh);
> +			goto cleanup;
> +		}
> +
> +		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +
>  		if (!ext4_handle_has_enough_credits(handle, credits)) {
>  			error = -ENOSPC;
>  			goto cleanup;
> @@ -1640,6 +2239,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (flags & XATTR_CREATE)
>  			goto cleanup;
>  	}
> +
>  	if (!value) {
>  		if (!is.s.not_found)
>  			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1708,34 +2308,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	return error;
>  }
>  
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -	struct super_block *sb = inode->i_sb;
> -	int credits;
> -
> -	if (!EXT4_SB(sb)->s_journal)
> -		return 0;
> -
> -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +	struct buffer_head *bh;
> +	int err;
>  
> -	/*
> -	 * In case of inline data, we may push out the data to a block,
> -	 * so we need to reserve credits for this eventuality
> -	 */
> -	if (ext4_has_inline_data(inode))
> -	        credits += ext4_writepage_trans_blocks(inode) + 1;
> +	*credits = 0;
>  
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -					sb->s_blocksize_bits;
> +	if (!EXT4_SB(inode->i_sb)->s_journal)
> +		return 0;
>  
> -		/* For new inode */
> -		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +	down_read(&EXT4_I(inode)->xattr_sem);
>  
> -		/* For data blocks of EA inode */
> -		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	bh = ext4_xattr_get_block(inode);
> +	if (IS_ERR(bh)) {
> +		err = PTR_ERR(bh);
> +	} else {
> +		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +		err = 0;
>  	}
> -	return credits;
> +
> +	up_read(&EXT4_I(inode)->xattr_sem);
> +	return err;
>  }
>  
>  /*
> @@ -1760,7 +2355,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  		return error;
>  
>  retry:
> -	credits = ext4_xattr_set_credits(inode, value_len);
> +	error = ext4_xattr_set_credits(inode, value_len, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2066,10 +2664,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  	return error;
>  }
>  
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2114,21 +2712,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array,
> -			int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
>  {
>  	struct buffer_head *bh = NULL;
>  	struct ext4_xattr_ibody_header *header;
> -	struct ext4_inode *raw_inode;
>  	struct ext4_iloc iloc = { .bh = NULL };
> +	struct ext4_xattr_entry *entry;
>  	int error;
>  
>  	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2140,66 +2736,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  		goto cleanup;
>  	}
>  
> -	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -		goto delete_external_ea;
> -
> -	error = ext4_get_inode_loc(inode, &iloc);
> -	if (error)
> -		goto cleanup;
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>  
> -	error = ext4_journal_get_write_access(handle, iloc.bh);
> -	if (error)
> -		goto cleanup;
> +		error = ext4_get_inode_loc(inode, &iloc);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +			goto cleanup;
> +		}
>  
> -	raw_inode = ext4_raw_inode(&iloc);
> -	header = IHDR(inode, raw_inode);
> -	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -				    false /* block_csum */, ea_inode_array,
> -				    extra_credits);
> +		error = ext4_journal_get_write_access(handle, iloc.bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "write access (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  
> -delete_external_ea:
> -	if (!EXT4_I(inode)->i_file_acl) {
> -		error = 0;
> -		goto cleanup;
> -	}
> -	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -	if (!bh) {
> -		EXT4_ERROR_INODE(inode, "block %llu read error",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EIO;
> -		goto cleanup;
> -	}
> -	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -		EXT4_ERROR_INODE(inode, "bad block %llu",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EFSCORRUPTED;
> -		goto cleanup;
> +		header = IHDR(inode, ext4_raw_inode(&iloc));
> +		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +						     IFIRST(header),
> +						     false /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     false /* skip_quota */);
>  	}
>  
> -	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -		error = ext4_journal_get_write_access(handle, bh);
> -		if (error) {
> -			EXT4_ERROR_INODE(inode, "write access %llu",
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			EXT4_ERROR_INODE(inode, "block %llu read error",
>  					 EXT4_I(inode)->i_file_acl);
> +			error = -EIO;
> +			goto cleanup;
> +		}
> +		error = ext4_xattr_check_block(inode, bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +					 EXT4_I(inode)->i_file_acl, error);
>  			goto cleanup;
>  		}
> -		ext4_xattr_inode_remove_all(handle, inode, bh,
> -					    BFIRST(bh),
> -					    true /* block_csum */,
> -					    ea_inode_array,
> -					    extra_credits);
> -	}
>  
> -	ext4_xattr_release_block(handle, inode, bh);
> -	/* Update i_file_acl within the same transaction that releases block. */
> -	EXT4_I(inode)->i_file_acl = 0;
> -	error = ext4_mark_inode_dirty(handle, inode);
> -	if (error) {
> -		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -				 error);
> -		goto cleanup;
> +		if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +			     entry = EXT4_XATTR_NEXT(entry))
> +				if (entry->e_value_inum)
> +					ext4_xattr_inode_free_quota(inode,
> +					      le32_to_cpu(entry->e_value_size));
> +
> +		}
> +
> +		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +					 extra_credits);
> +		/*
> +		 * Update i_file_acl value in the same transaction that releases
> +		 * block.
> +		 */
> +		EXT4_I(inode)->i_file_acl = 0;
> +		error = ext4_mark_inode_dirty(handle, inode);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  	}
> +	error = 0;
>  cleanup:
>  	brelse(iloc.bh);
>  	brelse(bh);
> @@ -2208,17 +2809,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -	struct inode	*ea_inode;
> -	int		idx = 0;
> +	int idx;
>  
>  	if (ea_inode_array == NULL)
>  		return;
>  
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		ea_inode = ea_inode_array->inodes[idx];
> -		clear_nlink(ea_inode);
> -		iput(ea_inode);
> -	}
> +	for (idx = 0; idx < ea_inode_array->count; ++idx)
> +		iput(ea_inode_array->inodes[idx]);
>  	kfree(ea_inode_array);
>  }
>  
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -70,19 +70,6 @@ struct ext4_xattr_entry {
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
>  /*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
> -/*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
>  */
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +				  int *credits);
>  
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  				   struct ext4_xattr_inode_array **array,
>  				   int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 77a5b99d8f92..7dfdca822ccb 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use hash of data as a key and provide a value that may represent a
> + * block or inode number. That's why keys need not be unique (hash of different
> + * data may be the same). However user provided value always uniquely
> + * identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 27/28] ext4: xattr inode deduplication
@ 2017-05-31 16:00     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:00 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 01:15:16AM -0700, Tahsin Erdogan wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written, the data blocks of these inodes are immutable.
> 
> Real-world use cases are expected to have a lot of value duplication,
> such as inherited ACLs. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a
> best-effort sharing scheme. When an xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |    7 +-
>  fs/ext4/inode.c |    9 +-
>  fs/ext4/super.c |   22 +-
>  fs/ext4/xattr.c | 1073 +++++++++++++++++++++++++++++++++++++++++++------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 881 insertions(+), 261 deletions(-)
> 
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  	if (error)
>  		return error;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, acl_size);
> +	error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..79f06290e723 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>  	long s_es_nr_inode;
>  	struct ext4_es_stats s_es_stats;
>  	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;
>  	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>  
>  	/* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>  	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>  
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +	return IS_NOQUOTA(inode) &&
> +	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>  
>  /*
>   * This structure is stuffed into the struct file's private_data field
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 4d6936f0d8a4..6f5872197d6c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	}
>  	brelse(iloc.bh);
>  	ext4_set_inode_flags(inode);
> -	if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +	if (ei->i_flags & EXT4_EA_INODE_FL) {
>  		ext4_xattr_inode_set_class(inode);
> +
> +		inode_lock(inode);
> +		inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(inode);
> +	}
> +
>  	unlock_new_inode(inode);
>  	return inode;
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..7d2b692d52ea 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>  		invalidate_bdev(sbi->journal_bdev);
>  		ext4_blkdev_remove(sbi);
>  	}
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>  	if (res)
>  		return res;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, len);
> +	res = ext4_xattr_set_credits(inode, len, &credits);
> +	if (res)
> +		return res;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> @@ -4067,6 +4074,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount_wq;
>  	}
>  
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}
> +
>  	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>  	    (blocksize != PAGE_SIZE)) {
>  		ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4312,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (EXT4_SB(sb)->rsv_conversion_wq)
>  		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 6acce1f689ab..caddc176a612 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -55,6 +55,7 @@
>  #include <linux/slab.h>
>  #include <linux/mbcache.h>
>  #include <linux/quotaops.h>
> +#include <linux/crc32c.h>
>  #include "ext4_jbd2.h"
>  #include "ext4.h"
>  #include "xattr.h"
> @@ -79,6 +80,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
>  			    struct mb_cache_entry **);
>  static void ext4_xattr_rehash(struct ext4_xattr_header *,
>  			      struct ext4_xattr_entry *);
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
>  
>  static const struct xattr_handler * const ext4_xattr_handler_map[] = {
>  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
> @@ -105,13 +107,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  	NULL
>  };
>  
> +#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
> +
>  #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
>  				inode->i_sb->s_fs_info)->s_mb_cache)
>  
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>  			struct inode *inode);
>  
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash);
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -329,14 +341,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  		goto error;
>  	}
>  
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}
> -
>  	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>  		ext4_error(parent->i_sb, "EA inode %lu does not have "
>  			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> @@ -351,6 +355,11 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>  
> +static u32 ext4_xattr_inode_hash(const void *buffer, size_t size)
> +{
> +	return crc32c(0, buffer, size);

The metadata checksumming code uses crypto_alloc_shash to dynamically
bind to the crc32c implementation only when ext4 actually needs it.
This patch, by contrast, introduces a static module dependency on
libcrc32c even though we only need it if we're deduplicating xattrs,
which is done only when EXT4_FEATURE_INCOMPAT_EA_INODE is enabled,
correct?

Or, looking through the code, maybe not; are we now capable of deduping
for any filesystem?

Anyway, if this dedupe feature is hidden behind INCOMPAT_EA_INODE then
this crc32c call binding should be done dynamically; however, if it
works for any filesystem and is therefore on all the time, the existing
users ought to be converted to use libcrc32c.

--D

> +}
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +367,52 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>  		     size_t size)
>  {
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>  	struct inode *ea_inode;
> -	int ret;
> +	u32 hash, calc_hash;
> +	struct mb_cache_entry *ce;
> +	int err;
>  
> -	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (ret)
> -		return ret;
> +	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +	if (err) {
> +		ea_inode = NULL;
> +		goto out;
> +	}
>  
> -	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -	iput(ea_inode);
> +	if (i_size_read(ea_inode) != size) {
> +		ext4_warning_inode(ea_inode,
> +				   "ea_inode file size=%llu entry size=%zu",
> +				   i_size_read(ea_inode), size);
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
>  
> -	return ret;
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	if (!err) {
> +		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
> +			goto out;
> +
> +		/* Avoid hash calculation if already cached. */
> +		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
> +		if (ce) {
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			goto out;
> +		}
> +
> +		calc_hash = ext4_xattr_inode_hash(buffer, size);
> +		if (hash != calc_hash) {
> +			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
> +					   "does not match calc_hash=%#x",
> +					   hash, calc_hash);
> +			goto out;
> +		}
> +
> +		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +				      ea_inode->i_ino, true /* reusable */);
> +	}
> +out:
> +	iput(ea_inode);
> +	return err;
>  }
>  
>  static int
> @@ -657,6 +701,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +				    inode->i_blkbits);
> +	size_t mask = ~(cluster_size - 1);
> +
> +	return (length + cluster_size - 1) & mask;
> +}
> +
> +/*
> + * Charge @inode's quota for one inode plus @len bytes rounded up to a whole
> + * cluster. If the space charge fails the inode charge is rolled back.
> + * Returns 0 on success or the error of the failing dquot call.
> + */
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +	int ret = dquot_alloc_inode(inode);
> +
> +	if (!ret) {
> +		ret = dquot_alloc_space_nodirty(inode,
> +						round_up_cluster(inode, len));
> +		if (ret)
> +			dquot_free_inode(inode);
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Release the quota charged for an xattr inode value of @len bytes: first the
> + * cluster-rounded space, then the inode.
> + */
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +	size_t charged = round_up_cluster(inode, len);
> +
> +	dquot_free_space_nodirty(inode, charged);
> +	dquot_free_inode(inode);
> +}
> +
> +/*
> + * Estimate the journal credits needed to set an xattr of @value_len bytes.
> + *
> + * @block_bh is the inode's current xattr block, or NULL if there is none; it
> + * is only walked to count entries that reference an ea_inode.
> + */
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	struct ext4_xattr_entry *entry;
> +	int new_blocks, old_blocks;
> +	int credits;
> +
> +	/*
> +	 * Owner inode update, ref count update on the old xattr block, the new
> +	 * xattr block, and the block bitmap and group descriptor for it.
> +	 */
> +	credits = 5;
> +
> +	/* Nothing else to account for without the ea_inode feature. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode: data blocks for the value plus one indirection block. */
> +	new_blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +	new_blocks += 1;
> +
> +	/* Old ea_inode: worst-case data blocks plus one indirection block. */
> +	old_blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +	old_blocks += 1;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +	/* Block bitmap and group descriptor updates for each new block. */
> +	credits += new_blocks * 2;
> +	/* The new blocks themselves. */
> +	credits += new_blocks;
> +	/* Old ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +	/* Block bitmap and group descriptor updates for each old block. */
> +	credits += old_blocks * 2;
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	if (!block_bh)
> +		return credits;
> +
> +	/*
> +	 * The existing xattr block may have to be cloned, in which case every
> +	 * ea_inode it references needs a ref count update.
> +	 */
> +	entry = BFIRST(block_bh);
> +	while (!IS_LAST_ENTRY(entry)) {
> +		if (entry->e_value_inum)
> +			credits += 1;
> +		entry = EXT4_XATTR_NEXT(entry);
> +	}
> +	return credits;
> +}
> +
>  int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  			      int credits, struct buffer_head *bh,
>  			      bool dirty, bool block_csum)
> @@ -706,12 +845,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +/*
> + * Adjust the reference count stored in @ea_inode's ea_info by @ref_change,
> + * holding the ea_inode's inode lock for the duration.
> + *
> + * When an increment brings the count to 1 the inode gets nlink 1, is removed
> + * from the orphan list and is added to the ea_inode cache under its stored
> + * hash. When a decrement brings the count to 0 the inode's nlink is cleared,
> + * it is put on the orphan list and it is removed from the cache.
> + *
> + * Returns 0 on success or the error from reserving, updating or dirtying the
> + * inode.
> + */
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +				       int ref_change)
> +{
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +	struct ext4_iloc iloc;
> +	u64 ref_count;
> +	s64 ref_return;
> +	u32 hash;
> +	int ret;
> +
> +	inode_lock(ea_inode);
> +
> +	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +	if (ret) {
> +		iloc.bh = NULL;
> +		goto out;
> +	}
> +
> +	/*
> +	 * ext4_xattr_update_ea_info() reports the new count through a u64
> +	 * pointer. Receive it in a u64 and convert to signed afterwards so the
> +	 * underflow checks below work without passing a mismatched pointer.
> +	 */
> +	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_count,
> +					&hash);
> +	if (ret)
> +		goto out;
> +	ref_return = (s64)ref_count;
> +
> +	if (ref_change > 0) {
> +		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 1) {
> +			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			set_nlink(ea_inode, 1);
> +			ext4_orphan_del(handle, ea_inode);
> +
> +			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +					      ea_inode->i_ino,
> +					      true /* reusable */);
> +		}
> +	} else {
> +		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 0) {
> +			WARN_ONCE(ea_inode->i_nlink != 1,
> +				  "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			clear_nlink(ea_inode);
> +			ext4_orphan_add(handle, ea_inode);
> +
> +			mb_cache_entry_delete(ea_inode_cache, hash,
> +					      ea_inode->i_ino);
> +		}
> +	}
> +
> +	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +	/* Cleared so the brelse() below does not touch iloc.bh again. */
> +	iloc.bh = NULL;
> +	if (ret)
> +		ext4_warning_inode(ea_inode,
> +				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +	brelse(iloc.bh);
> +	inode_unlock(ea_inode);
> +	return ret;
> +}
> +
> +/* Take one reference on @ea_inode via ext4_xattr_inode_update_ref(). */
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	const int delta = 1;
> +
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, delta);
> +}
> +
> +/* Drop one reference on @ea_inode via ext4_xattr_inode_update_ref(). */
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	const int delta = -1;
> +
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, delta);
> +}
> +
> +/*
> + * Take one reference on every ea_inode named by the entry list starting at
> + * @first. If any lookup or increment fails, the references already taken on
> + * the preceding entries are dropped again (failures there are only logged)
> + * and the original error is returned. Returns 0 on success.
> + */
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +					struct ext4_xattr_entry *first)
> +{
> +	struct ext4_xattr_entry *cur, *stop;
> +	struct inode *ea_inode;
> +	unsigned int ino;
> +	int rc, first_rc;
> +
> +	cur = first;
> +	while (!IS_LAST_ENTRY(cur)) {
> +		if (cur->e_value_inum) {
> +			ino = le32_to_cpu(cur->e_value_inum);
> +			rc = ext4_xattr_inode_iget(parent, ino, &ea_inode);
> +			if (rc)
> +				goto cleanup;
> +			rc = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +			if (rc)
> +				ext4_warning_inode(ea_inode, "inc ref error %d",
> +						   rc);
> +			iput(ea_inode);
> +			if (rc)
> +				goto cleanup;
> +		}
> +		cur = EXT4_XATTR_NEXT(cur);
> +	}
> +	return 0;
> +
> +cleanup:
> +	/* Undo the increments on every entry before the one that failed. */
> +	first_rc = rc;
> +	stop = cur;
> +
> +	for (cur = first; cur != stop; cur = EXT4_XATTR_NEXT(cur)) {
> +		if (!cur->e_value_inum)
> +			continue;
> +		ino = le32_to_cpu(cur->e_value_inum);
> +		rc = ext4_xattr_inode_iget(parent, ino, &ea_inode);
> +		if (rc) {
> +			ext4_warning(parent->i_sb,
> +				     "cleanup ea_ino %u iget error %d", ino,
> +				     rc);
> +			continue;
> +		}
> +		rc = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (rc)
> +			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +					   rc);
> +		iput(ea_inode);
> +	}
> +	return first_rc;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -			    struct buffer_head *bh,
> -			    struct ext4_xattr_entry *first, bool block_csum,
> -			    struct ext4_xattr_inode_array **ea_inode_array,
> -			    int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +			     struct buffer_head *bh,
> +			     struct ext4_xattr_entry *first, bool block_csum,
> +			     struct ext4_xattr_inode_array **ea_inode_array,
> +			     int extra_credits, bool skip_quota)
>  {
>  	struct inode *ea_inode;
>  	struct ext4_xattr_entry *entry;
> @@ -748,10 +1014,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>  			continue;
>  		}
>  
> -		inode_lock(ea_inode);
> -		clear_nlink(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		if (!skip_quota)
> +			ext4_xattr_inode_free_quota(parent,
> +					      le32_to_cpu(entry->e_value_size));
>  
>  		/*
>  		 * Forget about ea_inode within the same transaction that decrements the ref
> @@ -784,7 +1056,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -			 struct buffer_head *bh)
> +			 struct buffer_head *bh,
> +			 struct ext4_xattr_inode_array **ea_inode_array,
> +			 int extra_credits)
>  {
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>  	u32 hash, ref;
> @@ -807,6 +1081,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
> +
> +		if (ext4_has_feature_ea_inode(inode->i_sb))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +						     BFIRST(bh),
> +						     true /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     true /* skip_quota */);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
>  				 EXT4_FREE_BLOCKS_METADATA |
>  				 EXT4_FREE_BLOCKS_FORGET);
> @@ -947,7 +1229,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -					     struct inode *inode)
> +					     struct inode *inode, u32 hash)
>  {
>  	struct inode *ea_inode = NULL;
>  	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -965,67 +1247,118 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>  		ea_inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(ea_inode);
>  		ext4_xattr_inode_set_class(ea_inode);
> -		ea_inode->i_generation = inode->i_generation;
> -		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -		/*
> -		 * A back-pointer from EA inode to parent inode will be useful
> -		 * for e2fsck.
> -		 */
> -		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>  		unlock_new_inode(ea_inode);
>  		err = ext4_inode_attach_jinode(ea_inode);
> +		if (!err)
> +			err = ext4_xattr_inode_init(handle, ea_inode, hash);
>  		if (err) {
>  			iput(ea_inode);
>  			return ERR_PTR(err);
>  		}
> +
> +		/*
> +		 * Xattr inodes are shared therefore quota charging is performed
> +		 * at a higher level.
> +		 */
> +		dquot_free_inode(ea_inode);
> +		dquot_drop(ea_inode);
> +		inode_lock(ea_inode);
> +		ea_inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(ea_inode);
>  	}
>  
>  	return ea_inode;
>  }
>  
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +/*
> + * Search the ea_inode cache for an existing EA inode whose contents are
> + * byte-for-byte equal to @value (@value_len bytes, hashing to @hash).
> + *
> + * Returns the matching inode with a reference held (the caller must iput()
> + * it), or NULL if no candidate matches or the comparison buffer cannot be
> + * allocated.
> + */
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
>  {
> -	struct inode *ea_inode = NULL;
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
>  	int err;
>  
> -	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (err)
> -		return err;
> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			/* iput(NULL) at "next" is a no-op. */
> +			ea_inode = NULL;
> +			goto next;
> +		}
>  
> -	clear_nlink(ea_inode);
> -	iput(ea_inode);
> +		/* Cheap rejections before reading the candidate's data. */
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
>  
> -	return 0;
> +		/* The comparison buffer is allocated once and reused. */
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			/* Drop the cache entry reference, not only the inode. */
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +	next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
>  }
>  
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
>  {
> +	/*
> +	 * Reuse a cached EA inode that already holds exactly @value, taking a
> +	 * reference on it, or create a new EA inode and write @value into it.
> +	 * On success *ret_inode holds an inode reference that the caller must
> +	 * iput(); on failure a negative errno is returned and *ret_inode is
> +	 * not written.
> +	 */
> +	u32 hash = ext4_xattr_inode_hash(value, value_len);
>  	struct inode *ea_inode;
>  	int err;
>  
> +	/* Prefer sharing an existing inode with identical contents. */
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
>  	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>  	if (IS_ERR(ea_inode))
>  		return PTR_ERR(ea_inode);
>  
>  	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		/* Best effort: the result of the ref drop is not checked. */
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
>  
> -	iput(ea_inode);
> +	/* Make the new inode discoverable by later lookups of this value. */
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);
>  
> -	return err;
> +	*ret_inode = ea_inode;
> +	return 0;
>  }
>  
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1033,11 +1366,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
> +	struct ext4_xattr_entry *here = s->here;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>  	int in_inode = i->in_inode;
> -	int rc;
> +	struct inode *old_ea_inode = NULL;
> +	struct inode *new_ea_inode = NULL;
> +	int ret;
>  
> -	/* Compute min_offs and last. */
> +	/*
> +	 * Optimization for the simple case when old and new values have the
> +	 * same padded sizes. Not applicable if the existing value is stored in
> +	 * an external inode.
> +	 */
> +	if (i->value && !s->not_found && !here->e_value_inum &&
> +	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
> +	    EXT4_XATTR_SIZE(i->value_len)) {
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(i->value_len);
> +
> +		here->e_value_size = cpu_to_le32(i->value_len);
> +		if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +			memset(val, 0, size);
> +		} else {
> +			memcpy(val, i->value, i->value_len);
> +			/* Clear padding bytes. */
> +			memset(val + i->value_len, 0, size - i->value_len);
> +		}
> +		return 0;
> +	}
> +
> +	/* Find out min_offs and last to calculate the free space. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
>  		if (!last->e_value_inum && last->e_value_size) {
> @@ -1048,120 +1407,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (!in_inode &&
> -		    !s->here->e_value_inum && s->here->e_value_size) {
> -			size_t size = le32_to_cpu(s->here->e_value_size);
> +		if (!here->e_value_inum && here->e_value_size) {
> +			size_t size = le32_to_cpu(here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
>  
> -		if (in_inode)
> -			value_len = 0;
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +	}
>  
> -		if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -			return -ENOSPC;
> +	/*
> +	 * Getting access to old and new ea inodes is subject to failures.
> +	 * Finish that work before doing any modifications to the xattr data.
> +	 */
> +	if (!s->not_found && here->e_value_inum) {
> +		ret = ext4_xattr_inode_iget(inode,
> +		 			    le32_to_cpu(here->e_value_inum),
> +		 			    &old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
>  	}
> +	if (i->value && in_inode) {
> +		WARN_ON_ONCE(!i->value_len);
>  
> -	if (i->value && s->not_found) {
> -		/* Insert the new name. */
> -		size_t size = EXT4_XATTR_LEN(name_len);
> -		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -		memmove((void *)s->here + size, s->here, rest);
> -		memset(s->here, 0, size);
> -		s->here->e_name_index = i->name_index;
> -		s->here->e_name_len = name_len;
> -		memcpy(s->here->e_name, i->name, name_len);
> -	} else {
> -		if (!s->here->e_value_inum && s->here->e_value_size &&
> -		    s->here->e_value_offs > 0) {
> -			void *first_val = s->base + min_offs;
> -			size_t offs = le16_to_cpu(s->here->e_value_offs);
> -			void *val = s->base + offs;
> -			size_t size = EXT4_XATTR_SIZE(
> -				le32_to_cpu(s->here->e_value_size));
> -
> -			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -				/* The old and the new value have the same
> -				   size. Just replace. */
> -				s->here->e_value_size =
> -					cpu_to_le32(i->value_len);
> -				if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -					memset(val, 0, size);
> -				} else {
> -					/* Clear pad bytes first. */
> -					memset(val + size - EXT4_XATTR_PAD, 0,
> -					       EXT4_XATTR_PAD);
> -					memcpy(val, i->value, i->value_len);
> -				}
> -				return 0;
> -			}
> +		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +		if (ret)
> +			goto out;
>  
> -			/* Remove the old value. */
> -			memmove(first_val + size, first_val, val - first_val);
> -			memset(first_val, 0, size);
> -			s->here->e_value_size = 0;
> -			s->here->e_value_offs = 0;
> -			min_offs += size;
> -
> -			/* Adjust all value offsets. */
> -			last = s->first;
> -			while (!IS_LAST_ENTRY(last)) {
> -				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (!last->e_value_inum &&
> -				    last->e_value_size && o < offs)
> -					last->e_value_offs =
> -						cpu_to_le16(o + size);
> -				last = EXT4_XATTR_NEXT(last);
> -			}
> +		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +						     i->value_len,
> +						     &new_ea_inode);
> +		if (ret) {
> +			new_ea_inode = NULL;
> +			ext4_xattr_inode_free_quota(inode, i->value_len);
> +			goto out;
>  		}
> -		if (s->here->e_value_inum) {
> -			ext4_xattr_inode_unlink(inode,
> -					    le32_to_cpu(s->here->e_value_inum));
> -			s->here->e_value_inum = 0;
> +	}
> +
> +	if (old_ea_inode) {
> +		/* We are ready to release ref count on the old_ea_inode. */
> +		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +		if (ret) {
> +			/* Release newly required ref count on new_ea_inode. */
> +			if (new_ea_inode) {
> +				int err;
> +
> +				err = ext4_xattr_inode_dec_ref(handle,
> +							       new_ea_inode);
> +				if (err)
> +					ext4_warning_inode(new_ea_inode,
> +						  "dec ref new_ea_inode err=%d",
> +						  err);
> +				ext4_xattr_inode_free_quota(inode,
> +							    i->value_len);
> +			}
> +			goto out;
>  		}
> -		if (!i->value) {
> -			/* Remove the old name. */
> -			size_t size = EXT4_XATTR_LEN(name_len);
> -			last = ENTRY((void *)last - size);
> -			memmove(s->here, (void *)s->here + size,
> -				(void *)last - (void *)s->here + sizeof(__u32));
> -			memset(last, 0, size);
> +
> +		ext4_xattr_inode_free_quota(inode,
> +					    le32_to_cpu(here->e_value_size));
> +	}
> +
> +	/* No failures allowed past this point. */
> +
> +	if (!s->not_found && here->e_value_offs) {
> +		/* Remove the old value. */
> +		void *first_val = s->base + min_offs;
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(
> +			le32_to_cpu(here->e_value_size));
> +
> +		memmove(first_val + size, first_val, val - first_val);
> +		memset(first_val, 0, size);
> +		min_offs += size;
> +
> +		/* Adjust all value offsets. */
> +		last = s->first;
> +		while (!IS_LAST_ENTRY(last)) {
> +			size_t o = le16_to_cpu(last->e_value_offs);
> +			if (!last->e_value_inum &&
> +			    last->e_value_size && o < offs)
> +				last->e_value_offs =
> +					cpu_to_le16(o + size);
> +			last = EXT4_XATTR_NEXT(last);
>  		}
>  	}
>  
> +	if (!s->not_found && !i->value) {
> +		/* Remove old name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		last = ENTRY((void *)last - size);
> +		memmove(here, (void *)here + size,
> +			(void *)last - (void *)here + sizeof(__u32));
> +		memset(last, 0, size);
> +	} else if (s->not_found && i->value) {
> +		/* Insert new name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +		memmove((void *)here + size, here, rest);
> +		memset(here, 0, size);
> +		here->e_name_index = i->name_index;
> +		here->e_name_len = name_len;
> +		memcpy(here->e_name, i->name, name_len);
> +	} else {
> +		WARN_ON_ONCE(s->not_found || !i->value);
> +		/* This is an update, reset value info. */
> +		here->e_value_inum = 0;
> +		here->e_value_offs = 0;
> +		here->e_value_size = 0;
> +	}
> +
>  	if (i->value) {
> -		/* Insert the new value. */
> +		/* Insert new value. */
>  		if (in_inode) {
> -			unsigned long ea_ino =
> -				le32_to_cpu(s->here->e_value_inum);
> -			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -						  i->value, i->value_len);
> -			if (rc)
> -				goto out;
> -			s->here->e_value_inum = cpu_to_le32(ea_ino);
> -			s->here->e_value_offs = 0;
> +			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>  		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
> -			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -			s->here->e_value_inum = 0;
> +			here->e_value_offs = cpu_to_le16(min_offs - size);
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> -				/* Clear the pad bytes first. */
> -				memset(val + size - EXT4_XATTR_PAD, 0,
> -				       EXT4_XATTR_PAD);
>  				memcpy(val, i->value, i->value_len);
> +				/* Clear padding bytes. */
> +				memset(val + i->value_len, 0,
> +				       size - i->value_len);
>  			}
>  		}
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> +		here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -
> +	ret = 0;
>  out:
> -	return rc;
> +	iput(old_ea_inode);
> +	iput(new_ea_inode);
> +	return ret;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -1223,6 +1611,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  	struct mb_cache_entry *ce = NULL;
>  	int error = 0;
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +	struct inode *ea_inode = NULL;
> +	size_t old_ea_inode_size = 0;
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> @@ -1277,6 +1667,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			header(s->base)->h_refcount = cpu_to_le32(1);
>  			s->here = ENTRY(s->base + offset);
>  			s->end = s->base + bs->bh->b_size;
> +
> +			/*
> +			 * If existing entry points to an xattr inode, we need
> +			 * to prevent ext4_xattr_set_entry() from decrementing
> +			 * ref count on it because the reference belongs to the
> +			 * original block. In this case, make the entry look
> +			 * like it has an empty value.
> +			 */
> +			if (!s->not_found && s->here->e_value_inum) {
> +				/*
> +				 * Defer quota free call for previous inode
> +				 * until success is guaranteed.
> +				 */
> +				old_ea_inode_size = le32_to_cpu(
> +							s->here->e_value_size);
> +				s->here->e_value_inum = 0;
> +				s->here->e_value_size = 0;
> +			}
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> @@ -1298,6 +1706,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		goto bad_block;
>  	if (error)
>  		goto cleanup;
> +
> +	if (i->value && s->here->e_value_inum) {
> +		unsigned int ea_ino;
> +
> +		/*
> +		 * A ref count on ea_inode has been taken as part of the call to
> +		 * ext4_xattr_set_entry() above. We would like to drop this
> +		 * extra ref but we have to wait until the xattr block is
> +		 * initialized and has its own ref count on the ea_inode.
> +		 */
> +		ea_ino = le32_to_cpu(s->here->e_value_inum);
> +		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +		if (error) {
> +			ea_inode = NULL;
> +			goto cleanup;
> +		}
> +	}
> +
>  	if (!IS_LAST_ENTRY(s->first))
>  		ext4_xattr_rehash(header(s->base), s->here);
>  
> @@ -1408,6 +1834,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  						 EXT4_FREE_BLOCKS_METADATA);
>  				goto cleanup;
>  			}
> +			error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +						      ENTRY(header(s->base)+1));
> +			if (error)
> +				goto getblk_failed;
> +			if (ea_inode) {
> +				/* Drop the extra ref on ea_inode. */
> +				error = ext4_xattr_inode_dec_ref(handle,
> +								 ea_inode);
> +				if (error)
> +					ext4_warning_inode(ea_inode,
> +							   "dec ref error=%d",
> +							   error);
> +				iput(ea_inode);
> +				ea_inode = NULL;
> +			}
> +
>  			lock_buffer(new_bh);
>  			error = ext4_journal_get_create_access(handle, new_bh);
>  			if (error) {
> @@ -1427,15 +1869,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		}
>  	}
>  
> +	if (old_ea_inode_size)
> +		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>  	/* Update the inode. */
>  	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>  
>  	/* Drop the previous xattr block. */
> -	if (bs->bh && bs->bh != new_bh)
> -		ext4_xattr_release_block(handle, inode, bs->bh);
> +	if (bs->bh && bs->bh != new_bh) {
> +		struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +		ext4_xattr_release_block(handle, inode, bs->bh,
> +					 &ea_inode_array,
> +					 0 /* extra_credits */);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +	}
>  	error = 0;
>  
>  cleanup:
> +	if (ea_inode) {
> +		int error2;
> +		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (error2)
> +			ext4_warning_inode(ea_inode, "dec ref error=%d",
> +					   error2);
> +
> +		/* If there was an error, revert the quota charge. */
> +		if (error)
> +			ext4_xattr_inode_free_quota(inode,
> +						    i_size_read(ea_inode));
> +		iput(ea_inode);
> +	}
>  	if (ce)
>  		mb_cache_entry_put(ext4_mb_cache, ce);
>  	brelse(new_bh);
> @@ -1546,6 +2009,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +/*
> + * Value of the EXT4_XATTR_SYSTEM_EA_INFO attribute that
> + * ext4_xattr_inode_init() stores in the body of an EA inode. All fields are
> + * little-endian.
> + */
> +struct ext4_xattr_ea_info {
> +	__le64 ref_count;	/* number of xattr entry references */
> +	__le32 hash;		/* crc32c hash of xattr data */
> +	__le32 reserved;	/* reserved, must be 0 */
> +};
> +
> +/*
> + * Store the initial ea_info (ref_count 1, @hash, reserved 0) as the
> + * EXT4_XATTR_SYSTEM_EA_INFO attribute in the body of @ea_inode.
> + *
> + * Returns 0 on success or a negative errno from locating the inode, searching
> + * its body or setting the attribute.
> + */
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash)
> +{
> +	struct ext4_xattr_ea_info ea_info = {
> +		.ref_count = cpu_to_le64(1),
> +		.hash = cpu_to_le32(hash),
> +		.reserved = 0,
> +	};
> +	struct ext4_xattr_info i = {
> +		.name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (!err)
> +		err = ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
> +
> +	/* Release the buffer reference obtained by ext4_get_inode_loc(). */
> +	brelse(is.iloc.bh);
> +	return err;
> +}
> +
> +/*
> + * Read the ea_info stored in the body of @ea_inode, add @ref_change to its
> + * reference count and write the record back.
> + *
> + * @ref_return receives the updated count. @hash, if non-NULL, receives the
> + * stored hash. Returns 0 on success, -EFSCORRUPTED if the ea_info attribute
> + * is missing or has an unexpected size, or another negative errno.
> + */
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash)
> +{
> +	struct ext4_xattr_ea_info ea_info;
> +	struct ext4_xattr_info i = {
> +		.name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		goto out;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info))) {
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	memcpy(&ea_info,
> +	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
> +	       sizeof(ea_info));
> +
> +	if (hash)
> +		*hash = le32_to_cpu(ea_info.hash);
> +
> +	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
> +	ea_info.ref_count = cpu_to_le64(*ref_return);
> +
> +	/* The new value has the same size as the old one. */
> +	err = ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
> +out:
> +	/* Release the buffer reference obtained by ext4_get_inode_loc(). */
> +	brelse(is.iloc.bh);
> +	return err;
> +}
> +
> +/*
> + * Fetch the hash recorded in the ea_info attribute of @ea_inode.
> + *
> + * Returns 0 and stores the hash in *@hash on success, -EFSCORRUPTED if the
> + * ea_info attribute is missing, has an unexpected size or a non-zero
> + * reserved field, or another negative errno.
> + */
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
> +{
> +	struct ext4_xattr_info i = {
> +		.name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	struct ext4_xattr_ea_info *ea_info;
> +	void *ptr;
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		goto out;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info))) {
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
> +	ea_info = (struct ext4_xattr_ea_info *)ptr;
> +
> +	if (WARN_ON(ea_info->reserved != 0)) {
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	/* ea_info is read through is.s.base, so copy it before the brelse. */
> +	*hash = le32_to_cpu(ea_info->hash);
> +out:
> +	/* Release the buffer reference obtained by ext4_get_inode_loc(). */
> +	brelse(is.iloc.bh);
> +	return err;
> +}
> +
>  static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  				 struct ext4_xattr_info *i)
>  {
> @@ -1560,6 +2134,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  	return !memcmp(value, i->value, i->value_len);
>  }
>  
> +/*
> + * Read and validate the external xattr block of @inode.
> + *
> + * Returns NULL when the inode has no xattr block (i_file_acl is 0), an
> + * ERR_PTR() when the block cannot be read (-EIO) or fails
> + * ext4_xattr_check_block(), and otherwise a buffer_head that the caller must
> + * release with brelse().
> + */
> +struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +	struct buffer_head *bh;
> +	int error;
> +
> +	if (!EXT4_I(inode)->i_file_acl)
> +		return NULL;
> +	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +	if (!bh)
> +		return ERR_PTR(-EIO);
> +	error = ext4_xattr_check_block(inode, bh);
> +	if (error) {
> +		/* Do not leak the buffer reference taken by sb_bread(). */
> +		brelse(bh);
> +		return ERR_PTR(error);
> +	}
> +	return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1602,9 +2192,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  
>  	/* Check journal credits under write lock. */
>  	if (ext4_handle_valid(handle)) {
> +		struct buffer_head *bh;
>  		int credits;
>  
> -		credits = ext4_xattr_set_credits(inode, value_len);
> +		bh = ext4_xattr_get_block(inode);
> +		if (IS_ERR(bh)) {
> +			error = PTR_ERR(bh);
> +			goto cleanup;
> +		}
> +
> +		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +
>  		if (!ext4_handle_has_enough_credits(handle, credits)) {
>  			error = -ENOSPC;
>  			goto cleanup;
> @@ -1640,6 +2239,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (flags & XATTR_CREATE)
>  			goto cleanup;
>  	}
> +
>  	if (!value) {
>  		if (!is.s.not_found)
>  			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1708,34 +2308,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	return error;
>  }
>  
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -	struct super_block *sb = inode->i_sb;
> -	int credits;
> -
> -	if (!EXT4_SB(sb)->s_journal)
> -		return 0;
> -
> -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +	/*
> +	 * Compute into *credits the journal credits needed to set an xattr of
> +	 * @value_len bytes on @inode. Returns 0 on success (with *credits left
> +	 * at 0 when the filesystem has no journal) or a negative errno if the
> +	 * inode's xattr block cannot be read or validated.
> +	 */
> +	struct buffer_head *bh;
> +	int err;
>  
> -	/*
> -	 * In case of inline data, we may push out the data to a block,
> -	 * so we need to reserve credits for this eventuality
> -	 */
> -	if (ext4_has_inline_data(inode))
> -	        credits += ext4_writepage_trans_blocks(inode) + 1;
> +	*credits = 0;
>  
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -					sb->s_blocksize_bits;
> +	if (!EXT4_SB(inode->i_sb)->s_journal)
> +		return 0;
>  
> -		/* For new inode */
> -		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +	/* The xattr block is read and walked under xattr_sem held for read. */
> +	down_read(&EXT4_I(inode)->xattr_sem);
>  
> -		/* For data blocks of EA inode */
> -		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	bh = ext4_xattr_get_block(inode);
> +	if (IS_ERR(bh)) {
> +		err = PTR_ERR(bh);
> +	} else {
> +		/* bh may be NULL here, meaning the inode has no xattr block. */
> +		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +		err = 0;
>  	}
> -	return credits;
> +
> +	up_read(&EXT4_I(inode)->xattr_sem);
> +	return err;
>  }
>  
>  /*
> @@ -1760,7 +2355,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  		return error;
>  
>  retry:
> -	credits = ext4_xattr_set_credits(inode, value_len);
> +	error = ext4_xattr_set_credits(inode, value_len, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2066,10 +2664,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  	return error;
>  }
>  
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2114,21 +2712,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array,
> -			int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
>  {
>  	struct buffer_head *bh = NULL;
>  	struct ext4_xattr_ibody_header *header;
> -	struct ext4_inode *raw_inode;
>  	struct ext4_iloc iloc = { .bh = NULL };
> +	struct ext4_xattr_entry *entry;
>  	int error;
>  
>  	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2140,66 +2736,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  		goto cleanup;
>  	}
>  
> -	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -		goto delete_external_ea;
> -
> -	error = ext4_get_inode_loc(inode, &iloc);
> -	if (error)
> -		goto cleanup;
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>  
> -	error = ext4_journal_get_write_access(handle, iloc.bh);
> -	if (error)
> -		goto cleanup;
> +		error = ext4_get_inode_loc(inode, &iloc);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +			goto cleanup;
> +		}
>  
> -	raw_inode = ext4_raw_inode(&iloc);
> -	header = IHDR(inode, raw_inode);
> -	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -				    false /* block_csum */, ea_inode_array,
> -				    extra_credits);
> +		error = ext4_journal_get_write_access(handle, iloc.bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "write access (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  
> -delete_external_ea:
> -	if (!EXT4_I(inode)->i_file_acl) {
> -		error = 0;
> -		goto cleanup;
> -	}
> -	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -	if (!bh) {
> -		EXT4_ERROR_INODE(inode, "block %llu read error",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EIO;
> -		goto cleanup;
> -	}
> -	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -		EXT4_ERROR_INODE(inode, "bad block %llu",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EFSCORRUPTED;
> -		goto cleanup;
> +		header = IHDR(inode, ext4_raw_inode(&iloc));
> +		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +						     IFIRST(header),
> +						     false /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     false /* skip_quota */);
>  	}
>  
> -	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -		error = ext4_journal_get_write_access(handle, bh);
> -		if (error) {
> -			EXT4_ERROR_INODE(inode, "write access %llu",
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			EXT4_ERROR_INODE(inode, "block %llu read error",
>  					 EXT4_I(inode)->i_file_acl);
> +			error = -EIO;
> +			goto cleanup;
> +		}
> +		error = ext4_xattr_check_block(inode, bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +					 EXT4_I(inode)->i_file_acl, error);
>  			goto cleanup;
>  		}
> -		ext4_xattr_inode_remove_all(handle, inode, bh,
> -					    BFIRST(bh),
> -					    true /* block_csum */,
> -					    ea_inode_array,
> -					    extra_credits);
> -	}
>  
> -	ext4_xattr_release_block(handle, inode, bh);
> -	/* Update i_file_acl within the same transaction that releases block. */
> -	EXT4_I(inode)->i_file_acl = 0;
> -	error = ext4_mark_inode_dirty(handle, inode);
> -	if (error) {
> -		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -				 error);
> -		goto cleanup;
> +		if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +			     entry = EXT4_XATTR_NEXT(entry))
> +				if (entry->e_value_inum)
> +					ext4_xattr_inode_free_quota(inode,
> +					      le32_to_cpu(entry->e_value_size));
> +
> +		}
> +
> +		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +					 extra_credits);
> +		/*
> +		 * Update i_file_acl value in the same transaction that releases
> +		 * block.
> +		 */
> +		EXT4_I(inode)->i_file_acl = 0;
> +		error = ext4_mark_inode_dirty(handle, inode);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  	}
> +	error = 0;
>  cleanup:
>  	brelse(iloc.bh);
>  	brelse(bh);
> @@ -2208,17 +2809,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -	struct inode	*ea_inode;
> -	int		idx = 0;
> +	int idx;
>  
>  	if (ea_inode_array == NULL)
>  		return;
>  
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		ea_inode = ea_inode_array->inodes[idx];
> -		clear_nlink(ea_inode);
> -		iput(ea_inode);
> -	}
> +	for (idx = 0; idx < ea_inode_array->count; ++idx)
> +		iput(ea_inode_array->inodes[idx]);
>  	kfree(ea_inode_array);
>  }
>  
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -70,19 +70,6 @@ struct ext4_xattr_entry {
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
>  /*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
> -/*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
>  */
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +				  int *credits);
>  
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  				   struct ext4_xattr_inode_array **array,
>  				   int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 77a5b99d8f92..7dfdca822ccb 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use hash of data as a key and provide a value that may represent a
> + * block or inode number. That's why keys need not be unique (hash of different
> + * data may be the same). However user provided value always uniquely
> + * identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 27/28] ext4: xattr inode deduplication
@ 2017-05-31 16:00     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:00 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 01:15:16AM -0700, Tahsin Erdogan wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes, each one holding a
> single value. Once written, the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a best
> effort sharing scheme. When a xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |    7 +-
>  fs/ext4/inode.c |    9 +-
>  fs/ext4/super.c |   22 +-
>  fs/ext4/xattr.c | 1073 +++++++++++++++++++++++++++++++++++++++++++------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 881 insertions(+), 261 deletions(-)
> 
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  	if (error)
>  		return error;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, acl_size);
> +	error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..79f06290e723 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>  	long s_es_nr_inode;
>  	struct ext4_es_stats s_es_stats;
>  	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;
>  	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>  
>  	/* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>  	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>  
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +	return IS_NOQUOTA(inode) &&
> +	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>  
>  /*
>   * This structure is stuffed into the struct file's private_data field
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 4d6936f0d8a4..6f5872197d6c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	}
>  	brelse(iloc.bh);
>  	ext4_set_inode_flags(inode);
> -	if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +	if (ei->i_flags & EXT4_EA_INODE_FL) {
>  		ext4_xattr_inode_set_class(inode);
> +
> +		inode_lock(inode);
> +		inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(inode);
> +	}
> +
>  	unlock_new_inode(inode);
>  	return inode;
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..7d2b692d52ea 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>  		invalidate_bdev(sbi->journal_bdev);
>  		ext4_blkdev_remove(sbi);
>  	}
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>  	if (res)
>  		return res;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, len);
> +	res = ext4_xattr_set_credits(inode, len, &credits);
> +	if (res)
> +		return res;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> @@ -4067,6 +4074,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount_wq;
>  	}
>  
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}
> +
>  	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>  	    (blocksize != PAGE_SIZE)) {
>  		ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4312,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (EXT4_SB(sb)->rsv_conversion_wq)
>  		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 6acce1f689ab..caddc176a612 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -55,6 +55,7 @@
>  #include <linux/slab.h>
>  #include <linux/mbcache.h>
>  #include <linux/quotaops.h>
> +#include <linux/crc32c.h>
>  #include "ext4_jbd2.h"
>  #include "ext4.h"
>  #include "xattr.h"
> @@ -79,6 +80,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
>  			    struct mb_cache_entry **);
>  static void ext4_xattr_rehash(struct ext4_xattr_header *,
>  			      struct ext4_xattr_entry *);
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
>  
>  static const struct xattr_handler * const ext4_xattr_handler_map[] = {
>  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
> @@ -105,13 +107,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  	NULL
>  };
>  
> +#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
> +
>  #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
>  				inode->i_sb->s_fs_info)->s_mb_cache)
>  
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>  			struct inode *inode);
>  
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash);
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -329,14 +341,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  		goto error;
>  	}
>  
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}
> -
>  	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>  		ext4_error(parent->i_sb, "EA inode %lu does not have "
>  			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> @@ -351,6 +355,11 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>  
> +static u32 ext4_xattr_inode_hash(const void *buffer, size_t size)
> +{
> +	return crc32c(0, buffer, size);

The metadata checksumming code uses crypto_alloc_shash to dynamically
bind to the crc32c implementation only when ext4 actually needs it.
Calling crc32c() directly here introduces a static module dependency on
libcrc32c even though we only need it if we're deduplicating xattrs,
which is done only when EXT4_FEATURE_INCOMPAT_EA_INODE is enabled, correct?

Or, looking through the code, maybe not; are we now capable of deduping
for any filesystem?

Anyway, if this dedupe feature is hidden behind INCOMPAT_EA_INODE then
this crc32c call binding should be done dynamically; however, if it
works for any filesystem and is therefore on all the time, the existing
users ought to be converted to use libcrc32c.

--D

> +}
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +367,52 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>  		     size_t size)
>  {
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>  	struct inode *ea_inode;
> -	int ret;
> +	u32 hash, calc_hash;
> +	struct mb_cache_entry *ce;
> +	int err;
>  
> -	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (ret)
> -		return ret;
> +	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +	if (err) {
> +		ea_inode = NULL;
> +		goto out;
> +	}
>  
> -	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -	iput(ea_inode);
> +	if (i_size_read(ea_inode) != size) {
> +		ext4_warning_inode(ea_inode,
> +				   "ea_inode file size=%llu entry size=%zu",
> +				   i_size_read(ea_inode), size);
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
>  
> -	return ret;
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	if (!err) {
> +		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
> +			goto out;
> +
> +		/* Avoid hash calculation if already cached. */
> +		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
> +		if (ce) {
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			goto out;
> +		}
> +
> +		calc_hash = ext4_xattr_inode_hash(buffer, size);
> +		if (hash != calc_hash) {
> +			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
> +					   "does not match calc_hash=%#x",
> +					   hash, calc_hash);
> +			goto out;
> +		}
> +
> +		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +				      ea_inode->i_ino, true /* reusable */);
> +	}
> +out:
> +	iput(ea_inode);
> +	return err;
>  }
>  
>  static int
> @@ -657,6 +701,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +				    inode->i_blkbits);
> +	size_t mask = ~(cluster_size - 1);
> +
> +	return (length + cluster_size - 1) & mask;
> +}
> +
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +	int err;
> +
> +	err = dquot_alloc_inode(inode);
> +	if (err)
> +		return err;
> +	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
> +	if (err)
> +		dquot_free_inode(inode);
> +	return err;
> +}
> +
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
> +	dquot_free_inode(inode);
> +}
> +
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	int credits;
> +	int blocks;
> +
> +	/*
> +	 * 1) Owner inode update
> +	 * 2) Ref count update on old xattr block
> +	 * 3) new xattr block
> +	 * 4) block bitmap update for new xattr block
> +	 * 5) group descriptor for new xattr block
> +	 */
> +	credits = 5;
> +
> +	/* We are done if ea_inode feature is not enabled. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +
> +	/* Data blocks. */
> +	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +	/* Indirection block. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Blocks themselves. */
> +	credits += blocks;
> +
> +	/* Dereference ea_inode holding old xattr value.
> +	 * Old ea_inode, inode map, block bitmap, group descriptor.
> +	 */
> +	credits += 4;
> +
> +	/* Data blocks for old ea_inode. */
> +	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +	/* Indirection block for old ea_inode. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	/* We may need to clone the existing xattr block in which case we need
> +	 * to increment ref counts for existing ea_inodes referenced by it.
> +	 */
> +	if (block_bh) {
> +		struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				/* Ref count update on ea_inode. */
> +				credits += 1;
> +	}
> +	return credits;
> +}
> +
>  int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  			      int credits, struct buffer_head *bh,
>  			      bool dirty, bool block_csum)
> @@ -706,12 +845,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +				       int ref_change)
> +{
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +	struct ext4_iloc iloc;
> +	s64 ref_return;
> +	u32 hash;
> +	int ret;
> +
> +	inode_lock(ea_inode);
> +
> +	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +	if (ret) {
> +		iloc.bh = NULL;
> +		goto out;
> +	}
> +
> +	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
> +					&hash);
> +	if (ret)
> +		goto out;
> +
> +	if (ref_change > 0) {
> +		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 1) {
> +			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			set_nlink(ea_inode, 1);
> +			ext4_orphan_del(handle, ea_inode);
> +
> +			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +					      ea_inode->i_ino,
> +					      true /* reusable */);
> +		}
> +	} else {
> +		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 0) {
> +			WARN_ONCE(ea_inode->i_nlink != 1,
> +				  "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			clear_nlink(ea_inode);
> +			ext4_orphan_add(handle, ea_inode);
> +
> +			mb_cache_entry_delete(ea_inode_cache, hash,
> +					      ea_inode->i_ino);
> +		}
> +	}
> +
> +	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +	iloc.bh = NULL;
> +	if (ret)
> +		ext4_warning_inode(ea_inode,
> +				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +	brelse(iloc.bh);
> +	inode_unlock(ea_inode);
> +	return ret;
> +}
> +
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
> +}
> +
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
> +}
> +
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +					struct ext4_xattr_entry *first)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	struct ext4_xattr_entry *failed_entry;
> +	unsigned int ea_ino;
> +	int err, saved_err;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			goto cleanup;
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "inc ref error %d", err);
> +			iput(ea_inode);
> +			goto cleanup;
> +		}
> +		iput(ea_inode);
> +	}
> +	return 0;
> +
> +cleanup:
> +	saved_err = err;
> +	failed_entry = entry;
> +
> +	for (entry = first; entry != failed_entry;
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err) {
> +			ext4_warning(parent->i_sb,
> +				     "cleanup ea_ino %u iget error %d", ea_ino,
> +				     err);
> +			continue;
> +		}
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err)
> +			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +					   err);
> +		iput(ea_inode);
> +	}
> +	return saved_err;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -			    struct buffer_head *bh,
> -			    struct ext4_xattr_entry *first, bool block_csum,
> -			    struct ext4_xattr_inode_array **ea_inode_array,
> -			    int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +			     struct buffer_head *bh,
> +			     struct ext4_xattr_entry *first, bool block_csum,
> +			     struct ext4_xattr_inode_array **ea_inode_array,
> +			     int extra_credits, bool skip_quota)
>  {
>  	struct inode *ea_inode;
>  	struct ext4_xattr_entry *entry;
> @@ -748,10 +1014,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>  			continue;
>  		}
>  
> -		inode_lock(ea_inode);
> -		clear_nlink(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		if (!skip_quota)
> +			ext4_xattr_inode_free_quota(parent,
> +					      le32_to_cpu(entry->e_value_size));
>  
>  		/*
>  		 * Forget about ea_inode within the same transaction that decrements the ref
> @@ -784,7 +1056,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -			 struct buffer_head *bh)
> +			 struct buffer_head *bh,
> +			 struct ext4_xattr_inode_array **ea_inode_array,
> +			 int extra_credits)
>  {
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>  	u32 hash, ref;
> @@ -807,6 +1081,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
> +
> +		if (ext4_has_feature_ea_inode(inode->i_sb))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +						     BFIRST(bh),
> +						     true /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     true /* skip_quota */);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
>  				 EXT4_FREE_BLOCKS_METADATA |
>  				 EXT4_FREE_BLOCKS_FORGET);
> @@ -947,7 +1229,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -					     struct inode *inode)
> +					     struct inode *inode, u32 hash)
>  {
>  	struct inode *ea_inode = NULL;
>  	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -965,67 +1247,118 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>  		ea_inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(ea_inode);
>  		ext4_xattr_inode_set_class(ea_inode);
> -		ea_inode->i_generation = inode->i_generation;
> -		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -		/*
> -		 * A back-pointer from EA inode to parent inode will be useful
> -		 * for e2fsck.
> -		 */
> -		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>  		unlock_new_inode(ea_inode);
>  		err = ext4_inode_attach_jinode(ea_inode);
> +		if (!err)
> +			err = ext4_xattr_inode_init(handle, ea_inode, hash);
>  		if (err) {
>  			iput(ea_inode);
>  			return ERR_PTR(err);
>  		}
> +
> +		/*
> +		 * Xattr inodes are shared therefore quota charging is performed
> +		 * at a higher level.
> +		 */
> +		dquot_free_inode(ea_inode);
> +		dquot_drop(ea_inode);
> +		inode_lock(ea_inode);
> +		ea_inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(ea_inode);
>  	}
>  
>  	return ea_inode;
>  }
>  
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
>  {
> -	struct inode *ea_inode = NULL;
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
>  	int err;
>  
> -	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (err)
> -		return err;
> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			ea_inode = NULL;
> +			goto next;
> +		}
>  
> -	clear_nlink(ea_inode);
> -	iput(ea_inode);
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
>  
> -	return 0;
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +	next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
>  }
>  
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
>  {
> +	u32 hash = ext4_xattr_inode_hash(value, value_len);
>  	struct inode *ea_inode;
>  	int err;
>  
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
>  	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>  	if (IS_ERR(ea_inode))
>  		return PTR_ERR(ea_inode);
>  
>  	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
>  
> -	iput(ea_inode);
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);
>  
> -	return err;
> +	*ret_inode = ea_inode;
> +	return 0;
>  }
>  
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1033,11 +1366,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
> +	struct ext4_xattr_entry *here = s->here;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>  	int in_inode = i->in_inode;
> -	int rc;
> +	struct inode *old_ea_inode = NULL;
> +	struct inode *new_ea_inode = NULL;
> +	int ret;
>  
> -	/* Compute min_offs and last. */
> +	/*
> +	 * Optimization for the simple case when old and new values have the
> +	 * same padded sizes. Not applicable if the existing value is stored in
> +	 * an external inode.
> +	 */
> +	if (i->value && !s->not_found && !here->e_value_inum &&
> +	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
> +	    EXT4_XATTR_SIZE(i->value_len)) {
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(i->value_len);
> +
> +		here->e_value_size = cpu_to_le32(i->value_len);
> +		if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +			memset(val, 0, size);
> +		} else {
> +			memcpy(val, i->value, i->value_len);
> +			/* Clear padding bytes. */
> +			memset(val + i->value_len, 0, size - i->value_len);
> +		}
> +		return 0;
> +	}
> +
> +	/* Find out min_offs and last to calculate the free space. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
>  		if (!last->e_value_inum && last->e_value_size) {
> @@ -1048,120 +1407,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (!in_inode &&
> -		    !s->here->e_value_inum && s->here->e_value_size) {
> -			size_t size = le32_to_cpu(s->here->e_value_size);
> +		if (!here->e_value_inum && here->e_value_size) {
> +			size_t size = le32_to_cpu(here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
>  
> -		if (in_inode)
> -			value_len = 0;
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +	}
>  
> -		if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -			return -ENOSPC;
> +	/*
> +	 * Getting access to old and new ea inodes is subject to failures.
> +	 * Finish that work before doing any modifications to the xattr data.
> +	 */
> +	if (!s->not_found && here->e_value_inum) {
> +		ret = ext4_xattr_inode_iget(inode,
> +		 			    le32_to_cpu(here->e_value_inum),
> +		 			    &old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
>  	}
> +	if (i->value && in_inode) {
> +		WARN_ON_ONCE(!i->value_len);
>  
> -	if (i->value && s->not_found) {
> -		/* Insert the new name. */
> -		size_t size = EXT4_XATTR_LEN(name_len);
> -		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -		memmove((void *)s->here + size, s->here, rest);
> -		memset(s->here, 0, size);
> -		s->here->e_name_index = i->name_index;
> -		s->here->e_name_len = name_len;
> -		memcpy(s->here->e_name, i->name, name_len);
> -	} else {
> -		if (!s->here->e_value_inum && s->here->e_value_size &&
> -		    s->here->e_value_offs > 0) {
> -			void *first_val = s->base + min_offs;
> -			size_t offs = le16_to_cpu(s->here->e_value_offs);
> -			void *val = s->base + offs;
> -			size_t size = EXT4_XATTR_SIZE(
> -				le32_to_cpu(s->here->e_value_size));
> -
> -			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -				/* The old and the new value have the same
> -				   size. Just replace. */
> -				s->here->e_value_size =
> -					cpu_to_le32(i->value_len);
> -				if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -					memset(val, 0, size);
> -				} else {
> -					/* Clear pad bytes first. */
> -					memset(val + size - EXT4_XATTR_PAD, 0,
> -					       EXT4_XATTR_PAD);
> -					memcpy(val, i->value, i->value_len);
> -				}
> -				return 0;
> -			}
> +		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +		if (ret)
> +			goto out;
>  
> -			/* Remove the old value. */
> -			memmove(first_val + size, first_val, val - first_val);
> -			memset(first_val, 0, size);
> -			s->here->e_value_size = 0;
> -			s->here->e_value_offs = 0;
> -			min_offs += size;
> -
> -			/* Adjust all value offsets. */
> -			last = s->first;
> -			while (!IS_LAST_ENTRY(last)) {
> -				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (!last->e_value_inum &&
> -				    last->e_value_size && o < offs)
> -					last->e_value_offs =
> -						cpu_to_le16(o + size);
> -				last = EXT4_XATTR_NEXT(last);
> -			}
> +		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +						     i->value_len,
> +						     &new_ea_inode);
> +		if (ret) {
> +			new_ea_inode = NULL;
> +			ext4_xattr_inode_free_quota(inode, i->value_len);
> +			goto out;
>  		}
> -		if (s->here->e_value_inum) {
> -			ext4_xattr_inode_unlink(inode,
> -					    le32_to_cpu(s->here->e_value_inum));
> -			s->here->e_value_inum = 0;
> +	}
> +
> +	if (old_ea_inode) {
> +		/* We are ready to release ref count on the old_ea_inode. */
> +		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +		if (ret) {
> +			/* Release newly required ref count on new_ea_inode. */
> +			if (new_ea_inode) {
> +				int err;
> +
> +				err = ext4_xattr_inode_dec_ref(handle,
> +							       new_ea_inode);
> +				if (err)
> +					ext4_warning_inode(new_ea_inode,
> +						  "dec ref new_ea_inode err=%d",
> +						  err);
> +				ext4_xattr_inode_free_quota(inode,
> +							    i->value_len);
> +			}
> +			goto out;
>  		}
> -		if (!i->value) {
> -			/* Remove the old name. */
> -			size_t size = EXT4_XATTR_LEN(name_len);
> -			last = ENTRY((void *)last - size);
> -			memmove(s->here, (void *)s->here + size,
> -				(void *)last - (void *)s->here + sizeof(__u32));
> -			memset(last, 0, size);
> +
> +		ext4_xattr_inode_free_quota(inode,
> +					    le32_to_cpu(here->e_value_size));
> +	}
> +
> +	/* No failures allowed past this point. */
> +
> +	if (!s->not_found && here->e_value_offs) {
> +		/* Remove the old value. */
> +		void *first_val = s->base + min_offs;
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(
> +			le32_to_cpu(here->e_value_size));
> +
> +		memmove(first_val + size, first_val, val - first_val);
> +		memset(first_val, 0, size);
> +		min_offs += size;
> +
> +		/* Adjust all value offsets. */
> +		last = s->first;
> +		while (!IS_LAST_ENTRY(last)) {
> +			size_t o = le16_to_cpu(last->e_value_offs);
> +			if (!last->e_value_inum &&
> +			    last->e_value_size && o < offs)
> +				last->e_value_offs =
> +					cpu_to_le16(o + size);
> +			last = EXT4_XATTR_NEXT(last);
>  		}
>  	}
>  
> +	if (!s->not_found && !i->value) {
> +		/* Remove old name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		last = ENTRY((void *)last - size);
> +		memmove(here, (void *)here + size,
> +			(void *)last - (void *)here + sizeof(__u32));
> +		memset(last, 0, size);
> +	} else if (s->not_found && i->value) {
> +		/* Insert new name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +		memmove((void *)here + size, here, rest);
> +		memset(here, 0, size);
> +		here->e_name_index = i->name_index;
> +		here->e_name_len = name_len;
> +		memcpy(here->e_name, i->name, name_len);
> +	} else {
> +		WARN_ON_ONCE(s->not_found || !i->value);
> +		/* This is an update, reset value info. */
> +		here->e_value_inum = 0;
> +		here->e_value_offs = 0;
> +		here->e_value_size = 0;
> +	}
> +
>  	if (i->value) {
> -		/* Insert the new value. */
> +		/* Insert new value. */
>  		if (in_inode) {
> -			unsigned long ea_ino =
> -				le32_to_cpu(s->here->e_value_inum);
> -			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -						  i->value, i->value_len);
> -			if (rc)
> -				goto out;
> -			s->here->e_value_inum = cpu_to_le32(ea_ino);
> -			s->here->e_value_offs = 0;
> +			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>  		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
> -			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -			s->here->e_value_inum = 0;
> +			here->e_value_offs = cpu_to_le16(min_offs - size);
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> -				/* Clear the pad bytes first. */
> -				memset(val + size - EXT4_XATTR_PAD, 0,
> -				       EXT4_XATTR_PAD);
>  				memcpy(val, i->value, i->value_len);
> +				/* Clear padding bytes. */
> +				memset(val + i->value_len, 0,
> +				       size - i->value_len);
>  			}
>  		}
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> +		here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -
> +	ret = 0;
>  out:
> -	return rc;
> +	iput(old_ea_inode);
> +	iput(new_ea_inode);
> +	return ret;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -1223,6 +1611,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  	struct mb_cache_entry *ce = NULL;
>  	int error = 0;
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +	struct inode *ea_inode = NULL;
> +	size_t old_ea_inode_size = 0;
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> @@ -1277,6 +1667,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			header(s->base)->h_refcount = cpu_to_le32(1);
>  			s->here = ENTRY(s->base + offset);
>  			s->end = s->base + bs->bh->b_size;
> +
> +			/*
> +			 * If existing entry points to an xattr inode, we need
> +			 * to prevent ext4_xattr_set_entry() from decrementing
> +			 * ref count on it because the reference belongs to the
> +			 * original block. In this case, make the entry look
> +			 * like it has an empty value.
> +			 */
> +			if (!s->not_found && s->here->e_value_inum) {
> +				/*
> +				 * Defer quota free call for previous inode
> +				 * until success is guaranteed.
> +				 */
> +				old_ea_inode_size = le32_to_cpu(
> +							s->here->e_value_size);
> +				s->here->e_value_inum = 0;
> +				s->here->e_value_size = 0;
> +			}
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> @@ -1298,6 +1706,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		goto bad_block;
>  	if (error)
>  		goto cleanup;
> +
> +	if (i->value && s->here->e_value_inum) {
> +		unsigned int ea_ino;
> +
> +		/*
> +		 * A ref count on ea_inode has been taken as part of the call to
> +		 * ext4_xattr_set_entry() above. We would like to drop this
> +		 * extra ref but we have to wait until the xattr block is
> +		 * initialized and has its own ref count on the ea_inode.
> +		 */
> +		ea_ino = le32_to_cpu(s->here->e_value_inum);
> +		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +		if (error) {
> +			ea_inode = NULL;
> +			goto cleanup;
> +		}
> +	}
> +
>  	if (!IS_LAST_ENTRY(s->first))
>  		ext4_xattr_rehash(header(s->base), s->here);
>  
> @@ -1408,6 +1834,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  						 EXT4_FREE_BLOCKS_METADATA);
>  				goto cleanup;
>  			}
> +			error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +						      ENTRY(header(s->base)+1));
> +			if (error)
> +				goto getblk_failed;
> +			if (ea_inode) {
> +				/* Drop the extra ref on ea_inode. */
> +				error = ext4_xattr_inode_dec_ref(handle,
> +								 ea_inode);
> +				if (error)
> +					ext4_warning_inode(ea_inode,
> +							   "dec ref error=%d",
> +							   error);
> +				iput(ea_inode);
> +				ea_inode = NULL;
> +			}
> +
>  			lock_buffer(new_bh);
>  			error = ext4_journal_get_create_access(handle, new_bh);
>  			if (error) {
> @@ -1427,15 +1869,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		}
>  	}
>  
> +	if (old_ea_inode_size)
> +		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>  	/* Update the inode. */
>  	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>  
>  	/* Drop the previous xattr block. */
> -	if (bs->bh && bs->bh != new_bh)
> -		ext4_xattr_release_block(handle, inode, bs->bh);
> +	if (bs->bh && bs->bh != new_bh) {
> +		struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +		ext4_xattr_release_block(handle, inode, bs->bh,
> +					 &ea_inode_array,
> +					 0 /* extra_credits */);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +	}
>  	error = 0;
>  
>  cleanup:
> +	if (ea_inode) {
> +		int error2;
> +		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (error2)
> +			ext4_warning_inode(ea_inode, "dec ref error=%d",
> +					   error2);
> +
> +		/* If there was an error, revert the quota charge. */
> +		if (error)
> +			ext4_xattr_inode_free_quota(inode,
> +						    i_size_read(ea_inode));
> +		iput(ea_inode);
> +	}
>  	if (ce)
>  		mb_cache_entry_put(ext4_mb_cache, ce);
>  	brelse(new_bh);
> @@ -1546,6 +2009,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +struct ext4_xattr_ea_info {
> +	__le64 ref_count;	/* number of xattr entry references */
> +	__le32 hash;		/* crc32c hash of xattr data */
> +	__le32 reserved;	/* reserved, must be 0 */
> +};
> +
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash)
> +{
> +	struct ext4_xattr_ea_info ea_info = {
> +		.ref_count = cpu_to_le64(1),
> +		.hash = cpu_to_le32(hash),
> +		.reserved = 0,
> +	};
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
> +}
> +
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash)
> +{
> +	struct ext4_xattr_ea_info ea_info;
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	memcpy(&ea_info,
> +	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
> +	       sizeof(ea_info));
> +
> +	if (hash)
> +		*hash = le32_to_cpu(ea_info.hash);
> +
> +	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
> +	ea_info.ref_count = cpu_to_le64(*ref_return);
> +
> +	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
> +}
> +
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
> +{
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	struct ext4_xattr_ea_info *ea_info;
> +	void *ptr;
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
> +	ea_info = (struct ext4_xattr_ea_info *)ptr;
> +
> +	if (WARN_ON(ea_info->reserved != 0))
> +		return -EFSCORRUPTED;
> +
> +	*hash = le32_to_cpu(ea_info->hash);
> +	return 0;
> +}
> +
>  static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  				 struct ext4_xattr_info *i)
>  {
> @@ -1560,6 +2134,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  	return !memcmp(value, i->value, i->value_len);
>  }
>  
> +struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +	struct buffer_head *bh;
> +	int error;
> +
> +	if (!EXT4_I(inode)->i_file_acl)
> +		return NULL;
> +	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +	if (!bh)
> +		return ERR_PTR(-EIO);
> +	error = ext4_xattr_check_block(inode, bh);
> +	if (error)
> +		return ERR_PTR(error);
> +	return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1602,9 +2192,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  
>  	/* Check journal credits under write lock. */
>  	if (ext4_handle_valid(handle)) {
> +		struct buffer_head *bh;
>  		int credits;
>  
> -		credits = ext4_xattr_set_credits(inode, value_len);
> +		bh = ext4_xattr_get_block(inode);
> +		if (IS_ERR(bh)) {
> +			error = PTR_ERR(bh);
> +			goto cleanup;
> +		}
> +
> +		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +
>  		if (!ext4_handle_has_enough_credits(handle, credits)) {
>  			error = -ENOSPC;
>  			goto cleanup;
> @@ -1640,6 +2239,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (flags & XATTR_CREATE)
>  			goto cleanup;
>  	}
> +
>  	if (!value) {
>  		if (!is.s.not_found)
>  			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1708,34 +2308,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	return error;
>  }
>  
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -	struct super_block *sb = inode->i_sb;
> -	int credits;
> -
> -	if (!EXT4_SB(sb)->s_journal)
> -		return 0;
> -
> -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +	struct buffer_head *bh;
> +	int err;
>  
> -	/*
> -	 * In case of inline data, we may push out the data to a block,
> -	 * so we need to reserve credits for this eventuality
> -	 */
> -	if (ext4_has_inline_data(inode))
> -	        credits += ext4_writepage_trans_blocks(inode) + 1;
> +	*credits = 0;
>  
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -					sb->s_blocksize_bits;
> +	if (!EXT4_SB(inode->i_sb)->s_journal)
> +		return 0;
>  
> -		/* For new inode */
> -		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +	down_read(&EXT4_I(inode)->xattr_sem);
>  
> -		/* For data blocks of EA inode */
> -		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	bh = ext4_xattr_get_block(inode);
> +	if (IS_ERR(bh)) {
> +		err = PTR_ERR(bh);
> +	} else {
> +		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +		err = 0;
>  	}
> -	return credits;
> +
> +	up_read(&EXT4_I(inode)->xattr_sem);
> +	return err;
>  }
>  
>  /*
> @@ -1760,7 +2355,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  		return error;
>  
>  retry:
> -	credits = ext4_xattr_set_credits(inode, value_len);
> +	error = ext4_xattr_set_credits(inode, value_len, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2066,10 +2664,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  	return error;
>  }
>  
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2114,21 +2712,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array,
> -			int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
>  {
>  	struct buffer_head *bh = NULL;
>  	struct ext4_xattr_ibody_header *header;
> -	struct ext4_inode *raw_inode;
>  	struct ext4_iloc iloc = { .bh = NULL };
> +	struct ext4_xattr_entry *entry;
>  	int error;
>  
>  	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2140,66 +2736,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  		goto cleanup;
>  	}
>  
> -	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -		goto delete_external_ea;
> -
> -	error = ext4_get_inode_loc(inode, &iloc);
> -	if (error)
> -		goto cleanup;
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>  
> -	error = ext4_journal_get_write_access(handle, iloc.bh);
> -	if (error)
> -		goto cleanup;
> +		error = ext4_get_inode_loc(inode, &iloc);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +			goto cleanup;
> +		}
>  
> -	raw_inode = ext4_raw_inode(&iloc);
> -	header = IHDR(inode, raw_inode);
> -	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -				    false /* block_csum */, ea_inode_array,
> -				    extra_credits);
> +		error = ext4_journal_get_write_access(handle, iloc.bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "write access (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  
> -delete_external_ea:
> -	if (!EXT4_I(inode)->i_file_acl) {
> -		error = 0;
> -		goto cleanup;
> -	}
> -	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -	if (!bh) {
> -		EXT4_ERROR_INODE(inode, "block %llu read error",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EIO;
> -		goto cleanup;
> -	}
> -	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -		EXT4_ERROR_INODE(inode, "bad block %llu",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EFSCORRUPTED;
> -		goto cleanup;
> +		header = IHDR(inode, ext4_raw_inode(&iloc));
> +		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +						     IFIRST(header),
> +						     false /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     false /* skip_quota */);
>  	}
>  
> -	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -		error = ext4_journal_get_write_access(handle, bh);
> -		if (error) {
> -			EXT4_ERROR_INODE(inode, "write access %llu",
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			EXT4_ERROR_INODE(inode, "block %llu read error",
>  					 EXT4_I(inode)->i_file_acl);
> +			error = -EIO;
> +			goto cleanup;
> +		}
> +		error = ext4_xattr_check_block(inode, bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +					 EXT4_I(inode)->i_file_acl, error);
>  			goto cleanup;
>  		}
> -		ext4_xattr_inode_remove_all(handle, inode, bh,
> -					    BFIRST(bh),
> -					    true /* block_csum */,
> -					    ea_inode_array,
> -					    extra_credits);
> -	}
>  
> -	ext4_xattr_release_block(handle, inode, bh);
> -	/* Update i_file_acl within the same transaction that releases block. */
> -	EXT4_I(inode)->i_file_acl = 0;
> -	error = ext4_mark_inode_dirty(handle, inode);
> -	if (error) {
> -		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -				 error);
> -		goto cleanup;
> +		if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +			     entry = EXT4_XATTR_NEXT(entry))
> +				if (entry->e_value_inum)
> +					ext4_xattr_inode_free_quota(inode,
> +					      le32_to_cpu(entry->e_value_size));
> +
> +		}
> +
> +		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +					 extra_credits);
> +		/*
> +		 * Update i_file_acl value in the same transaction that releases
> +		 * block.
> +		 */
> +		EXT4_I(inode)->i_file_acl = 0;
> +		error = ext4_mark_inode_dirty(handle, inode);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  	}
> +	error = 0;
>  cleanup:
>  	brelse(iloc.bh);
>  	brelse(bh);
> @@ -2208,17 +2809,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -	struct inode	*ea_inode;
> -	int		idx = 0;
> +	int idx;
>  
>  	if (ea_inode_array == NULL)
>  		return;
>  
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		ea_inode = ea_inode_array->inodes[idx];
> -		clear_nlink(ea_inode);
> -		iput(ea_inode);
> -	}
> +	for (idx = 0; idx < ea_inode_array->count; ++idx)
> +		iput(ea_inode_array->inodes[idx]);
>  	kfree(ea_inode_array);
>  }
>  
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -70,19 +70,6 @@ struct ext4_xattr_entry {
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
>  /*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
> -/*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
>  */
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +				  int *credits);
>  
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  				   struct ext4_xattr_inode_array **array,
>  				   int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 77a5b99d8f92..7dfdca822ccb 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use hash of data as a key and provide a value that may represent a
> + * block or inode number. That's why keys need not be unique (hash of different
> + * data may be the same). However user provided value always uniquely
> + * identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs
  2017-05-31  8:14 ` [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs Tahsin Erdogan
  2017-05-31 16:03     ` Darrick J. Wong
@ 2017-05-31 16:03     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:03 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed, May 31, 2017 at 01:14:58AM -0700, Tahsin Erdogan wrote:
> EXT4_XATTR_MAX_LARGE_EA_SIZE definition in ext4 is currently unused.
> Besides, vfs enforces its own 64k limit which makes the 1MB limit in
> ext4 redundant. Remove it.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/ext4.h | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5d5fc0d0e2bc..2cdd6070e348 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2221,12 +2221,6 @@ struct mmpd_data {
>  #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
>  
>  /*
> - * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
> - * This limit is arbitrary, but is reasonable for the xattr API.
> - */
> -#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)

Uhhh... didn't you add this in patch 1/28?  There's little point in
adding a symbol just to delete it shortly thereafter.

--D

> -
> -/*
>   * Function prototypes
>   */
>  
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs
@ 2017-05-31 16:03     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:03 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 01:14:58AM -0700, Tahsin Erdogan wrote:
> EXT4_XATTR_MAX_LARGE_EA_SIZE definition in ext4 is currently unused.
> Besides, vfs enforces its own 64k limit which makes the 1MB limit in
> ext4 redundant. Remove it.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/ext4.h | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5d5fc0d0e2bc..2cdd6070e348 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2221,12 +2221,6 @@ struct mmpd_data {
>  #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
>  
>  /*
> - * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
> - * This limit is arbitrary, but is reasonable for the xattr API.
> - */
> -#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)

Uhhh... didn't you add this in patch 1/28?  There's little point in
adding a symbol just to delete it shortly thereafter.

--D

> -
> -/*
>   * Function prototypes
>   */
>  
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs
@ 2017-05-31 16:03     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:03 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 01:14:58AM -0700, Tahsin Erdogan wrote:
> EXT4_XATTR_MAX_LARGE_EA_SIZE definition in ext4 is currently unused.
> Besides, vfs enforces its own 64k limit which makes the 1MB limit in
> ext4 redundant. Remove it.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/ext4.h | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5d5fc0d0e2bc..2cdd6070e348 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2221,12 +2221,6 @@ struct mmpd_data {
>  #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
>  
>  /*
> - * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
> - * This limit is arbitrary, but is reasonable for the xattr API.
> - */
> -#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)

Uhhh... didn't you add this in patch 1/28?  There's little point in
adding a symbol just to delete it shortly thereafter.

--D

> -
> -/*
>   * Function prototypes
>   */
>  
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks
  2017-05-31  8:14 ` [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks Tahsin Erdogan
  2017-05-31 16:12     ` Darrick J. Wong
@ 2017-05-31 16:12     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:12 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed, May 31, 2017 at 01:14:56AM -0700, Tahsin Erdogan wrote:
> ea_inode contents are treated as metadata, that's why it is journaled
> during initial writes. Failing to call revoke during freeing could cause
> user data to be overwritten with original ea_inode contents during journal
> replay.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/extents.c  | 3 ++-
>  fs/ext4/indirect.c | 3 ++-
>  2 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 3e36508610b7..e0a8425ff74d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
>  
>  static inline int get_default_free_blocks_flags(struct inode *inode)
>  {
> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
>  		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
>  	else if (ext4_should_journal_data(inode))
>  		return EXT4_FREE_BLOCKS_FORGET;
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index bc15c2c17633..7ffa290cbb8e 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>  	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
>  	int	err;
>  
> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))

I appreciate the thoroughness of doing this even for blockmapped
ea_inode files, and I'm not complaining about this hunk at all. :)

However, please consider requiring the extents feature + format as a
prerequisite for ea_inodes.  ext4 has traditionally been very ...
permissive about supporting a diverse range of feature options, but the
cost of that diversity is that the feature support matrix that the
community has to support is already untestably large.

I think it would be wise not to support !extents && ea_inode,
particularly since blockmaps aren't protected by metadata_csum and so in
the long run it's probably best to minimize the introduction of new
blockmap files (on ext4 anyway).

--D

>  		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
>  	else if (ext4_should_journal_data(inode))
>  		flags |= EXT4_FREE_BLOCKS_FORGET;
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks
@ 2017-05-31 16:12     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:12 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 01:14:56AM -0700, Tahsin Erdogan wrote:
> ea_inode contents are treated as metadata, that's why it is journaled
> during initial writes. Failing to call revoke during freeing could cause
> user data to be overwritten with original ea_inode contents during journal
> replay.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/extents.c  | 3 ++-
>  fs/ext4/indirect.c | 3 ++-
>  2 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 3e36508610b7..e0a8425ff74d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
>  
>  static inline int get_default_free_blocks_flags(struct inode *inode)
>  {
> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
>  		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
>  	else if (ext4_should_journal_data(inode))
>  		return EXT4_FREE_BLOCKS_FORGET;
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index bc15c2c17633..7ffa290cbb8e 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>  	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
>  	int	err;
>  
> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))

I appreciate the thoroughness of doing this even for blockmapped
ea_inode files, and I'm not complaining about this hunk at all. :)

However, please consider requiring the extents feature + format as a
prerequisite for ea_inodes.  ext4 has traditionally been very ...
permissive about supporting a diverse range of feature options, but the
cost of that diversity is that the feature support matrix that the
community has to support is already untestably large.

I think it would be wise not to support !extents && ea_inode,
particularly since blockmaps aren't protected by metadata_csum and so in
the long run it's probably best to minimize the introduction of new
blockmap files (on ext4 anyway).

--D

>  		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
>  	else if (ext4_should_journal_data(inode))
>  		flags |= EXT4_FREE_BLOCKS_FORGET;
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks
@ 2017-05-31 16:12     ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:12 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 01:14:56AM -0700, Tahsin Erdogan wrote:
> ea_inode contents are treated as metadata, that's why it is journaled
> during initial writes. Failing to call revoke during freeing could cause
> user data to be overwritten with original ea_inode contents during journal
> replay.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/extents.c  | 3 ++-
>  fs/ext4/indirect.c | 3 ++-
>  2 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 3e36508610b7..e0a8425ff74d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
>  
>  static inline int get_default_free_blocks_flags(struct inode *inode)
>  {
> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
>  		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
>  	else if (ext4_should_journal_data(inode))
>  		return EXT4_FREE_BLOCKS_FORGET;
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index bc15c2c17633..7ffa290cbb8e 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>  	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
>  	int	err;
>  
> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))

I appreciate the thoroughness of doing this even for blockmapped
ea_inode files, and I'm not complaining about this hunk at all. :)

However, please consider requiring the extents feature + format as a
prerequisite for ea_inodes.  ext4 has traditionally been very ...
permissive about supporting a diverse range of feature options, but the
cost of that diversity is that the feature support matrix that the
community has to support is already untestably large.

I think it would be wise not to support !extents && ea_inode,
particularly since blockmaps aren't protected by metadata_csum and so in
the long run it's probably best to minimize the introduction of new
blockmap files (on ext4 anyway).

--D

>  		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
>  	else if (ext4_should_journal_data(inode))
>  		flags |= EXT4_FREE_BLOCKS_FORGET;
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs
  2017-05-31 16:03     ` Darrick J. Wong
  (?)
  (?)
@ 2017-05-31 16:13     ` Tahsin Erdogan
  -1 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31 16:13 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

Hi Darrick,

>> -#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
>
> Uhhh... didn't you add this in patch 1/28?  There's little point in
> adding a symbol just to delete it shortly thereafter.
>
1/28 is the original patch I received from Andreas. I wanted to
leave his patch in its original form as much as possible so that the
modifications I made on top of it are clear. If preferred, I can squash
them later, but I thought it would be a little clearer this way.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 01/28] ext4: xattr-in-inode support
  2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
  2017-05-31  8:14 ` [PATCH 02/28] ext4: fix lockdep warning about recursive inode locking Tahsin Erdogan
@ 2017-05-31 16:42   ` Darrick J. Wong
  2017-05-31  8:14 ` [PATCH 04/28] ext4: do not set posix acls on xattr inodes Tahsin Erdogan
                     ` (25 subsequent siblings)
  27 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:42 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Andreas Dilger, Kalpak Shah, James Simmons

On Wed, May 31, 2017 at 01:14:50AM -0700, Tahsin Erdogan wrote:
> From: Andreas Dilger <andreas.dilger@intel.com>
> 
> Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE.
> 
> If the size of an xattr value is larger than will fit in a single
> external block, then the xattr value will be saved into the body
> of an external xattr inode.
> 
> This also helps support a larger number of xattrs, since only the headers
> will be stored in the in-inode space or the single external block.
> 
> The inode is referenced from the xattr header via "e_value_inum",
> which was formerly "e_value_block", but that field was never used.
> The e_value_size still contains the xattr size so that listing
> xattrs does not need to look up the inode if the data is not accessed.
> 
> struct ext4_xattr_entry {
>         __u8    e_name_len;     /* length of name */
>         __u8    e_name_index;   /* attribute name index */
>         __le16  e_value_offs;   /* offset in disk block of value */
>         __le32  e_value_inum;   /* inode in which value is stored */
>         __le32  e_value_size;   /* size of attribute value */
>         __le32  e_hash;         /* hash value of name and value */
>         char    e_name[0];      /* attribute name */
> };
> 
> The xattr inode is marked with the EXT4_EA_INODE_FL flag and also
> holds a back-reference to the owning inode in its i_mtime field,
> allowing the ext4/e2fsck to verify the correct inode is accessed.

Can we store the checksum of the xattr value somewhere?  We already
checksum the values if they're stored in the ibody or a single external
block, and I'd hate to lose that protection.

We could probably reuse one of the inode fields (i_version?) for this.

--D 

> Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80
> Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424
> Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
> Signed-off-by: James Simmons <uja.ornl@gmail.com>
> Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/ext4.h   |  12 ++
>  fs/ext4/ialloc.c |   1 -
>  fs/ext4/inline.c |   2 +-
>  fs/ext4/inode.c  |  49 ++++-
>  fs/ext4/xattr.c  | 565 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/ext4/xattr.h  |  33 +++-
>  6 files changed, 606 insertions(+), 56 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 32191548abed..24ef56b4572f 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1797,6 +1797,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
>  					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
>  					 EXT4_FEATURE_INCOMPAT_64BIT| \
>  					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
> +					 EXT4_FEATURE_INCOMPAT_EA_INODE| \
>  					 EXT4_FEATURE_INCOMPAT_MMP | \
>  					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>  					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> @@ -2220,6 +2221,12 @@ struct mmpd_data {
>  #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
>  
>  /*
> + * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
> + * This limit is arbitrary, but is reasonable for the xattr API.
> + */
> +#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
> +
> +/*
>   * Function prototypes
>   */
>  
> @@ -2231,6 +2238,10 @@ struct mmpd_data {
>  # define ATTRIB_NORET	__attribute__((noreturn))
>  # define NORET_AND	noreturn,
>  
> +struct ext4_xattr_ino_array {
> +	unsigned int xia_count;		/* # of used item in the array */
> +	unsigned int xia_inodes[0];
> +};
>  /* bitmap.c */
>  extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
>  void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
> @@ -2478,6 +2489,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
>  extern void ext4_set_inode_flags(struct inode *);
>  extern int ext4_alloc_da_blocks(struct inode *inode);
>  extern void ext4_set_aops(struct inode *inode);
> +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
>  extern int ext4_writepage_trans_blocks(struct inode *);
>  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
>  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 98ac2f1f23b3..e2eb3cc06820 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
>  	 * as writing the quota to disk may need the lock as well.
>  	 */
>  	dquot_initialize(inode);
> -	ext4_xattr_delete_inode(handle, inode);
>  	dquot_free_inode(inode);
>  	dquot_drop(inode);
>  
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 8d141c0c8ff9..28c5c3abddb3 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
>  
>  	/* Compute min_offs. */
>  	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_block && entry->e_value_size) {
> +		if (!entry->e_value_inum && entry->e_value_size) {
>  			size_t offs = le16_to_cpu(entry->e_value_offs);
>  			if (offs < min_offs)
>  				min_offs = offs;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5cf82d03968c..e5535e5b3dc5 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
>  				unsigned int length);
>  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
>  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
> -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> -				  int pextents);
>  
>  /*
>   * Test whether an inode is a fast symlink.
> @@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode)
>  {
>  	handle_t *handle;
>  	int err;
> +	int extra_credits = 3;
> +	struct ext4_xattr_ino_array *lea_ino_array = NULL;
>  
>  	trace_ext4_evict_inode(inode);
>  
> @@ -238,8 +238,8 @@ void ext4_evict_inode(struct inode *inode)
>  	 * protection against it
>  	 */
>  	sb_start_intwrite(inode->i_sb);
> -	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
> -				    ext4_blocks_for_truncate(inode)+3);
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
>  	if (IS_ERR(handle)) {
>  		ext4_std_error(inode->i_sb, PTR_ERR(handle));
>  		/*
> @@ -251,9 +251,36 @@ void ext4_evict_inode(struct inode *inode)
>  		sb_end_intwrite(inode->i_sb);
>  		goto no_delete;
>  	}
> -
>  	if (IS_SYNC(inode))
>  		ext4_handle_sync(handle);
> +
> +	/*
> +	 * Delete xattr inode before deleting the main inode.
> +	 */
> +	err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
> +	if (err) {
> +		ext4_warning(inode->i_sb,
> +			     "couldn't delete inode's xattr (err %d)", err);
> +		goto stop_handle;
> +	}
> +
> +	if (!IS_NOQUOTA(inode))
> +		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> +
> +	if (!ext4_handle_has_enough_credits(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits)) {
> +		err = ext4_journal_extend(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits);
> +		if (err > 0)
> +			err = ext4_journal_restart(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits);
> +		if (err != 0) {
> +			ext4_warning(inode->i_sb,
> +				     "couldn't extend journal (err %d)", err);
> +			goto stop_handle;
> +		}
> +	}
> +
>  	inode->i_size = 0;
>  	err = ext4_mark_inode_dirty(handle, inode);
>  	if (err) {
> @@ -277,10 +304,10 @@ void ext4_evict_inode(struct inode *inode)
>  	 * enough credits left in the handle to remove the inode from
>  	 * the orphan list and set the dtime field.
>  	 */
> -	if (!ext4_handle_has_enough_credits(handle, 3)) {
> -		err = ext4_journal_extend(handle, 3);
> +	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
> +		err = ext4_journal_extend(handle, extra_credits);
>  		if (err > 0)
> -			err = ext4_journal_restart(handle, 3);
> +			err = ext4_journal_restart(handle, extra_credits);
>  		if (err != 0) {
>  			ext4_warning(inode->i_sb,
>  				     "couldn't extend journal (err %d)", err);
> @@ -315,8 +342,12 @@ void ext4_evict_inode(struct inode *inode)
>  		ext4_clear_inode(inode);
>  	else
>  		ext4_free_inode(handle, inode);
> +
>  	ext4_journal_stop(handle);
>  	sb_end_intwrite(inode->i_sb);
> +
> +	if (lea_ino_array != NULL)
> +		ext4_xattr_inode_array_free(inode, lea_ino_array);
>  	return;
>  no_delete:
>  	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
> @@ -5504,7 +5535,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
>   *
>   * Also account for superblock, inode, quota and xattr blocks
>   */
> -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> +int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>  				  int pextents)
>  {
>  	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 5d3c2536641c..444be5c7a1d5 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -177,9 +177,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
>  
>  	/* Check the values */
>  	while (!IS_LAST_ENTRY(entry)) {
> -		if (entry->e_value_block != 0)
> -			return -EFSCORRUPTED;
> -		if (entry->e_value_size != 0) {
> +		if (entry->e_value_size != 0 &&
> +		    entry->e_value_inum == 0) {
>  			u16 offs = le16_to_cpu(entry->e_value_offs);
>  			u32 size = le32_to_cpu(entry->e_value_size);
>  			void *value;
> @@ -269,6 +268,99 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
>  	return cmp ? -ENODATA : 0;
>  }
>  
> +/*
> + * Read the EA value from an inode.
> + */
> +static int
> +ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
> +{
> +	unsigned long block = 0;
> +	struct buffer_head *bh = NULL;
> +	int blocksize;
> +	size_t csize, ret_size = 0;
> +
> +	if (*size == 0)
> +		return 0;
> +
> +	blocksize = ea_inode->i_sb->s_blocksize;
> +
> +	while (ret_size < *size) {
> +		csize = (*size - ret_size) > blocksize ? blocksize :
> +							*size - ret_size;
> +		bh = ext4_bread(NULL, ea_inode, block, 0);
> +		if (IS_ERR(bh)) {
> +			*size = ret_size;
> +			return PTR_ERR(bh);
> +		}
> +		memcpy(buf, bh->b_data, csize);
> +		brelse(bh);
> +
> +		buf += csize;
> +		block += 1;
> +		ret_size += csize;
> +	}
> +
> +	*size = ret_size;
> +
> +	return 0;
> +}
> +
> +struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
> +{
> +	struct inode *ea_inode = NULL;
> +
> +	ea_inode = ext4_iget(parent->i_sb, ea_ino);
> +	if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
> +		int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
> +		ext4_error(parent->i_sb, "error while reading EA inode %lu "
> +			   "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode));
> +		*err = rc != 0 ? rc : -EIO;
> +		return NULL;
> +	}
> +
> +	if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
> +	    ea_inode->i_generation != parent->i_generation) {
> +		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> +			   "to parent invalid.", ea_ino);
> +		*err = -EINVAL;
> +		goto error;
> +	}
> +
> +	if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
> +		ext4_error(parent->i_sb, "EA inode %lu does not have "
> +			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> +		*err = -EINVAL;
> +		goto error;
> +	}
> +
> +	*err = 0;
> +	return ea_inode;
> +
> +error:
> +	iput(ea_inode);
> +	return NULL;
> +}
> +
> +/*
> + * Read the value from the EA inode.
> + */
> +static int
> +ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
> +		     size_t *size)
> +{
> +	struct inode *ea_inode = NULL;
> +	int err;
> +
> +	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	iput(ea_inode);
> +
> +	return err;
> +}
> +
>  static int
>  ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
>  		     void *buffer, size_t buffer_size)
> @@ -308,8 +400,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
>  		error = -ERANGE;
>  		if (size > buffer_size)
>  			goto cleanup;
> -		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
> -		       size);
> +		if (entry->e_value_inum) {
> +			error = ext4_xattr_inode_get(inode,
> +					     le32_to_cpu(entry->e_value_inum),
> +					     buffer, &size);
> +			if (error)
> +				goto cleanup;
> +		} else {
> +			memcpy(buffer, bh->b_data +
> +			       le16_to_cpu(entry->e_value_offs), size);
> +		}
>  	}
>  	error = size;
>  
> @@ -350,8 +450,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
>  		error = -ERANGE;
>  		if (size > buffer_size)
>  			goto cleanup;
> -		memcpy(buffer, (void *)IFIRST(header) +
> -		       le16_to_cpu(entry->e_value_offs), size);
> +		if (entry->e_value_inum) {
> +			error = ext4_xattr_inode_get(inode,
> +					     le32_to_cpu(entry->e_value_inum),
> +					     buffer, &size);
> +			if (error)
> +				goto cleanup;
> +		} else {
> +			memcpy(buffer, (void *)IFIRST(header) +
> +			       le16_to_cpu(entry->e_value_offs), size);
> +		}
>  	}
>  	error = size;
>  
> @@ -620,7 +728,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
>  				    size_t *min_offs, void *base, int *total)
>  {
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			size_t offs = le16_to_cpu(last->e_value_offs);
>  			if (offs < *min_offs)
>  				*min_offs = offs;
> @@ -631,16 +739,173 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
>  	return (*min_offs - ((void *)last - base) - sizeof(__u32));
>  }
>  
> -static int
> -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
> +/*
> + * Write the value of the EA in an inode.
> + */
> +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
> +				  const void *buf, int bufsize)
> +{
> +	struct buffer_head *bh = NULL;
> +	unsigned long block = 0;
> +	unsigned blocksize = ea_inode->i_sb->s_blocksize;
> +	unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
> +	int csize, wsize = 0;
> +	int ret = 0;
> +	int retries = 0;
> +
> +retry:
> +	while (ret >= 0 && ret < max_blocks) {
> +		struct ext4_map_blocks map;
> +		map.m_lblk = block += ret;
> +		map.m_len = max_blocks -= ret;
> +
> +		ret = ext4_map_blocks(handle, ea_inode, &map,
> +				      EXT4_GET_BLOCKS_CREATE);
> +		if (ret <= 0) {
> +			ext4_mark_inode_dirty(handle, ea_inode);
> +			if (ret == -ENOSPC &&
> +			    ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
> +				ret = 0;
> +				goto retry;
> +			}
> +			break;
> +		}
> +	}
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	block = 0;
> +	while (wsize < bufsize) {
> +		if (bh != NULL)
> +			brelse(bh);
> +		csize = (bufsize - wsize) > blocksize ? blocksize :
> +								bufsize - wsize;
> +		bh = ext4_getblk(handle, ea_inode, block, 0);
> +		if (IS_ERR(bh)) {
> +			ret = PTR_ERR(bh);
> +			goto out;
> +		}
> +		ret = ext4_journal_get_write_access(handle, bh);
> +		if (ret)
> +			goto out;
> +
> +		memcpy(bh->b_data, buf, csize);
> +		set_buffer_uptodate(bh);
> +		ext4_handle_dirty_metadata(handle, ea_inode, bh);
> +
> +		buf += csize;
> +		wsize += csize;
> +		block += 1;
> +	}
> +
> +	inode_lock(ea_inode);
> +	i_size_write(ea_inode, wsize);
> +	ext4_update_i_disksize(ea_inode, wsize);
> +	inode_unlock(ea_inode);
> +
> +	ext4_mark_inode_dirty(handle, ea_inode);
> +
> +out:
> +	brelse(bh);
> +
> +	return ret;
> +}
> +
> +/*
> + * Create an inode to store the value of a large EA.
> + */
> +static struct inode *ext4_xattr_inode_create(handle_t *handle,
> +					     struct inode *inode)
> +{
> +	struct inode *ea_inode = NULL;
> +
> +	/*
> +	 * Let the next inode be the goal, so we try and allocate the EA inode
> +	 * in the same group, or nearby one.
> +	 */
> +	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
> +				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
> +	if (!IS_ERR(ea_inode)) {
> +		ea_inode->i_op = &ext4_file_inode_operations;
> +		ea_inode->i_fop = &ext4_file_operations;
> +		ext4_set_aops(ea_inode);
> +		ea_inode->i_generation = inode->i_generation;
> +		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> +
> +		/*
> +		 * A back-pointer from EA inode to parent inode will be useful
> +		 * for e2fsck.
> +		 */
> +		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
> +		unlock_new_inode(ea_inode);
> +	}
> +
> +	return ea_inode;
> +}
> +
> +/*
> + * Unlink the inode storing the value of the EA.
> + */
> +int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +{
> +	struct inode *ea_inode = NULL;
> +	int err;
> +
> +	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
> +	if (err)
> +		return err;
> +
> +	clear_nlink(ea_inode);
> +	iput(ea_inode);
> +
> +	return 0;
> +}
> +
> +/*
> + * Add value of the EA in an inode.
> + */
> +static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> +				unsigned long *ea_ino, const void *value,
> +				size_t value_len)
> +{
> +	struct inode *ea_inode;
> +	int err;
> +
> +	/* Create an inode for the EA value */
> +	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	if (IS_ERR(ea_inode))
> +		return PTR_ERR(ea_inode);
> +
> +	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> +	if (err)
> +		clear_nlink(ea_inode);
> +	else
> +		*ea_ino = ea_inode->i_ino;
> +
> +	iput(ea_inode);
> +
> +	return err;
> +}
> +
> +static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> +				struct ext4_xattr_search *s,
> +				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
> +	int in_inode = i->in_inode;
> +	int rc;
> +
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    (EXT4_XATTR_SIZE(i->value_len) >
> +	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
> +		in_inode = 1;
>  
>  	/* Compute min_offs and last. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			size_t offs = le16_to_cpu(last->e_value_offs);
>  			if (offs < min_offs)
>  				min_offs = offs;
> @@ -648,15 +913,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (s->here->e_value_size) {
> +		if (!in_inode &&
> +		    !s->here->e_value_inum && s->here->e_value_size) {
>  			size_t size = le32_to_cpu(s->here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		if (free < EXT4_XATTR_LEN(name_len) +
> -			   EXT4_XATTR_SIZE(i->value_len))
> +		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +
> +		if (in_inode)
> +			value_len = 0;
> +
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len)
>  			return -ENOSPC;
>  	}
>  
> @@ -670,7 +940,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  		s->here->e_name_len = name_len;
>  		memcpy(s->here->e_name, i->name, name_len);
>  	} else {
> -		if (s->here->e_value_size) {
> +		if (!s->here->e_value_inum && s->here->e_value_size &&
> +		    s->here->e_value_offs > 0) {
>  			void *first_val = s->base + min_offs;
>  			size_t offs = le16_to_cpu(s->here->e_value_offs);
>  			void *val = s->base + offs;
> @@ -704,12 +975,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  			last = s->first;
>  			while (!IS_LAST_ENTRY(last)) {
>  				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (last->e_value_size && o < offs)
> +				if (!last->e_value_inum &&
> +				    last->e_value_size && o < offs)
>  					last->e_value_offs =
>  						cpu_to_le16(o + size);
>  				last = EXT4_XATTR_NEXT(last);
>  			}
>  		}
> +		if (s->here->e_value_inum) {
> +			ext4_xattr_inode_unlink(inode,
> +					    le32_to_cpu(s->here->e_value_inum));
> +			s->here->e_value_inum = 0;
> +		}
>  		if (!i->value) {
>  			/* Remove the old name. */
>  			size_t size = EXT4_XATTR_LEN(name_len);
> @@ -722,11 +999,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  
>  	if (i->value) {
>  		/* Insert the new value. */
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> -		if (i->value_len) {
> +		if (in_inode) {
> +			unsigned long ea_ino =
> +				le32_to_cpu(s->here->e_value_inum);
> +			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> +						  i->value, i->value_len);
> +			if (rc)
> +				goto out;
> +			s->here->e_value_inum = cpu_to_le32(ea_ino);
> +			s->here->e_value_offs = 0;
> +		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
>  			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> +			s->here->e_value_inum = 0;
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> @@ -736,8 +1022,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  				memcpy(val, i->value, i->value_len);
>  			}
>  		}
> +		s->here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -	return 0;
> +
> +out:
> +	return rc;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -801,8 +1090,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> -	if (i->value && i->value_len > sb->s_blocksize)
> -		return -ENOSPC;
>  	if (s->base) {
>  		BUFFER_TRACE(bs->bh, "get_write_access");
>  		error = ext4_journal_get_write_access(handle, bs->bh);
> @@ -821,7 +1108,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			mb_cache_entry_delete_block(ext4_mb_cache, hash,
>  						    bs->bh->b_blocknr);
>  			ea_bdebug(bs->bh, "modifying in-place");
> -			error = ext4_xattr_set_entry(i, s);
> +			error = ext4_xattr_set_entry(i, s, handle, inode);
>  			if (!error) {
>  				if (!IS_LAST_ENTRY(s->first))
>  					ext4_xattr_rehash(header(s->base),
> @@ -870,7 +1157,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		s->end = s->base + sb->s_blocksize;
>  	}
>  
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error == -EFSCORRUPTED)
>  		goto bad_block;
>  	if (error)
> @@ -1070,7 +1357,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  
>  	if (EXT4_I(inode)->i_extra_isize == 0)
>  		return -ENOSPC;
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error) {
>  		if (error == -ENOSPC &&
>  		    ext4_has_inline_data(inode)) {
> @@ -1082,7 +1369,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  			error = ext4_xattr_ibody_find(inode, i, is);
>  			if (error)
>  				return error;
> -			error = ext4_xattr_set_entry(i, s);
> +			error = ext4_xattr_set_entry(i, s, handle, inode);
>  		}
>  		if (error)
>  			return error;
> @@ -1098,7 +1385,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> -static int ext4_xattr_ibody_set(struct inode *inode,
> +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  				struct ext4_xattr_info *i,
>  				struct ext4_xattr_ibody_find *is)
>  {
> @@ -1108,7 +1395,7 @@ static int ext4_xattr_ibody_set(struct inode *inode,
>  
>  	if (EXT4_I(inode)->i_extra_isize == 0)
>  		return -ENOSPC;
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error)
>  		return error;
>  	header = IHDR(inode, ext4_raw_inode(&is->iloc));
> @@ -1155,7 +1442,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		.name = name,
>  		.value = value,
>  		.value_len = value_len,
> -
> +		.in_inode = 0,
>  	};
>  	struct ext4_xattr_ibody_find is = {
>  		.s = { .not_found = -ENODATA, },
> @@ -1204,7 +1491,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	}
>  	if (!value) {
>  		if (!is.s.not_found)
> -			error = ext4_xattr_ibody_set(inode, &i, &is);
> +			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>  		else if (!bs.s.not_found)
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
>  	} else {
> @@ -1215,7 +1502,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
>  			goto cleanup;
>  
> -		error = ext4_xattr_ibody_set(inode, &i, &is);
> +		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>  		if (!error && !bs.s.not_found) {
>  			i.value = NULL;
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
> @@ -1226,11 +1513,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  					goto cleanup;
>  			}
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
> +			if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +			    error == -ENOSPC) {
> +				/* xattr does not fit in the block; store the
> +				 * value in an external inode */
> +				i.in_inode = 1;
> +				error = ext4_xattr_ibody_set(handle, inode,
> +							     &i, &is);
> +			}
>  			if (error)
>  				goto cleanup;
>  			if (!is.s.not_found) {
>  				i.value = NULL;
> -				error = ext4_xattr_ibody_set(inode, &i, &is);
> +				error = ext4_xattr_ibody_set(handle, inode, &i,
> +							     &is);
>  			}
>  		}
>  	}
> @@ -1269,12 +1565,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  	       const void *value, size_t value_len, int flags)
>  {
>  	handle_t *handle;
> +	struct super_block *sb = inode->i_sb;
>  	int error, retries = 0;
>  	int credits = ext4_jbd2_credits_xattr(inode);
>  
>  	error = dquot_initialize(inode);
>  	if (error)
>  		return error;
> +
> +	if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
> +	    ext4_has_feature_ea_inode(sb)) {
> +		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> +					sb->s_blocksize_bits;
> +
> +		/* For new inode */
> +		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +
> +		/* For data blocks of EA inode */
> +		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	}
> +
>  retry:
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
> @@ -1286,7 +1596,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  					      value, value_len, flags);
>  		error2 = ext4_journal_stop(handle);
>  		if (error == -ENOSPC &&
> -		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +		    ext4_should_retry_alloc(sb, &retries))
>  			goto retry;
>  		if (error == 0)
>  			error = error2;
> @@ -1311,7 +1621,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
>  
>  	/* Adjust the value offsets of the entries */
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			new_offs = le16_to_cpu(last->e_value_offs) +
>  							value_offs_shift;
>  			last->e_value_offs = cpu_to_le16(new_offs);
> @@ -1372,7 +1682,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
>  		goto out;
>  
>  	/* Remove the chosen entry from the inode */
> -	error = ext4_xattr_ibody_set(inode, &i, is);
> +	error = ext4_xattr_ibody_set(handle, inode, &i, is);
>  	if (error)
>  		goto out;
>  
> @@ -1572,21 +1882,135 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  }
>  
>  
> +#define EIA_INCR 16 /* must be 2^n */
> +#define EIA_MASK (EIA_INCR - 1)
> +/* Add the large xattr @ino into @lea_ino_array for later deletion.
> + * If @lea_ino_array is new or full it will be grown and the old
> + * contents copied over.
> + */
> +static int
> +ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
> +{
> +	if (*lea_ino_array == NULL) {
> +		/*
> +		 * Start with 15 inodes, so it fits into a power-of-two size:
> +		 * the allocation size is the offset of slot EIA_MASK (15).
> +		 */
> +		(*lea_ino_array) =
> +			kmalloc(offsetof(struct ext4_xattr_ino_array,
> +					 xia_inodes[EIA_MASK]),
> +				GFP_NOFS);
> +		if (*lea_ino_array == NULL)
> +			return -ENOMEM;
> +		(*lea_ino_array)->xia_count = 0;
> +	} else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
> +		/* expand the array once all 15 + n * 16 slots are full */
> +		struct ext4_xattr_ino_array *new_array = NULL;
> +		int count = (*lea_ino_array)->xia_count;
> +
> +		/* allocate room for EIA_INCR slots beyond the current count */
> +		new_array = kmalloc(
> +				offsetof(struct ext4_xattr_ino_array,
> +					 xia_inodes[count + EIA_INCR]),
> +				GFP_NOFS);
> +		if (new_array == NULL)
> +			return -ENOMEM;
> +		memcpy(new_array, *lea_ino_array,
> +		       offsetof(struct ext4_xattr_ino_array,
> +				xia_inodes[count]));
> +		kfree(*lea_ino_array);
> +		*lea_ino_array = new_array;
> +	}
> +	(*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
> +	return 0;
> +}
> +
> +/**
> + * Add xattr inode to orphan list
> + */
> +static int
> +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
> +			int credits, struct ext4_xattr_ino_array *lea_ino_array)
> +{
> +	struct inode *ea_inode = NULL;
> +	int idx = 0, error = 0;
> +
> +	if (lea_ino_array == NULL)
> +		return 0;
> +
> +	for (; idx < lea_ino_array->xia_count; ++idx) {
> +		if (!ext4_handle_has_enough_credits(handle, credits)) {
> +			error = ext4_journal_extend(handle, credits);
> +			if (error > 0)
> +				error = ext4_journal_restart(handle, credits);
> +
> +			if (error != 0) {
> +				ext4_warning(inode->i_sb,
> +					"couldn't extend journal "
> +					"(err %d)", error);
> +				return error;
> +			}
> +		}
> +		ea_inode = ext4_xattr_inode_iget(inode,
> +				lea_ino_array->xia_inodes[idx], &error);
> +		if (error)
> +			continue;
> +		ext4_orphan_add(handle, ea_inode);
> +		/* the inode's i_count will be released by caller */
> +	}
> +
> +	return 0;
> +}
>  
>  /*
>   * ext4_xattr_delete_inode()
>   *
> - * Free extended attribute resources associated with this inode. This
> + * Free extended attribute resources associated with this inode. Traverse
> + * all entries and unlink any xattr inodes associated with this inode. This
>   * is called immediately before an inode is freed. We have exclusive
> - * access to the inode.
> + * access to the inode. If an orphan inode is deleted it will also delete any
> + * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> + * to ensure they belong to the parent inode and were not deleted already.
>   */
> -void
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
> +int
> +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			struct ext4_xattr_ino_array **lea_ino_array)
>  {
>  	struct buffer_head *bh = NULL;
> +	struct ext4_xattr_ibody_header *header;
> +	struct ext4_inode *raw_inode;
> +	struct ext4_iloc iloc;
> +	struct ext4_xattr_entry *entry;
> +	int credits = 3, error = 0;
>  
> -	if (!EXT4_I(inode)->i_file_acl)
> +	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> +		goto delete_external_ea;
> +
> +	error = ext4_get_inode_loc(inode, &iloc);
> +	if (error)
> +		goto cleanup;
> +	raw_inode = ext4_raw_inode(&iloc);
> +	header = IHDR(inode, raw_inode);
> +	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		if (ext4_expand_ino_array(lea_ino_array,
> +					  entry->e_value_inum) != 0) {
> +			brelse(iloc.bh);
> +			goto cleanup;
> +		}
> +		entry->e_value_inum = 0;
> +	}
> +	brelse(iloc.bh);
> +
> +delete_external_ea:
> +	if (!EXT4_I(inode)->i_file_acl) {
> +		/* add xattr inode to orphan list */
> +		ext4_xattr_inode_orphan_add(handle, inode, credits,
> +						*lea_ino_array);
>  		goto cleanup;
> +	}
>  	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
>  	if (!bh) {
>  		EXT4_ERROR_INODE(inode, "block %llu read error",
> @@ -1599,11 +2023,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
>  				 EXT4_I(inode)->i_file_acl);
>  		goto cleanup;
>  	}
> +
> +	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		if (ext4_expand_ino_array(lea_ino_array,
> +					  entry->e_value_inum) != 0)
> +			goto cleanup;
> +		entry->e_value_inum = 0;
> +	}
> +
> +	/* add xattr inode to orphan list */
> +	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> +					*lea_ino_array);
> +	if (error != 0)
> +		goto cleanup;
> +
> +	if (!IS_NOQUOTA(inode))
> +		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> +
> +	if (!ext4_handle_has_enough_credits(handle, credits)) {
> +		error = ext4_journal_extend(handle, credits);
> +		if (error > 0)
> +			error = ext4_journal_restart(handle, credits);
> +		if (error != 0) {
> +			ext4_warning(inode->i_sb,
> +				"couldn't extend journal (err %d)", error);
> +			goto cleanup;
> +		}
> +	}
> +
>  	ext4_xattr_release_block(handle, inode, bh);
>  	EXT4_I(inode)->i_file_acl = 0;
>  
>  cleanup:
>  	brelse(bh);
> +
> +	return error;
> +}
> +
> +void
> +ext4_xattr_inode_array_free(struct inode *inode,
> +			    struct ext4_xattr_ino_array *lea_ino_array)
> +{
> +	struct inode	*ea_inode = NULL;
> +	int		idx = 0;
> +	int		err;
> +
> +	if (lea_ino_array == NULL)
> +		return;
> +
> +	for (; idx < lea_ino_array->xia_count; ++idx) {
> +		ea_inode = ext4_xattr_inode_iget(inode,
> +				lea_ino_array->xia_inodes[idx], &err);
> +		if (err)
> +			continue;
> +		/* drop the i_count reference held since ext4_xattr_delete_inode() */
> +		if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
> +			iput(ea_inode);
> +		clear_nlink(ea_inode);
> +		iput(ea_inode);
> +	}
> +	kfree(lea_ino_array);
>  }
>  
>  /*
> @@ -1655,10 +2137,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
>  		    entry1->e_name_index != entry2->e_name_index ||
>  		    entry1->e_name_len != entry2->e_name_len ||
>  		    entry1->e_value_size != entry2->e_value_size ||
> +		    entry1->e_value_inum != entry2->e_value_inum ||
>  		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
>  			return 1;
> -		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
> -			return -EFSCORRUPTED;
>  		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
>  			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
>  			   le32_to_cpu(entry1->e_value_size)))
> @@ -1730,7 +2211,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
>  		       *name++;
>  	}
>  
> -	if (entry->e_value_size != 0) {
> +	if (!entry->e_value_inum && entry->e_value_size) {
>  		__le32 *value = (__le32 *)((char *)header +
>  			le16_to_cpu(entry->e_value_offs));
>  		for (n = (le32_to_cpu(entry->e_value_size) +
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 099c8b670ef5..6e10ff9393d4 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -44,7 +44,7 @@ struct ext4_xattr_entry {
>  	__u8	e_name_len;	/* length of name */
>  	__u8	e_name_index;	/* attribute name index */
>  	__le16	e_value_offs;	/* offset in disk block of value */
> -	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
> +	__le32	e_value_inum;	/* inode in which the value is stored */
>  	__le32	e_value_size;	/* size of attribute value */
>  	__le32	e_hash;		/* hash value of name and value */
>  	char	e_name[0];	/* attribute name */
> @@ -69,6 +69,26 @@ struct ext4_xattr_entry {
>  		EXT4_I(inode)->i_extra_isize))
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
> +/*
> + * Link EA inode back to parent one using i_mtime field.
> + * Extra integer type conversion added to ignore higher
> + * bits in i_mtime.tv_sec which might be set by ext4_get()
> + */
> +#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> +do {                                                  \
> +      (inode)->i_mtime.tv_sec = inum;                 \
> +} while(0)
> +
> +#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> +((__u32)(inode)->i_mtime.tv_sec)
> +
> +/*
> + * The minimum size of EA value when you start storing it in an external inode
> + * size of block - size of header - size of 1 entry - 4 null bytes
> +*/
> +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)					\
> +	((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
> +
>  #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
>  #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
>  #define BFIRST(bh) ENTRY(BHDR(bh)+1)
> @@ -77,10 +97,11 @@ struct ext4_xattr_entry {
>  #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
>  
>  struct ext4_xattr_info {
> -	int name_index;
>  	const char *name;
>  	const void *value;
>  	size_t value_len;
> +	int name_index;
> +	int in_inode;
>  };
>  
>  struct ext4_xattr_search {
> @@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
>  
> -extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
> +extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
> +					   int *err);
> +extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
> +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +				   struct ext4_xattr_ino_array **array);
> +extern void ext4_xattr_inode_array_free(struct inode *inode,
> +					struct ext4_xattr_ino_array *array);
>  
>  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  			    struct ext4_inode *raw_inode, handle_t *handle);
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 01/28] ext4: xattr-in-inode support
@ 2017-05-31 16:42   ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:42 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	Kalpak Shah, linux-kernel, reiserfs-devel, Jens Axboe,
	linux-fsdevel, Mike Christie, Andreas Dilger, Alexander Viro,
	Jan Kara, Fabian Frederick, Andreas Dilger, linux-ext4,
	James Simmons, ocfs2-devel

On Wed, May 31, 2017 at 01:14:50AM -0700, Tahsin Erdogan wrote:
> From: Andreas Dilger <andreas.dilger@intel.com>
> 
> Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE.
> 
> If the size of an xattr value is larger than will fit in a single
> external block, then the xattr value will be saved into the body
> of an external xattr inode.
> 
> This also helps support a larger number of xattrs, since only the headers
> will be stored in the in-inode space or the single external block.
> 
> The inode is referenced from the xattr header via "e_value_inum",
> which was formerly "e_value_block", but that field was never used.
> The e_value_size still contains the xattr size so that listing
> xattrs does not need to look up the inode if the data is not accessed.
> 
> struct ext4_xattr_entry {
>         __u8    e_name_len;     /* length of name */
>         __u8    e_name_index;   /* attribute name index */
>         __le16  e_value_offs;   /* offset in disk block of value */
>         __le32  e_value_inum;   /* inode in which value is stored */
>         __le32  e_value_size;   /* size of attribute value */
>         __le32  e_hash;         /* hash value of name and value */
>         char    e_name[0];      /* attribute name */
> };
> 
> The xattr inode is marked with the EXT4_EA_INODE_FL flag and also
> holds a back-reference to the owning inode in its i_mtime field,
> allowing ext4/e2fsck to verify that the correct inode is accessed.

Can we store the checksum of the xattr value somewhere?  We already
checksum the values if they're stored in the ibody or a single external
block, and I'd hate to lose that protection.

We could probably reuse one of the inode fields (i_version?) for this.

--D 

> Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80
> Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424
> Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
> Signed-off-by: James Simmons <uja.ornl@gmail.com>
> Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/ext4.h   |  12 ++
>  fs/ext4/ialloc.c |   1 -
>  fs/ext4/inline.c |   2 +-
>  fs/ext4/inode.c  |  49 ++++-
>  fs/ext4/xattr.c  | 565 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/ext4/xattr.h  |  33 +++-
>  6 files changed, 606 insertions(+), 56 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 32191548abed..24ef56b4572f 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1797,6 +1797,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
>  					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
>  					 EXT4_FEATURE_INCOMPAT_64BIT| \
>  					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
> +					 EXT4_FEATURE_INCOMPAT_EA_INODE| \
>  					 EXT4_FEATURE_INCOMPAT_MMP | \
>  					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>  					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> @@ -2220,6 +2221,12 @@ struct mmpd_data {
>  #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
>  
>  /*
> + * Maximum xattr size for FEATURE_INCOMPAT_EA_INODE: 1 MiB (1024 * 1024 bytes).
> + * This limit is arbitrary, but is reasonable for the xattr API.
> + */
> +#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
> +
> +/*
>   * Function prototypes
>   */
>  
> @@ -2231,6 +2238,10 @@ struct mmpd_data {
>  # define ATTRIB_NORET	__attribute__((noreturn))
>  # define NORET_AND	noreturn,
>  
> +struct ext4_xattr_ino_array {
> +	unsigned int xia_count;		/* # of used items in the array */
> +	unsigned int xia_inodes[0];
> +};
>  /* bitmap.c */
>  extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
>  void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
> @@ -2478,6 +2489,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
>  extern void ext4_set_inode_flags(struct inode *);
>  extern int ext4_alloc_da_blocks(struct inode *inode);
>  extern void ext4_set_aops(struct inode *inode);
> +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
>  extern int ext4_writepage_trans_blocks(struct inode *);
>  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
>  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 98ac2f1f23b3..e2eb3cc06820 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
>  	 * as writing the quota to disk may need the lock as well.
>  	 */
>  	dquot_initialize(inode);
> -	ext4_xattr_delete_inode(handle, inode);
>  	dquot_free_inode(inode);
>  	dquot_drop(inode);
>  
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 8d141c0c8ff9..28c5c3abddb3 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
>  
>  	/* Compute min_offs. */
>  	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_block && entry->e_value_size) {
> +		if (!entry->e_value_inum && entry->e_value_size) {
>  			size_t offs = le16_to_cpu(entry->e_value_offs);
>  			if (offs < min_offs)
>  				min_offs = offs;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5cf82d03968c..e5535e5b3dc5 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
>  				unsigned int length);
>  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
>  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
> -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> -				  int pextents);
>  
>  /*
>   * Test whether an inode is a fast symlink.
> @@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode)
>  {
>  	handle_t *handle;
>  	int err;
> +	int extra_credits = 3;
> +	struct ext4_xattr_ino_array *lea_ino_array = NULL;
>  
>  	trace_ext4_evict_inode(inode);
>  
> @@ -238,8 +238,8 @@ void ext4_evict_inode(struct inode *inode)
>  	 * protection against it
>  	 */
>  	sb_start_intwrite(inode->i_sb);
> -	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
> -				    ext4_blocks_for_truncate(inode)+3);
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
>  	if (IS_ERR(handle)) {
>  		ext4_std_error(inode->i_sb, PTR_ERR(handle));
>  		/*
> @@ -251,9 +251,36 @@ void ext4_evict_inode(struct inode *inode)
>  		sb_end_intwrite(inode->i_sb);
>  		goto no_delete;
>  	}
> -
>  	if (IS_SYNC(inode))
>  		ext4_handle_sync(handle);
> +
> +	/*
> +	 * Delete xattr inode before deleting the main inode.
> +	 */
> +	err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
> +	if (err) {
> +		ext4_warning(inode->i_sb,
> +			     "couldn't delete inode's xattr (err %d)", err);
> +		goto stop_handle;
> +	}
> +
> +	if (!IS_NOQUOTA(inode))
> +		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> +
> +	if (!ext4_handle_has_enough_credits(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits)) {
> +		err = ext4_journal_extend(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits);
> +		if (err > 0)
> +			err = ext4_journal_restart(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits);
> +		if (err != 0) {
> +			ext4_warning(inode->i_sb,
> +				     "couldn't extend journal (err %d)", err);
> +			goto stop_handle;
> +		}
> +	}
> +
>  	inode->i_size = 0;
>  	err = ext4_mark_inode_dirty(handle, inode);
>  	if (err) {
> @@ -277,10 +304,10 @@ void ext4_evict_inode(struct inode *inode)
>  	 * enough credits left in the handle to remove the inode from
>  	 * the orphan list and set the dtime field.
>  	 */
> -	if (!ext4_handle_has_enough_credits(handle, 3)) {
> -		err = ext4_journal_extend(handle, 3);
> +	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
> +		err = ext4_journal_extend(handle, extra_credits);
>  		if (err > 0)
> -			err = ext4_journal_restart(handle, 3);
> +			err = ext4_journal_restart(handle, extra_credits);
>  		if (err != 0) {
>  			ext4_warning(inode->i_sb,
>  				     "couldn't extend journal (err %d)", err);
> @@ -315,8 +342,12 @@ void ext4_evict_inode(struct inode *inode)
>  		ext4_clear_inode(inode);
>  	else
>  		ext4_free_inode(handle, inode);
> +
>  	ext4_journal_stop(handle);
>  	sb_end_intwrite(inode->i_sb);
> +
> +	if (lea_ino_array != NULL)
> +		ext4_xattr_inode_array_free(inode, lea_ino_array);
>  	return;
>  no_delete:
>  	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
> @@ -5504,7 +5535,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
>   *
>   * Also account for superblock, inode, quota and xattr blocks
>   */
> -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> +int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>  				  int pextents)
>  {
>  	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 5d3c2536641c..444be5c7a1d5 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -177,9 +177,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
>  
>  	/* Check the values */
>  	while (!IS_LAST_ENTRY(entry)) {
> -		if (entry->e_value_block != 0)
> -			return -EFSCORRUPTED;
> -		if (entry->e_value_size != 0) {
> +		if (entry->e_value_size != 0 &&
> +		    entry->e_value_inum == 0) {
>  			u16 offs = le16_to_cpu(entry->e_value_offs);
>  			u32 size = le32_to_cpu(entry->e_value_size);
>  			void *value;
> @@ -269,6 +268,99 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
>  	return cmp ? -ENODATA : 0;
>  }
>  
> +/*
> + * Read the EA value from an inode.
> + */
> +static int
> +ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
> +{
> +	unsigned long block = 0;
> +	struct buffer_head *bh = NULL;
> +	int blocksize;
> +	size_t csize, ret_size = 0;
> +
> +	if (*size == 0)
> +		return 0;
> +
> +	blocksize = ea_inode->i_sb->s_blocksize;
> +
> +	while (ret_size < *size) {
> +		csize = (*size - ret_size) > blocksize ? blocksize :
> +							*size - ret_size;
> +		bh = ext4_bread(NULL, ea_inode, block, 0);
> +		if (IS_ERR(bh)) {
> +			*size = ret_size;
> +			return PTR_ERR(bh);
> +		}
> +		memcpy(buf, bh->b_data, csize);
> +		brelse(bh);
> +
> +		buf += csize;
> +		block += 1;
> +		ret_size += csize;
> +	}
> +
> +	*size = ret_size;
> +
> +	return 0;
> +}
> +
> +struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
> +{
> +	struct inode *ea_inode = NULL;
> +
> +	ea_inode = ext4_iget(parent->i_sb, ea_ino);
> +	if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
> +		int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
> +		ext4_error(parent->i_sb, "error while reading EA inode %lu "
> +			   "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode));
> +		*err = rc != 0 ? rc : -EIO;
> +		return NULL;
> +	}
> +
> +	if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
> +	    ea_inode->i_generation != parent->i_generation) {
> +		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> +			   "to parent invalid.", ea_ino);
> +		*err = -EINVAL;
> +		goto error;
> +	}
> +
> +	if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
> +		ext4_error(parent->i_sb, "EA inode %lu does not have "
> +			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> +		*err = -EINVAL;
> +		goto error;
> +	}
> +
> +	*err = 0;
> +	return ea_inode;
> +
> +error:
> +	iput(ea_inode);
> +	return NULL;
> +}
> +
> +/*
> + * Read the value from the EA inode.
> + */
> +static int
> +ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
> +		     size_t *size)
> +{
> +	struct inode *ea_inode = NULL;
> +	int err;
> +
> +	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	iput(ea_inode);
> +
> +	return err;
> +}
> +
>  static int
>  ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
>  		     void *buffer, size_t buffer_size)
> @@ -308,8 +400,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
>  		error = -ERANGE;
>  		if (size > buffer_size)
>  			goto cleanup;
> -		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
> -		       size);
> +		if (entry->e_value_inum) {
> +			error = ext4_xattr_inode_get(inode,
> +					     le32_to_cpu(entry->e_value_inum),
> +					     buffer, &size);
> +			if (error)
> +				goto cleanup;
> +		} else {
> +			memcpy(buffer, bh->b_data +
> +			       le16_to_cpu(entry->e_value_offs), size);
> +		}
>  	}
>  	error = size;
>  
> @@ -350,8 +450,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
>  		error = -ERANGE;
>  		if (size > buffer_size)
>  			goto cleanup;
> -		memcpy(buffer, (void *)IFIRST(header) +
> -		       le16_to_cpu(entry->e_value_offs), size);
> +		if (entry->e_value_inum) {
> +			error = ext4_xattr_inode_get(inode,
> +					     le32_to_cpu(entry->e_value_inum),
> +					     buffer, &size);
> +			if (error)
> +				goto cleanup;
> +		} else {
> +			memcpy(buffer, (void *)IFIRST(header) +
> +			       le16_to_cpu(entry->e_value_offs), size);
> +		}
>  	}
>  	error = size;
>  
> @@ -620,7 +728,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
>  				    size_t *min_offs, void *base, int *total)
>  {
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			size_t offs = le16_to_cpu(last->e_value_offs);
>  			if (offs < *min_offs)
>  				*min_offs = offs;
> @@ -631,16 +739,173 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
>  	return (*min_offs - ((void *)last - base) - sizeof(__u32));
>  }
>  
> -static int
> -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
> +/*
> + * Write the value of the EA in an inode.
> + */
> +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
> +				  const void *buf, int bufsize)
> +{
> +	struct buffer_head *bh = NULL;
> +	unsigned long block = 0;
> +	unsigned blocksize = ea_inode->i_sb->s_blocksize;
> +	unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
> +	int csize, wsize = 0;
> +	int ret = 0;
> +	int retries = 0;
> +
> +retry:
> +	while (ret >= 0 && ret < max_blocks) {
> +		struct ext4_map_blocks map;
> +		map.m_lblk = block += ret;
> +		map.m_len = max_blocks -= ret;
> +
> +		ret = ext4_map_blocks(handle, ea_inode, &map,
> +				      EXT4_GET_BLOCKS_CREATE);
> +		if (ret <= 0) {
> +			ext4_mark_inode_dirty(handle, ea_inode);
> +			if (ret == -ENOSPC &&
> +			    ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
> +				ret = 0;
> +				goto retry;
> +			}
> +			break;
> +		}
> +	}
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	block = 0;
> +	while (wsize < bufsize) {
> +		if (bh != NULL)
> +			brelse(bh);
> +		csize = (bufsize - wsize) > blocksize ? blocksize :
> +								bufsize - wsize;
> +		bh = ext4_getblk(handle, ea_inode, block, 0);
> +		if (IS_ERR(bh)) {
> +			ret = PTR_ERR(bh);
> +			goto out;
> +		}
> +		ret = ext4_journal_get_write_access(handle, bh);
> +		if (ret)
> +			goto out;
> +
> +		memcpy(bh->b_data, buf, csize);
> +		set_buffer_uptodate(bh);
> +		ext4_handle_dirty_metadata(handle, ea_inode, bh);
> +
> +		buf += csize;
> +		wsize += csize;
> +		block += 1;
> +	}
> +
> +	inode_lock(ea_inode);
> +	i_size_write(ea_inode, wsize);
> +	ext4_update_i_disksize(ea_inode, wsize);
> +	inode_unlock(ea_inode);
> +
> +	ext4_mark_inode_dirty(handle, ea_inode);
> +
> +out:
> +	brelse(bh);
> +
> +	return ret;
> +}
> +
> +/*
> + * Create an inode to store the value of a large EA.
> + */
> +static struct inode *ext4_xattr_inode_create(handle_t *handle,
> +					     struct inode *inode)
> +{
> +	struct inode *ea_inode = NULL;
> +
> +	/*
> +	 * Let the next inode be the goal, so we try and allocate the EA inode
> +	 * in the same group, or nearby one.
> +	 */
> +	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
> +				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
> +	if (!IS_ERR(ea_inode)) {
> +		ea_inode->i_op = &ext4_file_inode_operations;
> +		ea_inode->i_fop = &ext4_file_operations;
> +		ext4_set_aops(ea_inode);
> +		ea_inode->i_generation = inode->i_generation;
> +		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> +
> +		/*
> +		 * A back-pointer from EA inode to parent inode will be useful
> +		 * for e2fsck.
> +		 */
> +		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
> +		unlock_new_inode(ea_inode);
> +	}
> +
> +	return ea_inode;
> +}
> +
> +/*
> + * Unlink the inode storing the value of the EA.
> + */
> +int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +{
> +	struct inode *ea_inode = NULL;
> +	int err;
> +
> +	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
> +	if (err)
> +		return err;
> +
> +	clear_nlink(ea_inode);
> +	iput(ea_inode);
> +
> +	return 0;
> +}
> +
> +/*
> + * Add value of the EA in an inode.
> + */
> +static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> +				unsigned long *ea_ino, const void *value,
> +				size_t value_len)
> +{
> +	struct inode *ea_inode;
> +	int err;
> +
> +	/* Create an inode for the EA value */
> +	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	if (IS_ERR(ea_inode))
> +		return PTR_ERR(ea_inode);
> +
> +	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> +	if (err)
> +		clear_nlink(ea_inode);
> +	else
> +		*ea_ino = ea_inode->i_ino;
> +
> +	iput(ea_inode);
> +
> +	return err;
> +}
> +
> +static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> +				struct ext4_xattr_search *s,
> +				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
> +	int in_inode = i->in_inode;
> +	int rc;
> +
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    (EXT4_XATTR_SIZE(i->value_len) >
> +	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
> +		in_inode = 1;
>  
>  	/* Compute min_offs and last. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			size_t offs = le16_to_cpu(last->e_value_offs);
>  			if (offs < min_offs)
>  				min_offs = offs;
> @@ -648,15 +913,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (s->here->e_value_size) {
> +		if (!in_inode &&
> +		    !s->here->e_value_inum && s->here->e_value_size) {
>  			size_t size = le32_to_cpu(s->here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		if (free < EXT4_XATTR_LEN(name_len) +
> -			   EXT4_XATTR_SIZE(i->value_len))
> +		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +
> +		if (in_inode)
> +			value_len = 0;
> +
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len)
>  			return -ENOSPC;
>  	}
>  
> @@ -670,7 +940,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  		s->here->e_name_len = name_len;
>  		memcpy(s->here->e_name, i->name, name_len);
>  	} else {
> -		if (s->here->e_value_size) {
> +		if (!s->here->e_value_inum && s->here->e_value_size &&
> +		    s->here->e_value_offs > 0) {
>  			void *first_val = s->base + min_offs;
>  			size_t offs = le16_to_cpu(s->here->e_value_offs);
>  			void *val = s->base + offs;
> @@ -704,12 +975,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  			last = s->first;
>  			while (!IS_LAST_ENTRY(last)) {
>  				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (last->e_value_size && o < offs)
> +				if (!last->e_value_inum &&
> +				    last->e_value_size && o < offs)
>  					last->e_value_offs =
>  						cpu_to_le16(o + size);
>  				last = EXT4_XATTR_NEXT(last);
>  			}
>  		}
> +		if (s->here->e_value_inum) {
> +			ext4_xattr_inode_unlink(inode,
> +					    le32_to_cpu(s->here->e_value_inum));
> +			s->here->e_value_inum = 0;
> +		}
>  		if (!i->value) {
>  			/* Remove the old name. */
>  			size_t size = EXT4_XATTR_LEN(name_len);
> @@ -722,11 +999,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  
>  	if (i->value) {
>  		/* Insert the new value. */
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> -		if (i->value_len) {
> +		if (in_inode) {
> +			unsigned long ea_ino =
> +				le32_to_cpu(s->here->e_value_inum);
> +			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> +						  i->value, i->value_len);
> +			if (rc)
> +				goto out;
> +			s->here->e_value_inum = cpu_to_le32(ea_ino);
> +			s->here->e_value_offs = 0;
> +		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
>  			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> +			s->here->e_value_inum = 0;
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> @@ -736,8 +1022,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  				memcpy(val, i->value, i->value_len);
>  			}
>  		}
> +		s->here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -	return 0;
> +
> +out:
> +	return rc;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -801,8 +1090,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> -	if (i->value && i->value_len > sb->s_blocksize)
> -		return -ENOSPC;
>  	if (s->base) {
>  		BUFFER_TRACE(bs->bh, "get_write_access");
>  		error = ext4_journal_get_write_access(handle, bs->bh);
> @@ -821,7 +1108,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			mb_cache_entry_delete_block(ext4_mb_cache, hash,
>  						    bs->bh->b_blocknr);
>  			ea_bdebug(bs->bh, "modifying in-place");
> -			error = ext4_xattr_set_entry(i, s);
> +			error = ext4_xattr_set_entry(i, s, handle, inode);
>  			if (!error) {
>  				if (!IS_LAST_ENTRY(s->first))
>  					ext4_xattr_rehash(header(s->base),
> @@ -870,7 +1157,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		s->end = s->base + sb->s_blocksize;
>  	}
>  
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error == -EFSCORRUPTED)
>  		goto bad_block;
>  	if (error)
> @@ -1070,7 +1357,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  
>  	if (EXT4_I(inode)->i_extra_isize == 0)
>  		return -ENOSPC;
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error) {
>  		if (error == -ENOSPC &&
>  		    ext4_has_inline_data(inode)) {
> @@ -1082,7 +1369,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  			error = ext4_xattr_ibody_find(inode, i, is);
>  			if (error)
>  				return error;
> -			error = ext4_xattr_set_entry(i, s);
> +			error = ext4_xattr_set_entry(i, s, handle, inode);
>  		}
>  		if (error)
>  			return error;
> @@ -1098,7 +1385,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> -static int ext4_xattr_ibody_set(struct inode *inode,
> +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  				struct ext4_xattr_info *i,
>  				struct ext4_xattr_ibody_find *is)
>  {
> @@ -1108,7 +1395,7 @@ static int ext4_xattr_ibody_set(struct inode *inode,
>  
>  	if (EXT4_I(inode)->i_extra_isize == 0)
>  		return -ENOSPC;
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error)
>  		return error;
>  	header = IHDR(inode, ext4_raw_inode(&is->iloc));
> @@ -1155,7 +1442,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		.name = name,
>  		.value = value,
>  		.value_len = value_len,
> -
> +		.in_inode = 0,
>  	};
>  	struct ext4_xattr_ibody_find is = {
>  		.s = { .not_found = -ENODATA, },
> @@ -1204,7 +1491,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	}
>  	if (!value) {
>  		if (!is.s.not_found)
> -			error = ext4_xattr_ibody_set(inode, &i, &is);
> +			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>  		else if (!bs.s.not_found)
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
>  	} else {
> @@ -1215,7 +1502,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
>  			goto cleanup;
>  
> -		error = ext4_xattr_ibody_set(inode, &i, &is);
> +		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>  		if (!error && !bs.s.not_found) {
>  			i.value = NULL;
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
> @@ -1226,11 +1513,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  					goto cleanup;
>  			}
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
> +			if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +			    error == -ENOSPC) {
> +				/* xattr not fit to block, store at external
> +				 * inode */
> +				i.in_inode = 1;
> +				error = ext4_xattr_ibody_set(handle, inode,
> +							     &i, &is);
> +			}
>  			if (error)
>  				goto cleanup;
>  			if (!is.s.not_found) {
>  				i.value = NULL;
> -				error = ext4_xattr_ibody_set(inode, &i, &is);
> +				error = ext4_xattr_ibody_set(handle, inode, &i,
> +							     &is);
>  			}
>  		}
>  	}
> @@ -1269,12 +1565,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  	       const void *value, size_t value_len, int flags)
>  {
>  	handle_t *handle;
> +	struct super_block *sb = inode->i_sb;
>  	int error, retries = 0;
>  	int credits = ext4_jbd2_credits_xattr(inode);
>  
>  	error = dquot_initialize(inode);
>  	if (error)
>  		return error;
> +
> +	if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
> +	    ext4_has_feature_ea_inode(sb)) {
> +		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> +					sb->s_blocksize_bits;
> +
> +		/* For new inode */
> +		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +
> +		/* For data blocks of EA inode */
> +		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	}
> +
>  retry:
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
> @@ -1286,7 +1596,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  					      value, value_len, flags);
>  		error2 = ext4_journal_stop(handle);
>  		if (error == -ENOSPC &&
> -		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +		    ext4_should_retry_alloc(sb, &retries))
>  			goto retry;
>  		if (error == 0)
>  			error = error2;
> @@ -1311,7 +1621,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
>  
>  	/* Adjust the value offsets of the entries */
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			new_offs = le16_to_cpu(last->e_value_offs) +
>  							value_offs_shift;
>  			last->e_value_offs = cpu_to_le16(new_offs);
> @@ -1372,7 +1682,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
>  		goto out;
>  
>  	/* Remove the chosen entry from the inode */
> -	error = ext4_xattr_ibody_set(inode, &i, is);
> +	error = ext4_xattr_ibody_set(handle, inode, &i, is);
>  	if (error)
>  		goto out;
>  
> @@ -1572,21 +1882,135 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  }
>  
>  
> +#define EIA_INCR 16 /* must be 2^n */
> +#define EIA_MASK (EIA_INCR - 1)
> +/* Add the large xattr @ino into @lea_ino_array for later deletion.
> + * If @lea_ino_array is new or full it will be grown and the old
> + * contents copied over.
> + */
> +static int
> +ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
> +{
> +	if (*lea_ino_array == NULL) {
> +		/*
> +		 * Start with 15 inodes, so it fits into a power-of-two size.
> +		 * If *lea_ino_array is NULL, this is essentially offsetof()
> +		 */
> +		(*lea_ino_array) =
> +			kmalloc(offsetof(struct ext4_xattr_ino_array,
> +					 xia_inodes[EIA_MASK]),
> +				GFP_NOFS);
> +		if (*lea_ino_array == NULL)
> +			return -ENOMEM;
> +		(*lea_ino_array)->xia_count = 0;
> +	} else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
> +		/* expand the array once all 15 + n * 16 slots are full */
> +		struct ext4_xattr_ino_array *new_array = NULL;
> +		int count = (*lea_ino_array)->xia_count;
> +
> +		/* if new_array is NULL, this is essentially offsetof() */
> +		new_array = kmalloc(
> +				offsetof(struct ext4_xattr_ino_array,
> +					 xia_inodes[count + EIA_INCR]),
> +				GFP_NOFS);
> +		if (new_array == NULL)
> +			return -ENOMEM;
> +		memcpy(new_array, *lea_ino_array,
> +		       offsetof(struct ext4_xattr_ino_array,
> +				xia_inodes[count]));
> +		kfree(*lea_ino_array);
> +		*lea_ino_array = new_array;
> +	}
> +	(*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
> +	return 0;
> +}
> +
> +/**
> + * Add xattr inode to orphan list
> + */
> +static int
> +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
> +			int credits, struct ext4_xattr_ino_array *lea_ino_array)
> +{
> +	struct inode *ea_inode = NULL;
> +	int idx = 0, error = 0;
> +
> +	if (lea_ino_array == NULL)
> +		return 0;
> +
> +	for (; idx < lea_ino_array->xia_count; ++idx) {
> +		if (!ext4_handle_has_enough_credits(handle, credits)) {
> +			error = ext4_journal_extend(handle, credits);
> +			if (error > 0)
> +				error = ext4_journal_restart(handle, credits);
> +
> +			if (error != 0) {
> +				ext4_warning(inode->i_sb,
> +					"couldn't extend journal "
> +					"(err %d)", error);
> +				return error;
> +			}
> +		}
> +		ea_inode = ext4_xattr_inode_iget(inode,
> +				lea_ino_array->xia_inodes[idx], &error);
> +		if (error)
> +			continue;
> +		ext4_orphan_add(handle, ea_inode);
> +		/* the inode's i_count will be released by caller */
> +	}
> +
> +	return 0;
> +}
>  
>  /*
>   * ext4_xattr_delete_inode()
>   *
> - * Free extended attribute resources associated with this inode. This
> + * Free extended attribute resources associated with this inode. Traverse
> + * all entries and unlink any xattr inodes associated with this inode. This
>   * is called immediately before an inode is freed. We have exclusive
> - * access to the inode.
> + * access to the inode. If an orphan inode is deleted it will also delete any
> + * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> + * to ensure they belong to the parent inode and were not deleted already.
>   */
> -void
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
> +int
> +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			struct ext4_xattr_ino_array **lea_ino_array)
>  {
>  	struct buffer_head *bh = NULL;
> +	struct ext4_xattr_ibody_header *header;
> +	struct ext4_inode *raw_inode;
> +	struct ext4_iloc iloc;
> +	struct ext4_xattr_entry *entry;
> +	int credits = 3, error = 0;
>  
> -	if (!EXT4_I(inode)->i_file_acl)
> +	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> +		goto delete_external_ea;
> +
> +	error = ext4_get_inode_loc(inode, &iloc);
> +	if (error)
> +		goto cleanup;
> +	raw_inode = ext4_raw_inode(&iloc);
> +	header = IHDR(inode, raw_inode);
> +	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		if (ext4_expand_ino_array(lea_ino_array,
> +					  entry->e_value_inum) != 0) {
> +			brelse(iloc.bh);
> +			goto cleanup;
> +		}
> +		entry->e_value_inum = 0;
> +	}
> +	brelse(iloc.bh);
> +
> +delete_external_ea:
> +	if (!EXT4_I(inode)->i_file_acl) {
> +		/* add xattr inode to orphan list */
> +		ext4_xattr_inode_orphan_add(handle, inode, credits,
> +						*lea_ino_array);
>  		goto cleanup;
> +	}
>  	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
>  	if (!bh) {
>  		EXT4_ERROR_INODE(inode, "block %llu read error",
> @@ -1599,11 +2023,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
>  				 EXT4_I(inode)->i_file_acl);
>  		goto cleanup;
>  	}
> +
> +	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		if (ext4_expand_ino_array(lea_ino_array,
> +					  entry->e_value_inum) != 0)
> +			goto cleanup;
> +		entry->e_value_inum = 0;
> +	}
> +
> +	/* add xattr inode to orphan list */
> +	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> +					*lea_ino_array);
> +	if (error != 0)
> +		goto cleanup;
> +
> +	if (!IS_NOQUOTA(inode))
> +		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> +
> +	if (!ext4_handle_has_enough_credits(handle, credits)) {
> +		error = ext4_journal_extend(handle, credits);
> +		if (error > 0)
> +			error = ext4_journal_restart(handle, credits);
> +		if (error != 0) {
> +			ext4_warning(inode->i_sb,
> +				"couldn't extend journal (err %d)", error);
> +			goto cleanup;
> +		}
> +	}
> +
>  	ext4_xattr_release_block(handle, inode, bh);
>  	EXT4_I(inode)->i_file_acl = 0;
>  
>  cleanup:
>  	brelse(bh);
> +
> +	return error;
> +}
> +
> +void
> +ext4_xattr_inode_array_free(struct inode *inode,
> +			    struct ext4_xattr_ino_array *lea_ino_array)
> +{
> +	struct inode	*ea_inode = NULL;
> +	int		idx = 0;
> +	int		err;
> +
> +	if (lea_ino_array == NULL)
> +		return;
> +
> +	for (; idx < lea_ino_array->xia_count; ++idx) {
> +		ea_inode = ext4_xattr_inode_iget(inode,
> +				lea_ino_array->xia_inodes[idx], &err);
> +		if (err)
> +			continue;
> +		/* for inode's i_count get from ext4_xattr_delete_inode */
> +		if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
> +			iput(ea_inode);
> +		clear_nlink(ea_inode);
> +		iput(ea_inode);
> +	}
> +	kfree(lea_ino_array);
>  }
>  
>  /*
> @@ -1655,10 +2137,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
>  		    entry1->e_name_index != entry2->e_name_index ||
>  		    entry1->e_name_len != entry2->e_name_len ||
>  		    entry1->e_value_size != entry2->e_value_size ||
> +		    entry1->e_value_inum != entry2->e_value_inum ||
>  		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
>  			return 1;
> -		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
> -			return -EFSCORRUPTED;
>  		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
>  			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
>  			   le32_to_cpu(entry1->e_value_size)))
> @@ -1730,7 +2211,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
>  		       *name++;
>  	}
>  
> -	if (entry->e_value_size != 0) {
> +	if (!entry->e_value_inum && entry->e_value_size) {
>  		__le32 *value = (__le32 *)((char *)header +
>  			le16_to_cpu(entry->e_value_offs));
>  		for (n = (le32_to_cpu(entry->e_value_size) +
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 099c8b670ef5..6e10ff9393d4 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -44,7 +44,7 @@ struct ext4_xattr_entry {
>  	__u8	e_name_len;	/* length of name */
>  	__u8	e_name_index;	/* attribute name index */
>  	__le16	e_value_offs;	/* offset in disk block of value */
> -	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
> +	__le32	e_value_inum;	/* inode in which the value is stored */
>  	__le32	e_value_size;	/* size of attribute value */
>  	__le32	e_hash;		/* hash value of name and value */
>  	char	e_name[0];	/* attribute name */
> @@ -69,6 +69,26 @@ struct ext4_xattr_entry {
>  		EXT4_I(inode)->i_extra_isize))
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
> +/*
> + * Link EA inode back to parent one using i_mtime field.
> + * Extra integer type conversion added to ignore higher
> + * bits in i_mtime.tv_sec which might be set by ext4_get()
> + */
> +#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> +do {                                                  \
> +      (inode)->i_mtime.tv_sec = inum;                 \
> +} while(0)
> +
> +#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> +((__u32)(inode)->i_mtime.tv_sec)
> +
> +/*
> + * The minimum size of an EA value at which it is stored in an external inode:
> + * size of block - size of header - size of 1 entry - 4 null bytes.
> +*/
> +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)					\
> +	((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
> +
>  #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
>  #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
>  #define BFIRST(bh) ENTRY(BHDR(bh)+1)
> @@ -77,10 +97,11 @@ struct ext4_xattr_entry {
>  #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
>  
>  struct ext4_xattr_info {
> -	int name_index;
>  	const char *name;
>  	const void *value;
>  	size_t value_len;
> +	int name_index;
> +	int in_inode;
>  };
>  
>  struct ext4_xattr_search {
> @@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
>  
> -extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
> +extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
> +					   int *err);
> +extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
> +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +				   struct ext4_xattr_ino_array **array);
> +extern void ext4_xattr_inode_array_free(struct inode *inode,
> +					struct ext4_xattr_ino_array *array);
>  
>  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  			    struct ext4_inode *raw_inode, handle_t *handle);
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 01/28] ext4: xattr-in-inode support
@ 2017-05-31 16:42   ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-05-31 16:42 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	Kalpak Shah, linux-kernel, reiserfs-devel, Jens Axboe,
	linux-fsdevel, Mike Christie, Andreas Dilger, Alexander Viro,
	Jan Kara, Fabian Frederick, Andreas Dilger, linux-ext4,
	James Simmons, ocfs2-devel

On Wed, May 31, 2017 at 01:14:50AM -0700, Tahsin Erdogan wrote:
> From: Andreas Dilger <andreas.dilger@intel.com>
> 
> Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE.
> 
> If the size of an xattr value is larger than will fit in a single
> external block, then the xattr value will be saved into the body
> of an external xattr inode.
> 
> This also helps support a larger number of xattrs, since only the headers
> will be stored in the in-inode space or the single external block.
> 
> The inode is referenced from the xattr header via "e_value_inum",
> which was formerly "e_value_block", but that field was never used.
> The e_value_size still contains the xattr size so that listing
> xattrs does not need to look up the inode if the data is not accessed.
> 
> struct ext4_xattr_entry {
>         __u8    e_name_len;     /* length of name */
>         __u8    e_name_index;   /* attribute name index */
>         __le16  e_value_offs;   /* offset in disk block of value */
>         __le32  e_value_inum;   /* inode in which value is stored */
>         __le32  e_value_size;   /* size of attribute value */
>         __le32  e_hash;         /* hash value of name and value */
>         char    e_name[0];      /* attribute name */
> };
> 
> The xattr inode is marked with the EXT4_EA_INODE_FL flag and also
> holds a back-reference to the owning inode in its i_mtime field,
> allowing ext4/e2fsck to verify that the correct inode is accessed.

Can we store the checksum of the xattr value somewhere?  We already
checksum the values if they're stored in the ibody or a single external
block, and I'd hate to lose that protection.

We could probably reuse one of the inode fields (i_version?) for this.

--D 

> Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80
> Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424
> Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
> Signed-off-by: James Simmons <uja.ornl@gmail.com>
> Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext4/ext4.h   |  12 ++
>  fs/ext4/ialloc.c |   1 -
>  fs/ext4/inline.c |   2 +-
>  fs/ext4/inode.c  |  49 ++++-
>  fs/ext4/xattr.c  | 565 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/ext4/xattr.h  |  33 +++-
>  6 files changed, 606 insertions(+), 56 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 32191548abed..24ef56b4572f 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1797,6 +1797,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
>  					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
>  					 EXT4_FEATURE_INCOMPAT_64BIT| \
>  					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
> +					 EXT4_FEATURE_INCOMPAT_EA_INODE| \
>  					 EXT4_FEATURE_INCOMPAT_MMP | \
>  					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>  					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> @@ -2220,6 +2221,12 @@ struct mmpd_data {
>  #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
>  
>  /*
> + * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
> + * This limit is arbitrary, but is reasonable for the xattr API.
> + */
> +#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
> +
> +/*
>   * Function prototypes
>   */
>  
> @@ -2231,6 +2238,10 @@ struct mmpd_data {
>  # define ATTRIB_NORET	__attribute__((noreturn))
>  # define NORET_AND	noreturn,
>  
> +struct ext4_xattr_ino_array {
> +	unsigned int xia_count;		/* # of used item in the array */
> +	unsigned int xia_inodes[0];
> +};
>  /* bitmap.c */
>  extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
>  void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
> @@ -2478,6 +2489,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
>  extern void ext4_set_inode_flags(struct inode *);
>  extern int ext4_alloc_da_blocks(struct inode *inode);
>  extern void ext4_set_aops(struct inode *inode);
> +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
>  extern int ext4_writepage_trans_blocks(struct inode *);
>  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
>  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 98ac2f1f23b3..e2eb3cc06820 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
>  	 * as writing the quota to disk may need the lock as well.
>  	 */
>  	dquot_initialize(inode);
> -	ext4_xattr_delete_inode(handle, inode);
>  	dquot_free_inode(inode);
>  	dquot_drop(inode);
>  
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 8d141c0c8ff9..28c5c3abddb3 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
>  
>  	/* Compute min_offs. */
>  	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_block && entry->e_value_size) {
> +		if (!entry->e_value_inum && entry->e_value_size) {
>  			size_t offs = le16_to_cpu(entry->e_value_offs);
>  			if (offs < min_offs)
>  				min_offs = offs;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5cf82d03968c..e5535e5b3dc5 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
>  				unsigned int length);
>  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
>  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
> -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> -				  int pextents);
>  
>  /*
>   * Test whether an inode is a fast symlink.
> @@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode)
>  {
>  	handle_t *handle;
>  	int err;
> +	int extra_credits = 3;
> +	struct ext4_xattr_ino_array *lea_ino_array = NULL;
>  
>  	trace_ext4_evict_inode(inode);
>  
> @@ -238,8 +238,8 @@ void ext4_evict_inode(struct inode *inode)
>  	 * protection against it
>  	 */
>  	sb_start_intwrite(inode->i_sb);
> -	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
> -				    ext4_blocks_for_truncate(inode)+3);
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
>  	if (IS_ERR(handle)) {
>  		ext4_std_error(inode->i_sb, PTR_ERR(handle));
>  		/*
> @@ -251,9 +251,36 @@ void ext4_evict_inode(struct inode *inode)
>  		sb_end_intwrite(inode->i_sb);
>  		goto no_delete;
>  	}
> -
>  	if (IS_SYNC(inode))
>  		ext4_handle_sync(handle);
> +
> +	/*
> +	 * Delete xattr inode before deleting the main inode.
> +	 */
> +	err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
> +	if (err) {
> +		ext4_warning(inode->i_sb,
> +			     "couldn't delete inode's xattr (err %d)", err);
> +		goto stop_handle;
> +	}
> +
> +	if (!IS_NOQUOTA(inode))
> +		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> +
> +	if (!ext4_handle_has_enough_credits(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits)) {
> +		err = ext4_journal_extend(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits);
> +		if (err > 0)
> +			err = ext4_journal_restart(handle,
> +			ext4_blocks_for_truncate(inode) + extra_credits);
> +		if (err != 0) {
> +			ext4_warning(inode->i_sb,
> +				     "couldn't extend journal (err %d)", err);
> +			goto stop_handle;
> +		}
> +	}
> +
>  	inode->i_size = 0;
>  	err = ext4_mark_inode_dirty(handle, inode);
>  	if (err) {
> @@ -277,10 +304,10 @@ void ext4_evict_inode(struct inode *inode)
>  	 * enough credits left in the handle to remove the inode from
>  	 * the orphan list and set the dtime field.
>  	 */
> -	if (!ext4_handle_has_enough_credits(handle, 3)) {
> -		err = ext4_journal_extend(handle, 3);
> +	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
> +		err = ext4_journal_extend(handle, extra_credits);
>  		if (err > 0)
> -			err = ext4_journal_restart(handle, 3);
> +			err = ext4_journal_restart(handle, extra_credits);
>  		if (err != 0) {
>  			ext4_warning(inode->i_sb,
>  				     "couldn't extend journal (err %d)", err);
> @@ -315,8 +342,12 @@ void ext4_evict_inode(struct inode *inode)
>  		ext4_clear_inode(inode);
>  	else
>  		ext4_free_inode(handle, inode);
> +
>  	ext4_journal_stop(handle);
>  	sb_end_intwrite(inode->i_sb);
> +
> +	if (lea_ino_array != NULL)
> +		ext4_xattr_inode_array_free(inode, lea_ino_array);
>  	return;
>  no_delete:
>  	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
> @@ -5504,7 +5535,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
>   *
>   * Also account for superblock, inode, quota and xattr blocks
>   */
> -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> +int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>  				  int pextents)
>  {
>  	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 5d3c2536641c..444be5c7a1d5 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -177,9 +177,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
>  
>  	/* Check the values */
>  	while (!IS_LAST_ENTRY(entry)) {
> -		if (entry->e_value_block != 0)
> -			return -EFSCORRUPTED;
> -		if (entry->e_value_size != 0) {
> +		if (entry->e_value_size != 0 &&
> +		    entry->e_value_inum == 0) {
>  			u16 offs = le16_to_cpu(entry->e_value_offs);
>  			u32 size = le32_to_cpu(entry->e_value_size);
>  			void *value;
> @@ -269,6 +268,99 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
>  	return cmp ? -ENODATA : 0;
>  }
>  
> +/*
> + * Read the EA value from an inode.
> + */
> +static int
> +ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
> +{
> +	unsigned long block = 0;
> +	struct buffer_head *bh = NULL;
> +	int blocksize;
> +	size_t csize, ret_size = 0;
> +
> +	if (*size == 0)
> +		return 0;
> +
> +	blocksize = ea_inode->i_sb->s_blocksize;
> +
> +	while (ret_size < *size) {
> +		csize = (*size - ret_size) > blocksize ? blocksize :
> +							*size - ret_size;
> +		bh = ext4_bread(NULL, ea_inode, block, 0);
> +		if (IS_ERR(bh)) {
> +			*size = ret_size;
> +			return PTR_ERR(bh);
> +		}
> +		memcpy(buf, bh->b_data, csize);
> +		brelse(bh);
> +
> +		buf += csize;
> +		block += 1;
> +		ret_size += csize;
> +	}
> +
> +	*size = ret_size;
> +
> +	return 0;
> +}
> +
> +struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
> +{
> +	struct inode *ea_inode = NULL;
> +
> +	ea_inode = ext4_iget(parent->i_sb, ea_ino);
> +	if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
> +		int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
> +		ext4_error(parent->i_sb, "error while reading EA inode %lu "
> +			   "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode));
> +		*err = rc != 0 ? rc : -EIO;
> +		return NULL;
> +	}
> +
> +	if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
> +	    ea_inode->i_generation != parent->i_generation) {
> +		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> +			   "to parent invalid.", ea_ino);
> +		*err = -EINVAL;
> +		goto error;
> +	}
> +
> +	if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
> +		ext4_error(parent->i_sb, "EA inode %lu does not have "
> +			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> +		*err = -EINVAL;
> +		goto error;
> +	}
> +
> +	*err = 0;
> +	return ea_inode;
> +
> +error:
> +	iput(ea_inode);
> +	return NULL;
> +}
> +
> +/*
> + * Read the value from the EA inode.
> + */
> +static int
> +ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
> +		     size_t *size)
> +{
> +	struct inode *ea_inode = NULL;
> +	int err;
> +
> +	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	iput(ea_inode);
> +
> +	return err;
> +}
> +
>  static int
>  ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
>  		     void *buffer, size_t buffer_size)
> @@ -308,8 +400,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
>  		error = -ERANGE;
>  		if (size > buffer_size)
>  			goto cleanup;
> -		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
> -		       size);
> +		if (entry->e_value_inum) {
> +			error = ext4_xattr_inode_get(inode,
> +					     le32_to_cpu(entry->e_value_inum),
> +					     buffer, &size);
> +			if (error)
> +				goto cleanup;
> +		} else {
> +			memcpy(buffer, bh->b_data +
> +			       le16_to_cpu(entry->e_value_offs), size);
> +		}
>  	}
>  	error = size;
>  
> @@ -350,8 +450,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
>  		error = -ERANGE;
>  		if (size > buffer_size)
>  			goto cleanup;
> -		memcpy(buffer, (void *)IFIRST(header) +
> -		       le16_to_cpu(entry->e_value_offs), size);
> +		if (entry->e_value_inum) {
> +			error = ext4_xattr_inode_get(inode,
> +					     le32_to_cpu(entry->e_value_inum),
> +					     buffer, &size);
> +			if (error)
> +				goto cleanup;
> +		} else {
> +			memcpy(buffer, (void *)IFIRST(header) +
> +			       le16_to_cpu(entry->e_value_offs), size);
> +		}
>  	}
>  	error = size;
>  
> @@ -620,7 +728,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
>  				    size_t *min_offs, void *base, int *total)
>  {
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			size_t offs = le16_to_cpu(last->e_value_offs);
>  			if (offs < *min_offs)
>  				*min_offs = offs;
> @@ -631,16 +739,173 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
>  	return (*min_offs - ((void *)last - base) - sizeof(__u32));
>  }
>  
> -static int
> -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
> +/*
> + * Write the value of the EA in an inode.
> + */
> +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
> +				  const void *buf, int bufsize)
> +{
> +	struct buffer_head *bh = NULL;
> +	unsigned long block = 0;
> +	unsigned blocksize = ea_inode->i_sb->s_blocksize;
> +	unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
> +	int csize, wsize = 0;
> +	int ret = 0;
> +	int retries = 0;
> +
> +retry:
> +	while (ret >= 0 && ret < max_blocks) {
> +		struct ext4_map_blocks map;
> +		map.m_lblk = block += ret;
> +		map.m_len = max_blocks -= ret;
> +
> +		ret = ext4_map_blocks(handle, ea_inode, &map,
> +				      EXT4_GET_BLOCKS_CREATE);
> +		if (ret <= 0) {
> +			ext4_mark_inode_dirty(handle, ea_inode);
> +			if (ret == -ENOSPC &&
> +			    ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
> +				ret = 0;
> +				goto retry;
> +			}
> +			break;
> +		}
> +	}
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	block = 0;
> +	while (wsize < bufsize) {
> +		if (bh != NULL)
> +			brelse(bh);
> +		csize = (bufsize - wsize) > blocksize ? blocksize :
> +								bufsize - wsize;
> +		bh = ext4_getblk(handle, ea_inode, block, 0);
> +		if (IS_ERR(bh)) {
> +			ret = PTR_ERR(bh);
> +			goto out;
> +		}
> +		ret = ext4_journal_get_write_access(handle, bh);
> +		if (ret)
> +			goto out;
> +
> +		memcpy(bh->b_data, buf, csize);
> +		set_buffer_uptodate(bh);
> +		ext4_handle_dirty_metadata(handle, ea_inode, bh);
> +
> +		buf += csize;
> +		wsize += csize;
> +		block += 1;
> +	}
> +
> +	inode_lock(ea_inode);
> +	i_size_write(ea_inode, wsize);
> +	ext4_update_i_disksize(ea_inode, wsize);
> +	inode_unlock(ea_inode);
> +
> +	ext4_mark_inode_dirty(handle, ea_inode);
> +
> +out:
> +	brelse(bh);
> +
> +	return ret;
> +}
> +
> +/*
> + * Create an inode to store the value of a large EA.
> + */
> +static struct inode *ext4_xattr_inode_create(handle_t *handle,
> +					     struct inode *inode)
> +{
> +	struct inode *ea_inode = NULL;
> +
> +	/*
> +	 * Let the next inode be the goal, so we try and allocate the EA inode
> +	 * in the same group, or nearby one.
> +	 */
> +	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
> +				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
> +	if (!IS_ERR(ea_inode)) {
> +		ea_inode->i_op = &ext4_file_inode_operations;
> +		ea_inode->i_fop = &ext4_file_operations;
> +		ext4_set_aops(ea_inode);
> +		ea_inode->i_generation = inode->i_generation;
> +		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> +
> +		/*
> +		 * A back-pointer from EA inode to parent inode will be useful
> +		 * for e2fsck.
> +		 */
> +		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
> +		unlock_new_inode(ea_inode);
> +	}
> +
> +	return ea_inode;
> +}
> +
> +/*
> + * Unlink the inode storing the value of the EA.
> + */
> +int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +{
> +	struct inode *ea_inode = NULL;
> +	int err;
> +
> +	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
> +	if (err)
> +		return err;
> +
> +	clear_nlink(ea_inode);
> +	iput(ea_inode);
> +
> +	return 0;
> +}
> +
> +/*
> + * Add value of the EA in an inode.
> + */
> +static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> +				unsigned long *ea_ino, const void *value,
> +				size_t value_len)
> +{
> +	struct inode *ea_inode;
> +	int err;
> +
> +	/* Create an inode for the EA value */
> +	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	if (IS_ERR(ea_inode))
> +		return PTR_ERR(ea_inode);
> +
> +	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> +	if (err)
> +		clear_nlink(ea_inode);
> +	else
> +		*ea_ino = ea_inode->i_ino;
> +
> +	iput(ea_inode);
> +
> +	return err;
> +}
> +
> +static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> +				struct ext4_xattr_search *s,
> +				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
> +	int in_inode = i->in_inode;
> +	int rc;
> +
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    (EXT4_XATTR_SIZE(i->value_len) >
> +	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
> +		in_inode = 1;
>  
>  	/* Compute min_offs and last. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			size_t offs = le16_to_cpu(last->e_value_offs);
>  			if (offs < min_offs)
>  				min_offs = offs;
> @@ -648,15 +913,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (s->here->e_value_size) {
> +		if (!in_inode &&
> +		    !s->here->e_value_inum && s->here->e_value_size) {
>  			size_t size = le32_to_cpu(s->here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		if (free < EXT4_XATTR_LEN(name_len) +
> -			   EXT4_XATTR_SIZE(i->value_len))
> +		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +
> +		if (in_inode)
> +			value_len = 0;
> +
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len)
>  			return -ENOSPC;
>  	}
>  
> @@ -670,7 +940,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  		s->here->e_name_len = name_len;
>  		memcpy(s->here->e_name, i->name, name_len);
>  	} else {
> -		if (s->here->e_value_size) {
> +		if (!s->here->e_value_inum && s->here->e_value_size &&
> +		    s->here->e_value_offs > 0) {
>  			void *first_val = s->base + min_offs;
>  			size_t offs = le16_to_cpu(s->here->e_value_offs);
>  			void *val = s->base + offs;
> @@ -704,12 +975,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  			last = s->first;
>  			while (!IS_LAST_ENTRY(last)) {
>  				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (last->e_value_size && o < offs)
> +				if (!last->e_value_inum &&
> +				    last->e_value_size && o < offs)
>  					last->e_value_offs =
>  						cpu_to_le16(o + size);
>  				last = EXT4_XATTR_NEXT(last);
>  			}
>  		}
> +		if (s->here->e_value_inum) {
> +			ext4_xattr_inode_unlink(inode,
> +					    le32_to_cpu(s->here->e_value_inum));
> +			s->here->e_value_inum = 0;
> +		}
>  		if (!i->value) {
>  			/* Remove the old name. */
>  			size_t size = EXT4_XATTR_LEN(name_len);
> @@ -722,11 +999,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  
>  	if (i->value) {
>  		/* Insert the new value. */
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> -		if (i->value_len) {
> +		if (in_inode) {
> +			unsigned long ea_ino =
> +				le32_to_cpu(s->here->e_value_inum);
> +			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> +						  i->value, i->value_len);
> +			if (rc)
> +				goto out;
> +			s->here->e_value_inum = cpu_to_le32(ea_ino);
> +			s->here->e_value_offs = 0;
> +		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
>  			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> +			s->here->e_value_inum = 0;
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> @@ -736,8 +1022,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
>  				memcpy(val, i->value, i->value_len);
>  			}
>  		}
> +		s->here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -	return 0;
> +
> +out:
> +	return rc;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -801,8 +1090,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> -	if (i->value && i->value_len > sb->s_blocksize)
> -		return -ENOSPC;
>  	if (s->base) {
>  		BUFFER_TRACE(bs->bh, "get_write_access");
>  		error = ext4_journal_get_write_access(handle, bs->bh);
> @@ -821,7 +1108,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			mb_cache_entry_delete_block(ext4_mb_cache, hash,
>  						    bs->bh->b_blocknr);
>  			ea_bdebug(bs->bh, "modifying in-place");
> -			error = ext4_xattr_set_entry(i, s);
> +			error = ext4_xattr_set_entry(i, s, handle, inode);
>  			if (!error) {
>  				if (!IS_LAST_ENTRY(s->first))
>  					ext4_xattr_rehash(header(s->base),
> @@ -870,7 +1157,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		s->end = s->base + sb->s_blocksize;
>  	}
>  
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error == -EFSCORRUPTED)
>  		goto bad_block;
>  	if (error)
> @@ -1070,7 +1357,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  
>  	if (EXT4_I(inode)->i_extra_isize == 0)
>  		return -ENOSPC;
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error) {
>  		if (error == -ENOSPC &&
>  		    ext4_has_inline_data(inode)) {
> @@ -1082,7 +1369,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  			error = ext4_xattr_ibody_find(inode, i, is);
>  			if (error)
>  				return error;
> -			error = ext4_xattr_set_entry(i, s);
> +			error = ext4_xattr_set_entry(i, s, handle, inode);
>  		}
>  		if (error)
>  			return error;
> @@ -1098,7 +1385,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> -static int ext4_xattr_ibody_set(struct inode *inode,
> +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  				struct ext4_xattr_info *i,
>  				struct ext4_xattr_ibody_find *is)
>  {
> @@ -1108,7 +1395,7 @@ static int ext4_xattr_ibody_set(struct inode *inode,
>  
>  	if (EXT4_I(inode)->i_extra_isize == 0)
>  		return -ENOSPC;
> -	error = ext4_xattr_set_entry(i, s);
> +	error = ext4_xattr_set_entry(i, s, handle, inode);
>  	if (error)
>  		return error;
>  	header = IHDR(inode, ext4_raw_inode(&is->iloc));
> @@ -1155,7 +1442,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		.name = name,
>  		.value = value,
>  		.value_len = value_len,
> -
> +		.in_inode = 0,
>  	};
>  	struct ext4_xattr_ibody_find is = {
>  		.s = { .not_found = -ENODATA, },
> @@ -1204,7 +1491,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	}
>  	if (!value) {
>  		if (!is.s.not_found)
> -			error = ext4_xattr_ibody_set(inode, &i, &is);
> +			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>  		else if (!bs.s.not_found)
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
>  	} else {
> @@ -1215,7 +1502,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
>  			goto cleanup;
>  
> -		error = ext4_xattr_ibody_set(inode, &i, &is);
> +		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>  		if (!error && !bs.s.not_found) {
>  			i.value = NULL;
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
> @@ -1226,11 +1513,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  					goto cleanup;
>  			}
>  			error = ext4_xattr_block_set(handle, inode, &i, &bs);
> +			if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +			    error == -ENOSPC) {
> +				/* xattr not fit to block, store at external
> +				 * inode */
> +				i.in_inode = 1;
> +				error = ext4_xattr_ibody_set(handle, inode,
> +							     &i, &is);
> +			}
>  			if (error)
>  				goto cleanup;
>  			if (!is.s.not_found) {
>  				i.value = NULL;
> -				error = ext4_xattr_ibody_set(inode, &i, &is);
> +				error = ext4_xattr_ibody_set(handle, inode, &i,
> +							     &is);
>  			}
>  		}
>  	}
> @@ -1269,12 +1565,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  	       const void *value, size_t value_len, int flags)
>  {
>  	handle_t *handle;
> +	struct super_block *sb = inode->i_sb;
>  	int error, retries = 0;
>  	int credits = ext4_jbd2_credits_xattr(inode);
>  
>  	error = dquot_initialize(inode);
>  	if (error)
>  		return error;
> +
> +	if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
> +	    ext4_has_feature_ea_inode(sb)) {
> +		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> +					sb->s_blocksize_bits;
> +
> +		/* For new inode */
> +		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +
> +		/* For data blocks of EA inode */
> +		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	}
> +
>  retry:
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
> @@ -1286,7 +1596,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  					      value, value_len, flags);
>  		error2 = ext4_journal_stop(handle);
>  		if (error == -ENOSPC &&
> -		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +		    ext4_should_retry_alloc(sb, &retries))
>  			goto retry;
>  		if (error == 0)
>  			error = error2;
> @@ -1311,7 +1621,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
>  
>  	/* Adjust the value offsets of the entries */
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> -		if (last->e_value_size) {
> +		if (!last->e_value_inum && last->e_value_size) {
>  			new_offs = le16_to_cpu(last->e_value_offs) +
>  							value_offs_shift;
>  			last->e_value_offs = cpu_to_le16(new_offs);
> @@ -1372,7 +1682,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
>  		goto out;
>  
>  	/* Remove the chosen entry from the inode */
> -	error = ext4_xattr_ibody_set(inode, &i, is);
> +	error = ext4_xattr_ibody_set(handle, inode, &i, is);
>  	if (error)
>  		goto out;
>  
> @@ -1572,21 +1882,135 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  }
>  
>  
> +#define EIA_INCR 16 /* must be 2^n */
> +#define EIA_MASK (EIA_INCR - 1)
> +/* Add the large xattr @ino into @lea_ino_array for later deletion.
> + * If @lea_ino_array is new or full it will be grown and the old
> + * contents copied over.
> + */
> +static int
> +ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
> +{
> +	if (*lea_ino_array == NULL) {
> +		/*
> +		 * Start with 15 inodes, so it fits into a power-of-two size.
> +		 * If *lea_ino_array is NULL, this is essentially offsetof()
> +		 */
> +		(*lea_ino_array) =
> +			kmalloc(offsetof(struct ext4_xattr_ino_array,
> +					 xia_inodes[EIA_MASK]),
> +				GFP_NOFS);
> +		if (*lea_ino_array == NULL)
> +			return -ENOMEM;
> +		(*lea_ino_array)->xia_count = 0;
> +	} else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
> +		/* expand the array once all 15 + n * 16 slots are full */
> +		struct ext4_xattr_ino_array *new_array = NULL;
> +		int count = (*lea_ino_array)->xia_count;
> +
> +		/* if new_array is NULL, this is essentially offsetof() */
> +		new_array = kmalloc(
> +				offsetof(struct ext4_xattr_ino_array,
> +					 xia_inodes[count + EIA_INCR]),
> +				GFP_NOFS);
> +		if (new_array == NULL)
> +			return -ENOMEM;
> +		memcpy(new_array, *lea_ino_array,
> +		       offsetof(struct ext4_xattr_ino_array,
> +				xia_inodes[count]));
> +		kfree(*lea_ino_array);
> +		*lea_ino_array = new_array;
> +	}
> +	(*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
> +	return 0;
> +}
> +
> +/**
> + * Add xattr inode to orphan list
> + */
> +static int
> +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
> +			int credits, struct ext4_xattr_ino_array *lea_ino_array)
> +{
> +	struct inode *ea_inode = NULL;
> +	int idx = 0, error = 0;
> +
> +	if (lea_ino_array == NULL)
> +		return 0;
> +
> +	for (; idx < lea_ino_array->xia_count; ++idx) {
> +		if (!ext4_handle_has_enough_credits(handle, credits)) {
> +			error = ext4_journal_extend(handle, credits);
> +			if (error > 0)
> +				error = ext4_journal_restart(handle, credits);
> +
> +			if (error != 0) {
> +				ext4_warning(inode->i_sb,
> +					"couldn't extend journal "
> +					"(err %d)", error);
> +				return error;
> +			}
> +		}
> +		ea_inode = ext4_xattr_inode_iget(inode,
> +				lea_ino_array->xia_inodes[idx], &error);
> +		if (error)
> +			continue;
> +		ext4_orphan_add(handle, ea_inode);
> +		/* the inode's i_count will be released by caller */
> +	}
> +
> +	return 0;
> +}
>  
>  /*
>   * ext4_xattr_delete_inode()
>   *
> - * Free extended attribute resources associated with this inode. This
> + * Free extended attribute resources associated with this inode. Traverse
> + * all entries and unlink any xattr inodes associated with this inode. This
>   * is called immediately before an inode is freed. We have exclusive
> - * access to the inode.
> + * access to the inode. If an orphan inode is deleted it will also delete any
> + * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> + * to ensure they belong to the parent inode and were not deleted already.
>   */
> -void
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
> +int
> +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			struct ext4_xattr_ino_array **lea_ino_array)
>  {
>  	struct buffer_head *bh = NULL;
> +	struct ext4_xattr_ibody_header *header;
> +	struct ext4_inode *raw_inode;
> +	struct ext4_iloc iloc;
> +	struct ext4_xattr_entry *entry;
> +	int credits = 3, error = 0;
>  
> -	if (!EXT4_I(inode)->i_file_acl)
> +	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> +		goto delete_external_ea;
> +
> +	error = ext4_get_inode_loc(inode, &iloc);
> +	if (error)
> +		goto cleanup;
> +	raw_inode = ext4_raw_inode(&iloc);
> +	header = IHDR(inode, raw_inode);
> +	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		if (ext4_expand_ino_array(lea_ino_array,
> +					  entry->e_value_inum) != 0) {
> +			brelse(iloc.bh);
> +			goto cleanup;
> +		}
> +		entry->e_value_inum = 0;
> +	}
> +	brelse(iloc.bh);
> +
> +delete_external_ea:
> +	if (!EXT4_I(inode)->i_file_acl) {
> +		/* add xattr inode to orphan list */
> +		ext4_xattr_inode_orphan_add(handle, inode, credits,
> +						*lea_ino_array);
>  		goto cleanup;
> +	}
>  	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
>  	if (!bh) {
>  		EXT4_ERROR_INODE(inode, "block %llu read error",
> @@ -1599,11 +2023,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
>  				 EXT4_I(inode)->i_file_acl);
>  		goto cleanup;
>  	}
> +
> +	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		if (ext4_expand_ino_array(lea_ino_array,
> +					  entry->e_value_inum) != 0)
> +			goto cleanup;
> +		entry->e_value_inum = 0;
> +	}
> +
> +	/* add xattr inode to orphan list */
> +	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> +					*lea_ino_array);
> +	if (error != 0)
> +		goto cleanup;
> +
> +	if (!IS_NOQUOTA(inode))
> +		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> +
> +	if (!ext4_handle_has_enough_credits(handle, credits)) {
> +		error = ext4_journal_extend(handle, credits);
> +		if (error > 0)
> +			error = ext4_journal_restart(handle, credits);
> +		if (error != 0) {
> +			ext4_warning(inode->i_sb,
> +				"couldn't extend journal (err %d)", error);
> +			goto cleanup;
> +		}
> +	}
> +
>  	ext4_xattr_release_block(handle, inode, bh);
>  	EXT4_I(inode)->i_file_acl = 0;
>  
>  cleanup:
>  	brelse(bh);
> +
> +	return error;
> +}
> +
> +void
> +ext4_xattr_inode_array_free(struct inode *inode,
> +			    struct ext4_xattr_ino_array *lea_ino_array)
> +{
> +	struct inode	*ea_inode = NULL;
> +	int		idx = 0;
> +	int		err;
> +
> +	if (lea_ino_array == NULL)
> +		return;
> +
> +	for (; idx < lea_ino_array->xia_count; ++idx) {
> +		ea_inode = ext4_xattr_inode_iget(inode,
> +				lea_ino_array->xia_inodes[idx], &err);
> +		if (err)
> +			continue;
> +		/* for inode's i_count get from ext4_xattr_delete_inode */
> +		if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
> +			iput(ea_inode);
> +		clear_nlink(ea_inode);
> +		iput(ea_inode);
> +	}
> +	kfree(lea_ino_array);
>  }
>  
>  /*
> @@ -1655,10 +2137,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
>  		    entry1->e_name_index != entry2->e_name_index ||
>  		    entry1->e_name_len != entry2->e_name_len ||
>  		    entry1->e_value_size != entry2->e_value_size ||
> +		    entry1->e_value_inum != entry2->e_value_inum ||
>  		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
>  			return 1;
> -		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
> -			return -EFSCORRUPTED;
>  		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
>  			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
>  			   le32_to_cpu(entry1->e_value_size)))
> @@ -1730,7 +2211,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
>  		       *name++;
>  	}
>  
> -	if (entry->e_value_size != 0) {
> +	if (!entry->e_value_inum && entry->e_value_size) {
>  		__le32 *value = (__le32 *)((char *)header +
>  			le16_to_cpu(entry->e_value_offs));
>  		for (n = (le32_to_cpu(entry->e_value_size) +
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 099c8b670ef5..6e10ff9393d4 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -44,7 +44,7 @@ struct ext4_xattr_entry {
>  	__u8	e_name_len;	/* length of name */
>  	__u8	e_name_index;	/* attribute name index */
>  	__le16	e_value_offs;	/* offset in disk block of value */
> -	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
> +	__le32	e_value_inum;	/* inode in which the value is stored */
>  	__le32	e_value_size;	/* size of attribute value */
>  	__le32	e_hash;		/* hash value of name and value */
>  	char	e_name[0];	/* attribute name */
> @@ -69,6 +69,26 @@ struct ext4_xattr_entry {
>  		EXT4_I(inode)->i_extra_isize))
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
> +/*
> + * Link EA inode back to parent one using i_mtime field.
> + * Extra integer type conversion added to ignore higher
> + * bits in i_mtime.tv_sec which might be set by ext4_iget()
> + */
> +#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> +do {                                                  \
> +      (inode)->i_mtime.tv_sec = inum;                 \
> +} while(0)
> +
> +#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> +((__u32)(inode)->i_mtime.tv_sec)
> +
> +/*
> + * The minimum size of EA value when you start storing it in an external inode
> + * size of block - size of header - size of 1 entry - 4 null bytes
> +*/
> +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)					\
> +	((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
> +
>  #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
>  #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
>  #define BFIRST(bh) ENTRY(BHDR(bh)+1)
> @@ -77,10 +97,11 @@ struct ext4_xattr_entry {
>  #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
>  
>  struct ext4_xattr_info {
> -	int name_index;
>  	const char *name;
>  	const void *value;
>  	size_t value_len;
> +	int name_index;
> +	int in_inode;
>  };
>  
>  struct ext4_xattr_search {
> @@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
>  
> -extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
> +extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
> +					   int *err);
> +extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
> +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +				   struct ext4_xattr_ino_array **array);
> +extern void ext4_xattr_inode_array_free(struct inode *inode,
> +					struct ext4_xattr_ino_array *array);
>  
>  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  			    struct ext4_inode *raw_inode, handle_t *handle);
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 01/28] ext4: xattr-in-inode support
  2017-05-31 16:42   ` Darrick J. Wong
  (?)
  (?)
@ 2017-05-31 19:59   ` Tahsin Erdogan
  2017-06-01 15:50     ` [PATCH v2 " Tahsin Erdogan
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31 19:59 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Andreas Dilger, Kalpak Shah, James Simmons

>> allowing the ext4/e2fsck to verify the correct inode is accessed.
>
> Can we store the checksum of the xattr value somewhere?  We already
> checksum the values if they're stored in the ibody or a single external
> block, and I'd hate to lose that protection.
>
> We could probably reuse one of the inode fields (i_version?) for this.
>
The crc32c value of the xattr data is currently stored in the xattr inode:

struct ext4_xattr_ea_info {
__le64 ref_count; /* number of xattr entry references */
__le32 hash; /* crc32c hash of xattr data */
__le32 reserved; /* reserved, must be 0 */
};

We could also save that value in the ext4_xattr_entry->e_value_offs
for stronger binding between parent and xattr inodes. That field is
currently set to 0 for xattr inode references.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks
  2017-05-31 16:12     ` Darrick J. Wong
  (?)
  (?)
@ 2017-05-31 21:01     ` Tahsin Erdogan
  -1 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31 21:01 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

> I think it would be wise not to support !extents && ea_inode,
> particularly since blockmaps aren't protected by metadata_csum and so in
> the long run it's probably best to minimize the introduction of new
> blockmap files (on ext4 anyway).

Sounds good. I will add that to e2fsprogs patches. thanks

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH v2 27/28] ext4: xattr inode deduplication
  2017-05-31 16:00     ` Darrick J. Wong
  (?)
  (?)
@ 2017-05-31 22:33     ` Tahsin Erdogan
  2017-06-02  5:41         ` Darrick J. Wong
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-05-31 22:33 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Ext4 now supports xattr values that are up to 64k in size (the VFS limit).
Large xattr values are stored in external inodes, each one holding a
single value. Once written, the data blocks of these inodes are immutable.

Real-world use cases are expected to have a lot of value duplication,
such as inherited ACLs. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a
best-effort sharing scheme. When an xattr inode is read from disk (i.e.
by a getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal, the ref count is decremented, and if it reaches zero, the inode
is deleted.

The quota charging for such inodes is managed manually. Every reference
holder is charged the full size, as if there were no sharing happening.
This is consistent with how xattr blocks are also charged.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2:
 - make dependency on crc32c dynamic
 - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
   they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver

 fs/ext4/acl.c   |    5 +-
 fs/ext4/ext4.h  |   22 +-
 fs/ext4/inode.c |    9 +-
 fs/ext4/super.c |   25 +-
 fs/ext4/xattr.c | 1075 +++++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/xattr.h |   17 +-
 fs/mbcache.c    |    9 +-
 7 files changed, 893 insertions(+), 269 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 74f7ac539e00..8db03e5c78bc 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (error)
 		return error;
 retry:
-	credits = ext4_xattr_set_credits(inode, acl_size);
+	error = ext4_xattr_set_credits(inode, acl_size, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d79d8d7bee88..7ceb1f81e4b8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1517,6 +1517,7 @@ struct ext4_sb_info {
 	long s_es_nr_inode;
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
+	struct mb_cache *s_ea_inode_cache;
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
@@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
 	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
 }
 
-#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+	return IS_NOQUOTA(inode) &&
+	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
 
 /*
  * This structure is stuffed into the struct file's private_data field
@@ -2709,19 +2714,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
 extern int ext4_register_li_request(struct super_block *sb,
 				    ext4_group_t first_not_zeroed);
 
-static inline int ext4_has_group_desc_csum(struct super_block *sb)
-{
-	return ext4_has_feature_gdt_csum(sb) ||
-	       EXT4_SB(sb)->s_chksum_driver != NULL;
-}
-
 static inline int ext4_has_metadata_csum(struct super_block *sb)
 {
 	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
 		     !EXT4_SB(sb)->s_chksum_driver);
 
-	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+	return ext4_has_feature_metadata_csum(sb) &&
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+}
+
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d6936f0d8a4..6f5872197d6c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-	if (ei->i_flags & EXT4_EA_INODE_FL)
+
+	if (ei->i_flags & EXT4_EA_INODE_FL) {
 		ext4_xattr_inode_set_class(inode);
+
+		inode_lock(inode);
+		inode->i_flags |= S_NOQUOTA;
+		inode_unlock(inode);
+	}
+
 	unlock_new_inode(inode);
 	return inode;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b02a23ec92ca..9fcd29e21dc7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
@@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	credits = ext4_xattr_set_credits(inode, len);
+	res = ext4_xattr_set_credits(inode, len, &credits);
+	if (res)
+		return res;
+
 	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Load the checksum driver */
-	if (ext4_has_feature_metadata_csum(sb)) {
+	if (ext4_has_feature_metadata_csum(sb) ||
+	    ext4_has_feature_ea_inode(sb)) {
 		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(sbi->s_chksum_driver)) {
 			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_ea_inode(sb)) {
+		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+		if (!sbi->s_ea_inode_cache) {
+			ext4_msg(sb, KERN_ERR,
+				 "Failed to create an s_ea_inode_cache");
+			goto failed_mount_wq;
+		}
+	}
+
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
@@ -4296,6 +4313,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6acce1f689ab..4c394411bf6f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -79,6 +79,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
 			    struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
 			      struct ext4_xattr_entry *);
+static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
 
 static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
@@ -105,13 +106,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
+#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
+
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_ea_inode_cache)
+
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
 
+static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
+				 u32 hash);
+static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
+				     u64 *ref_return, u32 *hash);
+
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -329,14 +340,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 		goto error;
 	}
 
-	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
-	    inode->i_generation != parent->i_generation) {
-		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
-			   "to parent is invalid.", ea_ino);
-		err = -EINVAL;
-		goto error;
-	}
-
 	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
 		ext4_error(parent->i_sb, "EA inode %lu does not have "
 			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
@@ -351,6 +354,12 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 	return err;
 }
 
+static u32
+ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
+{
+	return ext4_chksum(sbi, 0, buffer, size);
+}
+
 /*
  * Read the value from the EA inode.
  */
@@ -358,17 +367,53 @@ static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
 		     size_t size)
 {
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
 	struct inode *ea_inode;
-	int ret;
+	u32 hash, calc_hash;
+	struct mb_cache_entry *ce;
+	int err;
 
-	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (ret)
-		return ret;
+	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+	if (err) {
+		ea_inode = NULL;
+		goto out;
+	}
 
-	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
-	iput(ea_inode);
+	if (i_size_read(ea_inode) != size) {
+		ext4_warning_inode(ea_inode,
+				   "ea_inode file size=%llu entry size=%zu",
+				   i_size_read(ea_inode), size);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
 
-	return ret;
+	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	if (!err) {
+		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
+			goto out;
+
+		/* Avoid hash calculation if already cached. */
+		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
+		if (ce) {
+			mb_cache_entry_put(ea_inode_cache, ce);
+			goto out;
+		}
+
+		calc_hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), buffer,
+						  size);
+		if (hash != calc_hash) {
+			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
+					   "does not match calc_hash=%#x",
+					   hash, calc_hash);
+			goto out;
+		}
+
+		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+				      ea_inode->i_ino, true /* reusable */);
+	}
+out:
+	iput(ea_inode);
+	return err;
 }
 
 static int
@@ -657,6 +702,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+static inline size_t round_up_cluster(struct inode *inode, size_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
+				    inode->i_blkbits);
+	size_t mask = ~(cluster_size - 1);
+
+	return (length + cluster_size - 1) & mask;
+}
+
+static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
+{
+	int err;
+
+	err = dquot_alloc_inode(inode);
+	if (err)
+		return err;
+	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
+	if (err)
+		dquot_free_inode(inode);
+	return err;
+}
+
+static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+{
+	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
+	dquot_free_inode(inode);
+}
+
+static int __ext4_xattr_set_credits(struct super_block *sb,
+				    struct buffer_head *block_bh,
+				    size_t value_len)
+{
+	int credits;
+	int blocks;
+
+	/*
+	 * 1) Owner inode update
+	 * 2) Ref count update on old xattr block
+	 * 3) new xattr block
+	 * 4) block bitmap update for new xattr block
+	 * 5) group descriptor for new xattr block
+	 */
+	credits = 5;
+
+	/* We are done if ea_inode feature is not enabled. */
+	if (!ext4_has_feature_ea_inode(sb))
+		return credits;
+
+	/* New ea_inode, inode map, block bitmap, group descriptor. */
+	credits += 4;
+
+	/* Data blocks. */
+	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	/* Indirection block. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Blocks themselves. */
+	credits += blocks;
+
+	/* Dereference ea_inode holding old xattr value.
+	 * Old ea_inode, inode map, block bitmap, group descriptor.
+	 */
+	credits += 4;
+
+	/* Data blocks for old ea_inode. */
+	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
+
+	/* Indirection block for old ea_inode. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Quota updates. */
+	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
+
+	/* We may need to clone the existing xattr block in which case we need
+	 * to increment ref counts for existing ea_inodes referenced by it.
+	 */
+	if (block_bh) {
+		struct ext4_xattr_entry *entry = BFIRST(block_bh);
+
+		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				/* Ref count update on ea_inode. */
+				credits += 1;
+	}
+	return credits;
+}
+
 int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 			      int credits, struct buffer_head *bh,
 			      bool dirty, bool block_csum)
@@ -706,12 +846,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+				       int ref_change)
+{
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+	struct ext4_iloc iloc;
+	s64 ref_return;
+	u32 hash;
+	int ret;
+
+	inode_lock(ea_inode);
+
+	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
+	if (ret) {
+		iloc.bh = NULL;
+		goto out;
+	}
+
+	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
+					&hash);
+	if (ret)
+		goto out;
+
+	if (ref_change > 0) {
+		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
+			  ea_inode->i_ino, ref_return);
+
+		if (ref_return == 1) {
+			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			set_nlink(ea_inode, 1);
+			ext4_orphan_del(handle, ea_inode);
+
+			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+					      ea_inode->i_ino,
+					      true /* reusable */);
+		}
+	} else {
+		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
+			  ea_inode->i_ino, ref_return);
+
+		if (ref_return == 0) {
+			WARN_ONCE(ea_inode->i_nlink != 1,
+				  "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			clear_nlink(ea_inode);
+			ext4_orphan_add(handle, ea_inode);
+
+			mb_cache_entry_delete(ea_inode_cache, hash,
+					      ea_inode->i_ino);
+		}
+	}
+
+	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
+	iloc.bh = NULL;
+	if (ret)
+		ext4_warning_inode(ea_inode,
+				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
+out:
+	brelse(iloc.bh);
+	inode_unlock(ea_inode);
+	return ret;
+}
+
+static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
+}
+
+static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
+}
+
+static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
+					struct ext4_xattr_entry *first)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_entry *failed_entry;
+	unsigned int ea_ino;
+	int err, saved_err;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			goto cleanup;
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "inc ref error %d", err);
+			iput(ea_inode);
+			goto cleanup;
+		}
+		iput(ea_inode);
+	}
+	return 0;
+
+cleanup:
+	saved_err = err;
+	failed_entry = entry;
+
+	for (entry = first; entry != failed_entry;
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err) {
+			ext4_warning(parent->i_sb,
+				     "cleanup ea_ino %u iget error %d", ea_ino,
+				     err);
+			continue;
+		}
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err)
+			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
+					   err);
+		iput(ea_inode);
+	}
+	return saved_err;
+}
+
 static void
-ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
-			    struct buffer_head *bh,
-			    struct ext4_xattr_entry *first, bool block_csum,
-			    struct ext4_xattr_inode_array **ea_inode_array,
-			    int extra_credits)
+ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
+			     struct buffer_head *bh,
+			     struct ext4_xattr_entry *first, bool block_csum,
+			     struct ext4_xattr_inode_array **ea_inode_array,
+			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
@@ -748,10 +1015,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
 			continue;
 		}
 
-		inode_lock(ea_inode);
-		clear_nlink(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
+					   err);
+			continue;
+		}
+
+		if (!skip_quota)
+			ext4_xattr_inode_free_quota(parent,
+					      le32_to_cpu(entry->e_value_size));
 
 		/*
 		 * Forget about ea_inode within the same transaction that decrements the ref
@@ -784,7 +1057,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
  */
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
-			 struct buffer_head *bh)
+			 struct buffer_head *bh,
+			 struct ext4_xattr_inode_array **ea_inode_array,
+			 int extra_credits)
 {
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 	u32 hash, ref;
@@ -807,6 +1082,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
+
+		if (ext4_has_feature_ea_inode(inode->i_sb))
+			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
+						     BFIRST(bh),
+						     true /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -947,7 +1230,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
  * Create an inode to store the value of a large EA.
  */
 static struct inode *ext4_xattr_inode_create(handle_t *handle,
-					     struct inode *inode)
+					     struct inode *inode, u32 hash)
 {
 	struct inode *ea_inode = NULL;
 	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
@@ -965,67 +1248,119 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		ea_inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(ea_inode);
 		ext4_xattr_inode_set_class(ea_inode);
-		ea_inode->i_generation = inode->i_generation;
-		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
-
-		/*
-		 * A back-pointer from EA inode to parent inode will be useful
-		 * for e2fsck.
-		 */
-		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
 		unlock_new_inode(ea_inode);
 		err = ext4_inode_attach_jinode(ea_inode);
+		if (!err)
+			err = ext4_xattr_inode_init(handle, ea_inode, hash);
 		if (err) {
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
+
+		/*
+		 * Xattr inodes are shared therefore quota charging is performed
+		 * at a higher level.
+		 */
+		dquot_free_inode(ea_inode);
+		dquot_drop(ea_inode);
+		inode_lock(ea_inode);
+		ea_inode->i_flags |= S_NOQUOTA;
+		inode_unlock(ea_inode);
 	}
 
 	return ea_inode;
 }
 
-/*
- * Unlink the inode storing the value of the EA.
- */
-int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+static struct inode *
+ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
+			    size_t value_len, u32 hash)
 {
-	struct inode *ea_inode = NULL;
+	struct inode *ea_inode;
+	struct mb_cache_entry *ce;
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+	void *ea_data = NULL;
 	int err;
 
-	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (err)
-		return err;
+	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
+	while (ce) {
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		if (IS_ERR(ea_inode)) {
+			ea_inode = NULL;
+			goto next;
+		}
 
-	clear_nlink(ea_inode);
-	iput(ea_inode);
+		if (is_bad_inode(ea_inode) ||
+		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
+		    i_size_read(ea_inode) != value_len)
+			goto next;
 
-	return 0;
+		if (!ea_data)
+			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+
+		if (!ea_data) {
+			iput(ea_inode);
+			return NULL;
+		}
+
+		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
+		if (unlikely(err))
+			goto next;
+
+		if (!memcmp(value, ea_data, value_len)) {
+			mb_cache_entry_touch(ea_inode_cache, ce);
+			mb_cache_entry_put(ea_inode_cache, ce);
+			kvfree(ea_data);
+			return ea_inode;
+		}
+	next:
+		iput(ea_inode);
+		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
+	}
+	kvfree(ea_data);
+	return NULL;
 }
 
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
-				unsigned long *ea_ino, const void *value,
-				size_t value_len)
+static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+					  const void *value, size_t value_len,
+					  struct inode **ret_inode)
 {
 	struct inode *ea_inode;
+	u32 hash;
 	int err;
 
+	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
+	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
+	if (ea_inode) {
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			iput(ea_inode);
+			return err;
+		}
+
+		*ret_inode = ea_inode;
+		return 0;
+	}
+
 	/* Create an inode for the EA value */
-	ea_inode = ext4_xattr_inode_create(handle, inode);
+	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
 	if (IS_ERR(ea_inode))
 		return PTR_ERR(ea_inode);
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
-	if (err)
-		clear_nlink(ea_inode);
-	else
-		*ea_ino = ea_inode->i_ino;
+	if (err) {
+		ext4_xattr_inode_dec_ref(handle, ea_inode);
+		iput(ea_inode);
+		return err;
+	}
 
-	iput(ea_inode);
+	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+			      ea_inode->i_ino, true /* reusable */);
 
-	return err;
+	*ret_inode = ea_inode;
+	return 0;
 }
 
 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
@@ -1033,11 +1368,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
+	struct ext4_xattr_entry *here = s->here;
 	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
 	int in_inode = i->in_inode;
-	int rc;
+	struct inode *old_ea_inode = NULL;
+	struct inode *new_ea_inode = NULL;
+	int ret;
 
-	/* Compute min_offs and last. */
+	/*
+	 * Optimization for the simple case when old and new values have the
+	 * same padded sizes. Not applicable if the existing value is stored in
+	 * an external inode.
+	 */
+	if (i->value && !s->not_found && !here->e_value_inum &&
+	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
+	    EXT4_XATTR_SIZE(i->value_len)) {
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+		size_t size = EXT4_XATTR_SIZE(i->value_len);
+
+		here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value == EXT4_ZERO_XATTR_VALUE) {
+			memset(val, 0, size);
+		} else {
+			memcpy(val, i->value, i->value_len);
+			/* Clear padding bytes. */
+			memset(val + i->value_len, 0, size - i->value_len);
+		}
+		return 0;
+	}
+
+	/* Find out min_offs and last to calculate the free space. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
 		if (!last->e_value_inum && last->e_value_size) {
@@ -1048,120 +1409,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	}
 	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
 	if (!s->not_found) {
-		if (!in_inode &&
-		    !s->here->e_value_inum && s->here->e_value_size) {
-			size_t size = le32_to_cpu(s->here->e_value_size);
+		if (!here->e_value_inum && here->e_value_size) {
+			size_t size = le32_to_cpu(here->e_value_size);
 			free += EXT4_XATTR_SIZE(size);
 		}
 		free += EXT4_XATTR_LEN(name_len);
 	}
 	if (i->value) {
-		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
 
-		if (in_inode)
-			value_len = 0;
+		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
 
-		if (free < EXT4_XATTR_LEN(name_len) + value_len)
-			return -ENOSPC;
+	/*
+	 * Getting access to old and new ea inodes is subject to failures.
+	 * Finish that work before doing any modifications to the xattr data.
+	 */
+	if (!s->not_found && here->e_value_inum) {
+		ret = ext4_xattr_inode_iget(inode,
+		 			    le32_to_cpu(here->e_value_inum),
+		 			    &old_ea_inode);
+		if (ret) {
+			old_ea_inode = NULL;
+			goto out;
+		}
 	}
+	if (i->value && in_inode) {
+		WARN_ON_ONCE(!i->value_len);
 
-	if (i->value && s->not_found) {
-		/* Insert the new name. */
-		size_t size = EXT4_XATTR_LEN(name_len);
-		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
-		memmove((void *)s->here + size, s->here, rest);
-		memset(s->here, 0, size);
-		s->here->e_name_index = i->name_index;
-		s->here->e_name_len = name_len;
-		memcpy(s->here->e_name, i->name, name_len);
-	} else {
-		if (!s->here->e_value_inum && s->here->e_value_size &&
-		    s->here->e_value_offs > 0) {
-			void *first_val = s->base + min_offs;
-			size_t offs = le16_to_cpu(s->here->e_value_offs);
-			void *val = s->base + offs;
-			size_t size = EXT4_XATTR_SIZE(
-				le32_to_cpu(s->here->e_value_size));
-
-			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
-				/* The old and the new value have the same
-				   size. Just replace. */
-				s->here->e_value_size =
-					cpu_to_le32(i->value_len);
-				if (i->value == EXT4_ZERO_XATTR_VALUE) {
-					memset(val, 0, size);
-				} else {
-					/* Clear pad bytes first. */
-					memset(val + size - EXT4_XATTR_PAD, 0,
-					       EXT4_XATTR_PAD);
-					memcpy(val, i->value, i->value_len);
-				}
-				return 0;
-			}
+		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
+		if (ret)
+			goto out;
 
-			/* Remove the old value. */
-			memmove(first_val + size, first_val, val - first_val);
-			memset(first_val, 0, size);
-			s->here->e_value_size = 0;
-			s->here->e_value_offs = 0;
-			min_offs += size;
-
-			/* Adjust all value offsets. */
-			last = s->first;
-			while (!IS_LAST_ENTRY(last)) {
-				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_inum &&
-				    last->e_value_size && o < offs)
-					last->e_value_offs =
-						cpu_to_le16(o + size);
-				last = EXT4_XATTR_NEXT(last);
-			}
+		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
+						     i->value_len,
+						     &new_ea_inode);
+		if (ret) {
+			new_ea_inode = NULL;
+			ext4_xattr_inode_free_quota(inode, i->value_len);
+			goto out;
 		}
-		if (s->here->e_value_inum) {
-			ext4_xattr_inode_unlink(inode,
-					    le32_to_cpu(s->here->e_value_inum));
-			s->here->e_value_inum = 0;
+	}
+
+	if (old_ea_inode) {
+		/* We are ready to release ref count on the old_ea_inode. */
+		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
+		if (ret) {
+			/* Release newly required ref count on new_ea_inode. */
+			if (new_ea_inode) {
+				int err;
+
+				err = ext4_xattr_inode_dec_ref(handle,
+							       new_ea_inode);
+				if (err)
+					ext4_warning_inode(new_ea_inode,
+						  "dec ref new_ea_inode err=%d",
+						  err);
+				ext4_xattr_inode_free_quota(inode,
+							    i->value_len);
+			}
+			goto out;
 		}
-		if (!i->value) {
-			/* Remove the old name. */
-			size_t size = EXT4_XATTR_LEN(name_len);
-			last = ENTRY((void *)last - size);
-			memmove(s->here, (void *)s->here + size,
-				(void *)last - (void *)s->here + sizeof(__u32));
-			memset(last, 0, size);
+
+		ext4_xattr_inode_free_quota(inode,
+					    le32_to_cpu(here->e_value_size));
+	}
+
+	/* No failures allowed past this point. */
+
+	if (!s->not_found && here->e_value_offs) {
+		/* Remove the old value. */
+		void *first_val = s->base + min_offs;
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+		size_t size = EXT4_XATTR_SIZE(
+			le32_to_cpu(here->e_value_size));
+
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		last = s->first;
+		while (!IS_LAST_ENTRY(last)) {
+			size_t o = le16_to_cpu(last->e_value_offs);
+			if (!last->e_value_inum &&
+			    last->e_value_size && o < offs)
+				last->e_value_offs =
+					cpu_to_le16(o + size);
+			last = EXT4_XATTR_NEXT(last);
 		}
 	}
 
+	if (!s->not_found && !i->value) {
+		/* Remove old name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		last = ENTRY((void *)last - size);
+		memmove(here, (void *)here + size,
+			(void *)last - (void *)here + sizeof(__u32));
+		memset(last, 0, size);
+	} else if (s->not_found && i->value) {
+		/* Insert new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)here + sizeof(__u32);
+		memmove((void *)here + size, here, rest);
+		memset(here, 0, size);
+		here->e_name_index = i->name_index;
+		here->e_name_len = name_len;
+		memcpy(here->e_name, i->name, name_len);
+	} else {
+		WARN_ON_ONCE(s->not_found || !i->value);
+		/* This is an update, reset value info. */
+		here->e_value_inum = 0;
+		here->e_value_offs = 0;
+		here->e_value_size = 0;
+	}
+
 	if (i->value) {
-		/* Insert the new value. */
+		/* Insert new value. */
 		if (in_inode) {
-			unsigned long ea_ino =
-				le32_to_cpu(s->here->e_value_inum);
-			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
-						  i->value, i->value_len);
-			if (rc)
-				goto out;
-			s->here->e_value_inum = cpu_to_le32(ea_ino);
-			s->here->e_value_offs = 0;
+			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
 		} else if (i->value_len) {
 			size_t size = EXT4_XATTR_SIZE(i->value_len);
 			void *val = s->base + min_offs - size;
-			s->here->e_value_offs = cpu_to_le16(min_offs - size);
-			s->here->e_value_inum = 0;
+			here->e_value_offs = cpu_to_le16(min_offs - size);
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
 				memset(val, 0, size);
 			} else {
-				/* Clear the pad bytes first. */
-				memset(val + size - EXT4_XATTR_PAD, 0,
-				       EXT4_XATTR_PAD);
 				memcpy(val, i->value, i->value_len);
+				/* Clear padding bytes. */
+				memset(val + i->value_len, 0,
+				       size - i->value_len);
 			}
 		}
-		s->here->e_value_size = cpu_to_le32(i->value_len);
+		here->e_value_size = cpu_to_le32(i->value_len);
 	}
-
+	ret = 0;
 out:
-	return rc;
+	iput(old_ea_inode);
+	iput(new_ea_inode);
+	return ret;
 }
 
 struct ext4_xattr_block_find {
@@ -1223,6 +1613,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+	struct inode *ea_inode = NULL;
+	size_t old_ea_inode_size = 0;
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
@@ -1277,6 +1669,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			header(s->base)->h_refcount = cpu_to_le32(1);
 			s->here = ENTRY(s->base + offset);
 			s->end = s->base + bs->bh->b_size;
+
+			/*
+			 * If existing entry points to an xattr inode, we need
+			 * to prevent ext4_xattr_set_entry() from decrementing
+			 * ref count on it because the reference belongs to the
+			 * original block. In this case, make the entry look
+			 * like it has an empty value.
+			 */
+			if (!s->not_found && s->here->e_value_inum) {
+				/*
+				 * Defer quota free call for previous inode
+				 * until success is guaranteed.
+				 */
+				old_ea_inode_size = le32_to_cpu(
+							s->here->e_value_size);
+				s->here->e_value_inum = 0;
+				s->here->e_value_size = 0;
+			}
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
@@ -1298,6 +1708,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		goto bad_block;
 	if (error)
 		goto cleanup;
+
+	if (i->value && s->here->e_value_inum) {
+		unsigned int ea_ino;
+
+		/*
+		 * A ref count on ea_inode has been taken as part of the call to
+		 * ext4_xattr_set_entry() above. We would like to drop this
+		 * extra ref but we have to wait until the xattr block is
+		 * initialized and has its own ref count on the ea_inode.
+		 */
+		ea_ino = le32_to_cpu(s->here->e_value_inum);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error) {
+			ea_inode = NULL;
+			goto cleanup;
+		}
+	}
+
 	if (!IS_LAST_ENTRY(s->first))
 		ext4_xattr_rehash(header(s->base), s->here);
 
@@ -1408,6 +1836,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						 EXT4_FREE_BLOCKS_METADATA);
 				goto cleanup;
 			}
+			error = ext4_xattr_inode_inc_ref_all(handle, inode,
+						      ENTRY(header(s->base)+1));
+			if (error)
+				goto getblk_failed;
+			if (ea_inode) {
+				/* Drop the extra ref on ea_inode. */
+				error = ext4_xattr_inode_dec_ref(handle,
+								 ea_inode);
+				if (error)
+					ext4_warning_inode(ea_inode,
+							   "dec ref error=%d",
+							   error);
+				iput(ea_inode);
+				ea_inode = NULL;
+			}
+
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, new_bh);
 			if (error) {
@@ -1427,15 +1871,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if (old_ea_inode_size)
+		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+
 	/* Update the inode. */
 	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 
 	/* Drop the previous xattr block. */
-	if (bs->bh && bs->bh != new_bh)
-		ext4_xattr_release_block(handle, inode, bs->bh);
+	if (bs->bh && bs->bh != new_bh) {
+		struct ext4_xattr_inode_array *ea_inode_array = NULL;
+		ext4_xattr_release_block(handle, inode, bs->bh,
+					 &ea_inode_array,
+					 0 /* extra_credits */);
+		ext4_xattr_inode_array_free(ea_inode_array);
+	}
 	error = 0;
 
 cleanup:
+	if (ea_inode) {
+		int error2;
+		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (error2)
+			ext4_warning_inode(ea_inode, "dec ref error=%d",
+					   error2);
+
+		/* If there was an error, revert the quota charge. */
+		if (error)
+			ext4_xattr_inode_free_quota(inode,
+						    i_size_read(ea_inode));
+		iput(ea_inode);
+	}
 	if (ce)
 		mb_cache_entry_put(ext4_mb_cache, ce);
 	brelse(new_bh);
@@ -1546,6 +2011,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+struct ext4_xattr_ea_info {
+	__le64 ref_count;	/* number of xattr entry references */
+	__le32 hash;		/* crc32c hash of xattr data */
+	__le32 reserved;	/* reserved, must be 0 */
+};
+
+static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
+				 u32 hash)
+{
+	struct ext4_xattr_ea_info ea_info = {
+		.ref_count = cpu_to_le64(1),
+		.hash = cpu_to_le32(hash),
+		.reserved = 0,
+	};
+	struct ext4_xattr_info i = {
+                .name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+		.value = &ea_info,
+		.value_len = sizeof(ea_info),
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		return err;
+
+	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
+}
+
+static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
+				     u64 *ref_return, u32 *hash)
+{
+	struct ext4_xattr_ea_info ea_info;
+	struct ext4_xattr_info i = {
+                .name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+		.value = &ea_info,
+		.value_len = sizeof(ea_info),
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		return err;
+
+	if (WARN_ON(is.s.not_found) ||
+	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
+		return -EFSCORRUPTED;
+
+	memcpy(&ea_info,
+	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
+	       sizeof(ea_info));
+
+	if (hash)
+		*hash = le32_to_cpu(ea_info.hash);
+
+	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
+	ea_info.ref_count = cpu_to_le64(*ref_return);
+
+	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
+}
+
+static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
+{
+	struct ext4_xattr_info i = {
+                .name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_ea_info *ea_info;
+	void *ptr;
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		return err;
+
+	if (WARN_ON(is.s.not_found) ||
+	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
+		return -EFSCORRUPTED;
+
+	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
+	ea_info = (struct ext4_xattr_ea_info *)ptr;
+
+	if (WARN_ON(ea_info->reserved != 0))
+		return -EFSCORRUPTED;
+
+	*hash = le32_to_cpu(ea_info->hash);
+	return 0;
+}
+
 static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 				 struct ext4_xattr_info *i)
 {
@@ -1560,6 +2136,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 	return !memcmp(value, i->value, i->value_len);
 }
 
+struct buffer_head *ext4_xattr_get_block(struct inode *inode)
+{
+	struct buffer_head *bh;
+	int error;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return NULL;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		return ERR_PTR(-EIO);
+	error = ext4_xattr_check_block(inode, bh);
+	if (error)
+		return ERR_PTR(error);
+	return bh;
+}
+
 /*
  * ext4_xattr_set_handle()
  *
@@ -1602,9 +2194,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	/* Check journal credits under write lock. */
 	if (ext4_handle_valid(handle)) {
+		struct buffer_head *bh;
 		int credits;
 
-		credits = ext4_xattr_set_credits(inode, value_len);
+		bh = ext4_xattr_get_block(inode);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			goto cleanup;
+		}
+
+		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
 			error = -ENOSPC;
 			goto cleanup;
@@ -1640,6 +2241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
+
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1708,34 +2310,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
-int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
 {
-	struct super_block *sb = inode->i_sb;
-	int credits;
-
-	if (!EXT4_SB(sb)->s_journal)
-		return 0;
-
-	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+	struct buffer_head *bh;
+	int err;
 
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-	        credits += ext4_writepage_trans_blocks(inode) + 1;
+	*credits = 0;
 
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
+	if (!EXT4_SB(inode->i_sb)->s_journal)
+		return 0;
 
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+	down_read(&EXT4_I(inode)->xattr_sem);
 
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	bh = ext4_xattr_get_block(inode);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+	} else {
+		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+		err = 0;
 	}
-	return credits;
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return err;
 }
 
 /*
@@ -1760,7 +2357,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 		return error;
 
 retry:
-	credits = ext4_xattr_set_credits(inode, value_len);
+	error = ext4_xattr_set_credits(inode, value_len, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -2066,10 +2666,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-
 #define EIA_INCR 16 /* must be 2^n */
 #define EIA_MASK (EIA_INCR - 1)
-/* Add the large xattr @inode into @ea_inode_array for later deletion.
+
+/* Add the large xattr @inode into @ea_inode_array for deferred iput().
  * If @ea_inode_array is new or full it will be grown and the old
  * contents copied over.
  */
@@ -2114,21 +2714,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * ext4_xattr_delete_inode()
  *
  * Free extended attribute resources associated with this inode. Traverse
- * all entries and unlink any xattr inodes associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode. If an orphan inode is deleted it will also delete any
- * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
- * to ensure they belong to the parent inode and were not deleted already.
+ * all entries and decrement reference on any xattr inodes associated with this
+ * inode. This is called immediately before an inode is freed. We have exclusive
+ * access to the inode. If an orphan inode is deleted it will also release its
+ * references on xattr block and xattr inodes.
  */
-int
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array,
-			int extra_credits)
+int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc = { .bh = NULL };
+	struct ext4_xattr_entry *entry;
 	int error;
 
 	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2140,66 +2738,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
-		goto delete_external_ea;
-
-	error = ext4_get_inode_loc(inode, &iloc);
-	if (error)
-		goto cleanup;
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 
-	error = ext4_journal_get_write_access(handle, iloc.bh);
-	if (error)
-		goto cleanup;
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
+			goto cleanup;
+		}
 
-	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
-				    false /* block_csum */, ea_inode_array,
-				    extra_credits);
+		error = ext4_journal_get_write_access(handle, iloc.bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "write access (error %d)",
+					 error);
+			goto cleanup;
+		}
 
-delete_external_ea:
-	if (!EXT4_I(inode)->i_file_acl) {
-		error = 0;
-		goto cleanup;
-	}
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh) {
-		EXT4_ERROR_INODE(inode, "block %llu read error",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		EXT4_ERROR_INODE(inode, "bad block %llu",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EFSCORRUPTED;
-		goto cleanup;
+		header = IHDR(inode, ext4_raw_inode(&iloc));
+		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
+						     IFIRST(header),
+						     false /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     false /* skip_quota */);
 	}
 
-	if (ext4_has_feature_ea_inode(inode->i_sb)) {
-		error = ext4_journal_get_write_access(handle, bh);
-		if (error) {
-			EXT4_ERROR_INODE(inode, "write access %llu",
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %llu read error",
 					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		error = ext4_xattr_check_block(inode, bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
+					 EXT4_I(inode)->i_file_acl, error);
 			goto cleanup;
 		}
-		ext4_xattr_inode_remove_all(handle, inode, bh,
-					    BFIRST(bh),
-					    true /* block_csum */,
-					    ea_inode_array,
-					    extra_credits);
-	}
 
-	ext4_xattr_release_block(handle, inode, bh);
-	/* Update i_file_acl within the same transaction that releases block. */
-	EXT4_I(inode)->i_file_acl = 0;
-	error = ext4_mark_inode_dirty(handle, inode);
-	if (error) {
-		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
-				 error);
-		goto cleanup;
+		if (ext4_has_feature_ea_inode(inode->i_sb)) {
+			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+			     entry = EXT4_XATTR_NEXT(entry))
+				if (entry->e_value_inum)
+					ext4_xattr_inode_free_quota(inode,
+					      le32_to_cpu(entry->e_value_size));
+
+		}
+
+		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+					 extra_credits);
+		/*
+		 * Update i_file_acl value in the same transaction that releases
+		 * block.
+		 */
+		EXT4_I(inode)->i_file_acl = 0;
+		error = ext4_mark_inode_dirty(handle, inode);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+					 error);
+			goto cleanup;
+		}
 	}
+	error = 0;
 cleanup:
 	brelse(iloc.bh);
 	brelse(bh);
@@ -2208,17 +2811,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 {
-	struct inode	*ea_inode;
-	int		idx = 0;
+	int idx;
 
 	if (ea_inode_array == NULL)
 		return;
 
-	for (; idx < ea_inode_array->count; ++idx) {
-		ea_inode = ea_inode_array->inodes[idx];
-		clear_nlink(ea_inode);
-		iput(ea_inode);
-	}
+	for (idx = 0; idx < ea_inode_array->count; ++idx)
+		iput(ea_inode_array->inodes[idx]);
 	kfree(ea_inode_array);
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b2005a2716d9..67616cb9a059 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -70,19 +70,6 @@ struct ext4_xattr_entry {
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
 /*
- * Link EA inode back to parent one using i_mtime field.
- * Extra integer type conversion added to ignore higher
- * bits in i_mtime.tv_sec which might be set by ext4_get()
- */
-#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
-do {                                                  \
-      (inode)->i_mtime.tv_sec = inum;                 \
-} while(0)
-
-#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
-((__u32)(inode)->i_mtime.tv_sec)
-
-/*
  * The minimum size of EA value when you start storing it in an external inode
  * size of block - size of header - size of 1 entry - 4 null bytes
 */
@@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+				  int *credits);
 
-extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 77a5b99d8f92..7dfdca822ccb 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -13,10 +13,11 @@
  * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
- * They use hash of a block contents as a key and block number as a value.
- * That's why keys need not be unique (different xattr blocks may end up having
- * the same hash). However block number always uniquely identifies a cache
- * entry.
+ * Ext4 also uses it for deduplication of xattr values stored in inodes.
+ * They use hash of data as a key and provide a value that may represent a
+ * block or inode number. That's why keys need not be unique (hash of different
+ * data may be the same). However user provided value always uniquely
+ * identifies a cache entry.
  *
  * We provide functions for creation and removal of entries, search by key,
  * and a special "delete entry with given key-value pair" operation. Fixed
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v2 01/28] ext4: xattr-in-inode support
  2017-05-31 19:59   ` Tahsin Erdogan
@ 2017-06-01 15:50     ` Tahsin Erdogan
  0 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-01 15:50 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Andreas Dilger, Kalpak Shah, James Simmons,
	Alexander Boyko, Tahsin Erdogan

From: Andreas Dilger <andreas.dilger@intel.com>

Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE.

If the size of an xattr value is larger than will fit in a single
external block, then the xattr value will be saved into the body
of an external xattr inode.

This also helps support a larger number of xattrs, since only the headers
will be stored in the in-inode space or the single external block.

The inode is referenced from the xattr header via "e_value_inum",
which was formerly "e_value_block", but that field was never used.
The e_value_size still contains the xattr size so that listing
xattrs does not need to look up the inode if the data is not accessed.

struct ext4_xattr_entry {
        __u8    e_name_len;     /* length of name */
        __u8    e_name_index;   /* attribute name index */
        __le16  e_value_offs;   /* offset in disk block of value */
        __le32  e_value_inum;   /* inode in which value is stored */
        __le32  e_value_size;   /* size of attribute value */
        __le32  e_hash;         /* hash value of name and value */
        char    e_name[0];      /* attribute name */
};

The xattr inode is marked with the EXT4_EA_INODE_FL flag and also
holds a back-reference to the owning inode in its i_mtime field,
allowing ext4 and e2fsck to verify that the correct inode is accessed.

Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80
Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424
Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
Signed-off-by: James Simmons <uja.ornl@gmail.com>
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Alexander Boyko <alexander.boyko@seagate.com>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2:
 - Added Alexander Boyko to signed off list per note from
   Andrew Perepechko

 fs/ext4/ext4.h   |  12 ++
 fs/ext4/ialloc.c |   1 -
 fs/ext4/inline.c |   2 +-
 fs/ext4/inode.c  |  49 ++++-
 fs/ext4/xattr.c  | 565 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/xattr.h  |  33 +++-
 6 files changed, 606 insertions(+), 56 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32191548abed..24ef56b4572f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1797,6 +1797,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_EA_INODE| \
 					 EXT4_FEATURE_INCOMPAT_MMP | \
 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
@@ -2220,6 +2221,12 @@ struct mmpd_data {
 #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
 
 /*
+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
+ * This limit is arbitrary, but is reasonable for the xattr API.
+ */
+#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
+
+/*
  * Function prototypes
  */
 
@@ -2231,6 +2238,10 @@ struct mmpd_data {
 # define ATTRIB_NORET	__attribute__((noreturn))
 # define NORET_AND	noreturn,
 
+struct ext4_xattr_ino_array {
+	unsigned int xia_count;		/* # of used item in the array */
+	unsigned int xia_inodes[0];
+};
 /* bitmap.c */
 extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
 void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
@@ -2478,6 +2489,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 98ac2f1f23b3..e2eb3cc06820 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	 * as writing the quota to disk may need the lock as well.
 	 */
 	dquot_initialize(inode);
-	ext4_xattr_delete_inode(handle, inode);
 	dquot_free_inode(inode);
 	dquot_drop(inode);
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8d141c0c8ff9..28c5c3abddb3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 
 	/* Compute min_offs. */
 	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_block && entry->e_value_size) {
+		if (!entry->e_value_inum && entry->e_value_size) {
 			size_t offs = le16_to_cpu(entry->e_value_offs);
 			if (offs < min_offs)
 				min_offs = offs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cf82d03968c..e5535e5b3dc5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
 				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
-				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode)
 {
 	handle_t *handle;
 	int err;
+	int extra_credits = 3;
+	struct ext4_xattr_ino_array *lea_ino_array = NULL;
 
 	trace_ext4_evict_inode(inode);
 
@@ -238,8 +238,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-				    ext4_blocks_for_truncate(inode)+3);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -251,9 +251,36 @@ void ext4_evict_inode(struct inode *inode)
 		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
-
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
+
+	/*
+	 * Delete xattr inode before deleting the main inode.
+	 */
+	err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
+	if (err) {
+		ext4_warning(inode->i_sb,
+			     "couldn't delete inode's xattr (err %d)", err);
+		goto stop_handle;
+	}
+
+	if (!IS_NOQUOTA(inode))
+		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
+
+	if (!ext4_handle_has_enough_credits(handle,
+			ext4_blocks_for_truncate(inode) + extra_credits)) {
+		err = ext4_journal_extend(handle,
+			ext4_blocks_for_truncate(inode) + extra_credits);
+		if (err > 0)
+			err = ext4_journal_restart(handle,
+			ext4_blocks_for_truncate(inode) + extra_credits);
+		if (err != 0) {
+			ext4_warning(inode->i_sb,
+				     "couldn't extend journal (err %d)", err);
+			goto stop_handle;
+		}
+	}
+
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -277,10 +304,10 @@ void ext4_evict_inode(struct inode *inode)
 	 * enough credits left in the handle to remove the inode from
 	 * the orphan list and set the dtime field.
 	 */
-	if (!ext4_handle_has_enough_credits(handle, 3)) {
-		err = ext4_journal_extend(handle, 3);
+	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
+		err = ext4_journal_extend(handle, extra_credits);
 		if (err > 0)
-			err = ext4_journal_restart(handle, 3);
+			err = ext4_journal_restart(handle, extra_credits);
 		if (err != 0) {
 			ext4_warning(inode->i_sb,
 				     "couldn't extend journal (err %d)", err);
@@ -315,8 +342,12 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
+
 	ext4_journal_stop(handle);
 	sb_end_intwrite(inode->i_sb);
+
+	if (lea_ino_array != NULL)
+		ext4_xattr_inode_array_free(inode, lea_ino_array);
 	return;
 no_delete:
 	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
@@ -5504,7 +5535,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
 				  int pextents)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5d3c2536641c..444be5c7a1d5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -177,9 +177,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
 
 	/* Check the values */
 	while (!IS_LAST_ENTRY(entry)) {
-		if (entry->e_value_block != 0)
-			return -EFSCORRUPTED;
-		if (entry->e_value_size != 0) {
+		if (entry->e_value_size != 0 &&
+		    entry->e_value_inum == 0) {
 			u16 offs = le16_to_cpu(entry->e_value_offs);
 			u32 size = le32_to_cpu(entry->e_value_size);
 			void *value;
@@ -269,6 +268,99 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 	return cmp ? -ENODATA : 0;
 }
 
+/*
+ * Read *size bytes of EA value from @ea_inode into @buf, one block at a time.
+ * On return *size holds the bytes copied; returns 0 or a negative errno.
+ */
+static int
+ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
+{
+	unsigned long block = 0;
+	struct buffer_head *bh = NULL;
+	int blocksize;
+	size_t csize, ret_size = 0;
+
+	if (*size == 0)
+		return 0;
+
+	blocksize = ea_inode->i_sb->s_blocksize;
+
+	while (ret_size < *size) {
+		csize = (*size - ret_size) > blocksize ? blocksize :
+							*size - ret_size;
+		bh = ext4_bread(NULL, ea_inode, block, 0);
+		if (IS_ERR_OR_NULL(bh)) {	/* NULL: hole in the EA inode */
+			*size = ret_size;
+			return bh ? PTR_ERR(bh) : -EFSCORRUPTED;
+		}
+		memcpy(buf, bh->b_data, csize);
+		brelse(bh);
+
+		buf += csize;
+		block += 1;
+		ret_size += csize;
+	}
+
+	*size = ret_size;
+	return 0;
+}
+
+/* Get and validate EA inode @ea_ino of @parent; NULL with *err set on error. */
+struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
+{
+	struct inode *ea_inode = ext4_iget(parent->i_sb, ea_ino);
+
+	if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
+		int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
+		ext4_error(parent->i_sb, "error while reading EA inode %lu "
+			   "/ %d %d", ea_ino, rc, rc == 0);
+		*err = rc != 0 ? rc : -EIO;
+		if (rc == 0)	/* bad inode, not an ERR_PTR: drop our ref */
+			iput(ea_inode);
+		return NULL;
+	}
+
+	if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
+	    ea_inode->i_generation != parent->i_generation) {
+		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
+			   "to parent invalid.", ea_ino);
+		*err = -EINVAL;
+		goto error;
+	}
+	if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
+		ext4_error(parent->i_sb, "EA inode %lu does not have "
+			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
+		*err = -EINVAL;
+		goto error;
+	}
+
+	*err = 0;
+	return ea_inode;
+error:
+	iput(ea_inode);
+	return NULL;
+}
+
+/*
+ * Read the value from the EA inode.
+ */
+static int
+ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
+		     size_t *size)
+{
+	struct inode *ea_inode = NULL;
+	int err;
+
+	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
+	if (err)
+		return err;
+
+	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	iput(ea_inode);
+
+	return err;
+}
+
 static int
 ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		     void *buffer, size_t buffer_size)
@@ -308,8 +400,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		error = -ERANGE;
 		if (size > buffer_size)
 			goto cleanup;
-		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
-		       size);
+		if (entry->e_value_inum) {
+			error = ext4_xattr_inode_get(inode,
+					     le32_to_cpu(entry->e_value_inum),
+					     buffer, &size);
+			if (error)
+				goto cleanup;
+		} else {
+			memcpy(buffer, bh->b_data +
+			       le16_to_cpu(entry->e_value_offs), size);
+		}
 	}
 	error = size;
 
@@ -350,8 +450,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 		error = -ERANGE;
 		if (size > buffer_size)
 			goto cleanup;
-		memcpy(buffer, (void *)IFIRST(header) +
-		       le16_to_cpu(entry->e_value_offs), size);
+		if (entry->e_value_inum) {
+			error = ext4_xattr_inode_get(inode,
+					     le32_to_cpu(entry->e_value_inum),
+					     buffer, &size);
+			if (error)
+				goto cleanup;
+		} else {
+			memcpy(buffer, (void *)IFIRST(header) +
+			       le16_to_cpu(entry->e_value_offs), size);
+		}
 	}
 	error = size;
 
@@ -620,7 +728,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
 				    size_t *min_offs, void *base, int *total)
 {
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		if (last->e_value_size) {
+		if (!last->e_value_inum && last->e_value_size) {
 			size_t offs = le16_to_cpu(last->e_value_offs);
 			if (offs < *min_offs)
 				*min_offs = offs;
@@ -631,16 +739,173 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
 	return (*min_offs - ((void *)last - base) - sizeof(__u32));
 }
 
-static int
-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+/*
+ * Allocate blocks for @ea_inode and copy @bufsize bytes of @buf into them
+ * through the journal, then set the inode size. Returns 0 or negative errno.
+ */
+static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
+				  const void *buf, int bufsize)
+{
+	struct buffer_head *bh = NULL;
+	unsigned long block = 0;
+	unsigned blocksize = ea_inode->i_sb->s_blocksize;
+	unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
+	int csize, wsize = 0;
+	int ret = 0;
+	int retries = 0;
+
+retry:
+	while (ret >= 0 && ret < max_blocks) {
+		struct ext4_map_blocks map;
+		map.m_lblk = block += ret;
+		map.m_len = max_blocks -= ret;
+
+		ret = ext4_map_blocks(handle, ea_inode, &map,
+				      EXT4_GET_BLOCKS_CREATE);
+		if (ret <= 0) {
+			ext4_mark_inode_dirty(handle, ea_inode);
+			if (ret == -ENOSPC &&
+			    ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
+				ret = 0;
+				goto retry;
+			}
+			break;
+		}
+	}
+
+	if (ret < 0)
+		return ret;
+
+	block = 0;
+	while (wsize < bufsize) {
+		if (bh != NULL)
+			brelse(bh);
+		csize = (bufsize - wsize) > blocksize ? blocksize :
+								bufsize - wsize;
+		bh = ext4_getblk(handle, ea_inode, block, 0);
+		/* no buffer is held here, so bail out without brelse() */
+		if (IS_ERR_OR_NULL(bh))
+			return bh ? PTR_ERR(bh) : -EFSCORRUPTED;
+		ret = ext4_journal_get_write_access(handle, bh);
+		if (ret)
+			goto out;
+
+		memcpy(bh->b_data, buf, csize);
+		set_buffer_uptodate(bh);
+		ext4_handle_dirty_metadata(handle, ea_inode, bh);
+
+		buf += csize;
+		wsize += csize;
+		block += 1;
+	}
+
+	inode_lock(ea_inode);
+	i_size_write(ea_inode, wsize);
+	ext4_update_i_disksize(ea_inode, wsize);
+	inode_unlock(ea_inode);
+
+	ext4_mark_inode_dirty(handle, ea_inode);
+
+out:
+	brelse(bh);
+
+	return ret;
+}
+
+/*
+ * Create an inode to store the value of a large EA.
+ */
+static struct inode *ext4_xattr_inode_create(handle_t *handle,
+					     struct inode *inode)
+{
+	struct inode *ea_inode = NULL;
+
+	/*
+	 * Let the next inode be the goal, so we try and allocate the EA inode
+	 * in the same group, or nearby one.
+	 */
+	ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+				  S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
+	if (!IS_ERR(ea_inode)) {
+		ea_inode->i_op = &ext4_file_inode_operations;
+		ea_inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(ea_inode);
+		ea_inode->i_generation = inode->i_generation;
+		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
+
+		/*
+		 * A back-pointer from EA inode to parent inode will be useful
+		 * for e2fsck.
+		 */
+		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
+		unlock_new_inode(ea_inode);
+	}
+
+	return ea_inode;
+}
+
+/*
+ * Unlink the inode storing the value of the EA.
+ */
+int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+{
+	struct inode *ea_inode = NULL;
+	int err;
+
+	ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
+	if (err)
+		return err;
+
+	clear_nlink(ea_inode);
+	iput(ea_inode);
+
+	return 0;
+}
+
+/*
+ * Add value of the EA in an inode.
+ */
+static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
+				unsigned long *ea_ino, const void *value,
+				size_t value_len)
+{
+	struct inode *ea_inode;
+	int err;
+
+	/* Create an inode for the EA value */
+	ea_inode = ext4_xattr_inode_create(handle, inode);
+	if (IS_ERR(ea_inode))
+		return PTR_ERR(ea_inode);
+
+	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
+	if (err)
+		clear_nlink(ea_inode);
+	else
+		*ea_ino = ea_inode->i_ino;
+
+	iput(ea_inode);
+
+	return err;
+}
+
+static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+				struct ext4_xattr_search *s,
+				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
 	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+	int in_inode = i->in_inode;
+	int rc;
+
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    (EXT4_XATTR_SIZE(i->value_len) >
+	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
+		in_inode = 1;
 
 	/* Compute min_offs and last. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		if (last->e_value_size) {
+		if (!last->e_value_inum && last->e_value_size) {
 			size_t offs = le16_to_cpu(last->e_value_offs);
 			if (offs < min_offs)
 				min_offs = offs;
@@ -648,15 +913,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 	}
 	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
 	if (!s->not_found) {
-		if (s->here->e_value_size) {
+		if (!in_inode &&
+		    !s->here->e_value_inum && s->here->e_value_size) {
 			size_t size = le32_to_cpu(s->here->e_value_size);
 			free += EXT4_XATTR_SIZE(size);
 		}
 		free += EXT4_XATTR_LEN(name_len);
 	}
 	if (i->value) {
-		if (free < EXT4_XATTR_LEN(name_len) +
-			   EXT4_XATTR_SIZE(i->value_len))
+		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+
+		if (in_inode)
+			value_len = 0;
+
+		if (free < EXT4_XATTR_LEN(name_len) + value_len)
 			return -ENOSPC;
 	}
 
@@ -670,7 +940,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 		s->here->e_name_len = name_len;
 		memcpy(s->here->e_name, i->name, name_len);
 	} else {
-		if (s->here->e_value_size) {
+		if (!s->here->e_value_inum && s->here->e_value_size &&
+		    s->here->e_value_offs > 0) {
 			void *first_val = s->base + min_offs;
 			size_t offs = le16_to_cpu(s->here->e_value_offs);
 			void *val = s->base + offs;
@@ -704,12 +975,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 			last = s->first;
 			while (!IS_LAST_ENTRY(last)) {
 				size_t o = le16_to_cpu(last->e_value_offs);
-				if (last->e_value_size && o < offs)
+				if (!last->e_value_inum &&
+				    last->e_value_size && o < offs)
 					last->e_value_offs =
 						cpu_to_le16(o + size);
 				last = EXT4_XATTR_NEXT(last);
 			}
 		}
+		if (s->here->e_value_inum) {
+			ext4_xattr_inode_unlink(inode,
+					    le32_to_cpu(s->here->e_value_inum));
+			s->here->e_value_inum = 0;
+		}
 		if (!i->value) {
 			/* Remove the old name. */
 			size_t size = EXT4_XATTR_LEN(name_len);
@@ -722,11 +999,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 
 	if (i->value) {
 		/* Insert the new value. */
-		s->here->e_value_size = cpu_to_le32(i->value_len);
-		if (i->value_len) {
+		if (in_inode) {
+			unsigned long ea_ino =
+				le32_to_cpu(s->here->e_value_inum);
+			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
+						  i->value, i->value_len);
+			if (rc)
+				goto out;
+			s->here->e_value_inum = cpu_to_le32(ea_ino);
+			s->here->e_value_offs = 0;
+		} else if (i->value_len) {
 			size_t size = EXT4_XATTR_SIZE(i->value_len);
 			void *val = s->base + min_offs - size;
 			s->here->e_value_offs = cpu_to_le16(min_offs - size);
+			s->here->e_value_inum = 0;
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
 				memset(val, 0, size);
 			} else {
@@ -736,8 +1022,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
 				memcpy(val, i->value, i->value_len);
 			}
 		}
+		s->here->e_value_size = cpu_to_le32(i->value_len);
 	}
-	return 0;
+
+out:
+	return rc;
 }
 
 struct ext4_xattr_block_find {
@@ -801,8 +1090,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
-	if (i->value && i->value_len > sb->s_blocksize)
-		return -ENOSPC;
 	if (s->base) {
 		BUFFER_TRACE(bs->bh, "get_write_access");
 		error = ext4_journal_get_write_access(handle, bs->bh);
@@ -821,7 +1108,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			mb_cache_entry_delete_block(ext4_mb_cache, hash,
 						    bs->bh->b_blocknr);
 			ea_bdebug(bs->bh, "modifying in-place");
-			error = ext4_xattr_set_entry(i, s);
+			error = ext4_xattr_set_entry(i, s, handle, inode);
 			if (!error) {
 				if (!IS_LAST_ENTRY(s->first))
 					ext4_xattr_rehash(header(s->base),
@@ -870,7 +1157,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		s->end = s->base + sb->s_blocksize;
 	}
 
-	error = ext4_xattr_set_entry(i, s);
+	error = ext4_xattr_set_entry(i, s, handle, inode);
 	if (error == -EFSCORRUPTED)
 		goto bad_block;
 	if (error)
@@ -1070,7 +1357,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 
 	if (EXT4_I(inode)->i_extra_isize == 0)
 		return -ENOSPC;
-	error = ext4_xattr_set_entry(i, s);
+	error = ext4_xattr_set_entry(i, s, handle, inode);
 	if (error) {
 		if (error == -ENOSPC &&
 		    ext4_has_inline_data(inode)) {
@@ -1082,7 +1369,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 			error = ext4_xattr_ibody_find(inode, i, is);
 			if (error)
 				return error;
-			error = ext4_xattr_set_entry(i, s);
+			error = ext4_xattr_set_entry(i, s, handle, inode);
 		}
 		if (error)
 			return error;
@@ -1098,7 +1385,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
-static int ext4_xattr_ibody_set(struct inode *inode,
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 				struct ext4_xattr_info *i,
 				struct ext4_xattr_ibody_find *is)
 {
@@ -1108,7 +1395,7 @@ static int ext4_xattr_ibody_set(struct inode *inode,
 
 	if (EXT4_I(inode)->i_extra_isize == 0)
 		return -ENOSPC;
-	error = ext4_xattr_set_entry(i, s);
+	error = ext4_xattr_set_entry(i, s, handle, inode);
 	if (error)
 		return error;
 	header = IHDR(inode, ext4_raw_inode(&is->iloc));
@@ -1155,7 +1442,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		.name = name,
 		.value = value,
 		.value_len = value_len,
-
+		.in_inode = 0,
 	};
 	struct ext4_xattr_ibody_find is = {
 		.s = { .not_found = -ENODATA, },
@@ -1204,7 +1491,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	}
 	if (!value) {
 		if (!is.s.not_found)
-			error = ext4_xattr_ibody_set(inode, &i, &is);
+			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
 		else if (!bs.s.not_found)
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
 	} else {
@@ -1215,7 +1502,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
 			goto cleanup;
 
-		error = ext4_xattr_ibody_set(inode, &i, &is);
+		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
 		if (!error && !bs.s.not_found) {
 			i.value = NULL;
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
@@ -1226,11 +1513,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 					goto cleanup;
 			}
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+			if (ext4_has_feature_ea_inode(inode->i_sb) &&
+			    error == -ENOSPC) {
+				/* xattr not fit to block, store at external
+				 * inode */
+				i.in_inode = 1;
+				error = ext4_xattr_ibody_set(handle, inode,
+							     &i, &is);
+			}
 			if (error)
 				goto cleanup;
 			if (!is.s.not_found) {
 				i.value = NULL;
-				error = ext4_xattr_ibody_set(inode, &i, &is);
+				error = ext4_xattr_ibody_set(handle, inode, &i,
+							     &is);
 			}
 		}
 	}
@@ -1269,12 +1565,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 	       const void *value, size_t value_len, int flags)
 {
 	handle_t *handle;
+	struct super_block *sb = inode->i_sb;
 	int error, retries = 0;
 	int credits = ext4_jbd2_credits_xattr(inode);
 
 	error = dquot_initialize(inode);
 	if (error)
 		return error;
+
+	if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
+	    ext4_has_feature_ea_inode(sb)) {
+		int nrblocks = (value_len + sb->s_blocksize - 1) >>
+					sb->s_blocksize_bits;
+
+		/* For new inode */
+		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+
+		/* For data blocks of EA inode */
+		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	}
+
 retry:
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
@@ -1286,7 +1596,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 					      value, value_len, flags);
 		error2 = ext4_journal_stop(handle);
 		if (error == -ENOSPC &&
-		    ext4_should_retry_alloc(inode->i_sb, &retries))
+		    ext4_should_retry_alloc(sb, &retries))
 			goto retry;
 		if (error == 0)
 			error = error2;
@@ -1311,7 +1621,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
 
 	/* Adjust the value offsets of the entries */
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		if (last->e_value_size) {
+		if (!last->e_value_inum && last->e_value_size) {
 			new_offs = le16_to_cpu(last->e_value_offs) +
 							value_offs_shift;
 			last->e_value_offs = cpu_to_le16(new_offs);
@@ -1372,7 +1682,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
 		goto out;
 
 	/* Remove the chosen entry from the inode */
-	error = ext4_xattr_ibody_set(inode, &i, is);
+	error = ext4_xattr_ibody_set(handle, inode, &i, is);
 	if (error)
 		goto out;
 
@@ -1572,21 +1882,135 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 }
 
 
+#define EIA_INCR 16 /* must be 2^n */
+#define EIA_MASK (EIA_INCR - 1)
+/* Add the large xattr @ino into @lea_ino_array for later deletion.
+ * If @lea_ino_array is new or full it will be grown and the old
+ * contents copied over.
+ */
+static int
+ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
+{
+	if (*lea_ino_array == NULL) {
+		/*
+		 * Start with 15 inodes, so it fits into a power-of-two size.
+		 * If *lea_ino_array is NULL, this is essentially offsetof()
+		 */
+		(*lea_ino_array) =
+			kmalloc(offsetof(struct ext4_xattr_ino_array,
+					 xia_inodes[EIA_MASK]),
+				GFP_NOFS);
+		if (*lea_ino_array == NULL)
+			return -ENOMEM;
+		(*lea_ino_array)->xia_count = 0;
+	} else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
+		/* expand the array once all 15 + n * 16 slots are full */
+		struct ext4_xattr_ino_array *new_array = NULL;
+		int count = (*lea_ino_array)->xia_count;
+
+		/* if new_array is NULL, this is essentially offsetof() */
+		new_array = kmalloc(
+				offsetof(struct ext4_xattr_ino_array,
+					 xia_inodes[count + EIA_INCR]),
+				GFP_NOFS);
+		if (new_array == NULL)
+			return -ENOMEM;
+		memcpy(new_array, *lea_ino_array,
+		       offsetof(struct ext4_xattr_ino_array,
+				xia_inodes[count]));
+		kfree(*lea_ino_array);
+		*lea_ino_array = new_array;
+	}
+	(*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
+	return 0;
+}
+
+/**
+ * Add xattr inode to orphan list
+ */
+static int
+ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
+			int credits, struct ext4_xattr_ino_array *lea_ino_array)
+{
+	struct inode *ea_inode = NULL;
+	int idx = 0, error = 0;
+
+	if (lea_ino_array == NULL)
+		return 0;
+
+	for (; idx < lea_ino_array->xia_count; ++idx) {
+		if (!ext4_handle_has_enough_credits(handle, credits)) {
+			error = ext4_journal_extend(handle, credits);
+			if (error > 0)
+				error = ext4_journal_restart(handle, credits);
+
+			if (error != 0) {
+				ext4_warning(inode->i_sb,
+					"couldn't extend journal "
+					"(err %d)", error);
+				return error;
+			}
+		}
+		ea_inode = ext4_xattr_inode_iget(inode,
+				lea_ino_array->xia_inodes[idx], &error);
+		if (error)
+			continue;
+		ext4_orphan_add(handle, ea_inode);
+		/* the inode's i_count will be released by caller */
+	}
+
+	return 0;
+}
 
 /*
  * ext4_xattr_delete_inode()
  *
- * Free extended attribute resources associated with this inode. This
+ * Free extended attribute resources associated with this inode. Traverse
+ * all entries and unlink any xattr inodes associated with this inode. This
  * is called immediately before an inode is freed. We have exclusive
- * access to the inode.
+ * access to the inode. If an orphan inode is deleted it will also delete any
+ * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
+ * to ensure they belong to the parent inode and were not deleted already.
  */
-void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+int
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			struct ext4_xattr_ino_array **lea_ino_array)
 {
 	struct buffer_head *bh = NULL;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	struct ext4_xattr_entry *entry;
+	int credits = 3, error = 0;
 
-	if (!EXT4_I(inode)->i_file_acl)
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		goto delete_external_ea;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		goto cleanup;
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		if (ext4_expand_ino_array(lea_ino_array,
+					  entry->e_value_inum) != 0) {
+			brelse(iloc.bh);
+			goto cleanup;
+		}
+		entry->e_value_inum = 0;
+	}
+	brelse(iloc.bh);
+
+delete_external_ea:
+	if (!EXT4_I(inode)->i_file_acl) {
+		/* add xattr inode to orphan list */
+		ext4_xattr_inode_orphan_add(handle, inode, credits,
+						*lea_ino_array);
 		goto cleanup;
+	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
 		EXT4_ERROR_INODE(inode, "block %llu read error",
@@ -1599,11 +2023,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
+
+	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		if (ext4_expand_ino_array(lea_ino_array,
+					  entry->e_value_inum) != 0)
+			goto cleanup;
+		entry->e_value_inum = 0;
+	}
+
+	/* add xattr inode to orphan list */
+	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
+					*lea_ino_array);
+	if (error != 0)
+		goto cleanup;
+
+	if (!IS_NOQUOTA(inode))
+		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
+
+	if (!ext4_handle_has_enough_credits(handle, credits)) {
+		error = ext4_journal_extend(handle, credits);
+		if (error > 0)
+			error = ext4_journal_restart(handle, credits);
+		if (error != 0) {
+			ext4_warning(inode->i_sb,
+				"couldn't extend journal (err %d)", error);
+			goto cleanup;
+		}
+	}
+
 	ext4_xattr_release_block(handle, inode, bh);
 	EXT4_I(inode)->i_file_acl = 0;
 
 cleanup:
 	brelse(bh);
+
+	return error;
+}
+
+void
+ext4_xattr_inode_array_free(struct inode *inode,
+			    struct ext4_xattr_ino_array *lea_ino_array)
+{
+	struct inode	*ea_inode = NULL;
+	int		idx = 0;
+	int		err;
+
+	if (lea_ino_array == NULL)
+		return;
+
+	for (; idx < lea_ino_array->xia_count; ++idx) {
+		ea_inode = ext4_xattr_inode_iget(inode,
+				lea_ino_array->xia_inodes[idx], &err);
+		if (err)
+			continue;
+		/* for inode's i_count get from ext4_xattr_delete_inode */
+		if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
+			iput(ea_inode);
+		clear_nlink(ea_inode);
+		iput(ea_inode);
+	}
+	kfree(lea_ino_array);
 }
 
 /*
@@ -1655,10 +2137,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 		    entry1->e_name_index != entry2->e_name_index ||
 		    entry1->e_name_len != entry2->e_name_len ||
 		    entry1->e_value_size != entry2->e_value_size ||
+		    entry1->e_value_inum != entry2->e_value_inum ||
 		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
 			return 1;
-		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-			return -EFSCORRUPTED;
 		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
 			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
 			   le32_to_cpu(entry1->e_value_size)))
@@ -1730,7 +2211,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
 		       *name++;
 	}
 
-	if (entry->e_value_size != 0) {
+	if (!entry->e_value_inum && entry->e_value_size) {
 		__le32 *value = (__le32 *)((char *)header +
 			le16_to_cpu(entry->e_value_offs));
 		for (n = (le32_to_cpu(entry->e_value_size) +
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 099c8b670ef5..6e10ff9393d4 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -44,7 +44,7 @@ struct ext4_xattr_entry {
 	__u8	e_name_len;	/* length of name */
 	__u8	e_name_index;	/* attribute name index */
 	__le16	e_value_offs;	/* offset in disk block of value */
-	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
+	__le32	e_value_inum;	/* inode in which the value is stored */
 	__le32	e_value_size;	/* size of attribute value */
 	__le32	e_hash;		/* hash value of name and value */
 	char	e_name[0];	/* attribute name */
@@ -69,6 +69,26 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
+/*
+ * Link the EA inode back to its parent using the i_mtime field.
+ * The extra integer type conversion ignores the higher bits in
+ * i_mtime.tv_sec, which might be set by ext4_iget().
+ */
+#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
+do {                                                  \
+      (inode)->i_mtime.tv_sec = inum;                 \
+} while(0)
+
+#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
+((__u32)(inode)->i_mtime.tv_sec)
+
+/*
+ * The minimum size of EA value when you start storing it in an external inode
+ * size of block - size of header - size of 1 entry - 4 null bytes
+*/
+#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)					\
+	((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
+
 #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
 #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
 #define BFIRST(bh) ENTRY(BHDR(bh)+1)
@@ -77,10 +97,11 @@ struct ext4_xattr_entry {
 #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
 
 struct ext4_xattr_info {
-	int name_index;
 	const char *name;
 	const void *value;
 	size_t value_len;
+	int name_index;
+	int in_inode;
 };
 
 struct ext4_xattr_search {
@@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
 
-extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
+					   int *err);
+extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
+extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+				   struct ext4_xattr_ino_array **array);
+extern void ext4_xattr_inode_array_free(struct inode *inode,
+					struct ext4_xattr_ino_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.13.0.219.gdb65acc882-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 27/28] ext4: xattr inode deduplication
  2017-05-31 22:33     ` [PATCH v2 " Tahsin Erdogan
  2017-06-02  5:41         ` Darrick J. Wong
@ 2017-06-02  5:41         ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-06-02  5:41 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed, May 31, 2017 at 03:33:57PM -0700, Tahsin Erdogan wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a best
> effort sharing scheme. When an xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2:
>  - make dependency on crc32c dynamic
>  - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
>    they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
> 
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |   22 +-
>  fs/ext4/inode.c |    9 +-
>  fs/ext4/super.c |   25 +-
>  fs/ext4/xattr.c | 1075 +++++++++++++++++++++++++++++++++++++++++++------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 893 insertions(+), 269 deletions(-)
> 
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  	if (error)
>  		return error;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, acl_size);
> +	error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..7ceb1f81e4b8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>  	long s_es_nr_inode;
>  	struct ext4_es_stats s_es_stats;
>  	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;
>  	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>  
>  	/* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>  	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>  
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +	return IS_NOQUOTA(inode) &&
> +	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>  
>  /*
>   * This structure is stuffed into the struct file's private_data field
> @@ -2709,19 +2714,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
>  extern int ext4_register_li_request(struct super_block *sb,
>  				    ext4_group_t first_not_zeroed);
>  
> -static inline int ext4_has_group_desc_csum(struct super_block *sb)
> -{
> -	return ext4_has_feature_gdt_csum(sb) ||
> -	       EXT4_SB(sb)->s_chksum_driver != NULL;
> -}
> -
>  static inline int ext4_has_metadata_csum(struct super_block *sb)
>  {
>  	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
>  		     !EXT4_SB(sb)->s_chksum_driver);
>  
> -	return (EXT4_SB(sb)->s_chksum_driver != NULL);
> +	return ext4_has_feature_metadata_csum(sb) &&
> +	       (EXT4_SB(sb)->s_chksum_driver != NULL);
>  }
> +
> +static inline int ext4_has_group_desc_csum(struct super_block *sb)
> +{
> +	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
> +}
> +
>  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>  {
>  	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 4d6936f0d8a4..6f5872197d6c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	}
>  	brelse(iloc.bh);
>  	ext4_set_inode_flags(inode);
> -	if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +	if (ei->i_flags & EXT4_EA_INODE_FL) {
>  		ext4_xattr_inode_set_class(inode);
> +
> +		inode_lock(inode);
> +		inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(inode);
> +	}
> +
>  	unlock_new_inode(inode);
>  	return inode;
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..9fcd29e21dc7 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>  		invalidate_bdev(sbi->journal_bdev);
>  		ext4_blkdev_remove(sbi);
>  	}
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>  	if (res)
>  		return res;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, len);
> +	res = ext4_xattr_set_credits(inode, len, &credits);
> +	if (res)
> +		return res;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	}
>  
>  	/* Load the checksum driver */
> -	if (ext4_has_feature_metadata_csum(sb)) {
> +	if (ext4_has_feature_metadata_csum(sb) ||
> +	    ext4_has_feature_ea_inode(sb)) {
>  		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
>  		if (IS_ERR(sbi->s_chksum_driver)) {
>  			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
> @@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount_wq;
>  	}
>  
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}
> +
>  	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>  	    (blocksize != PAGE_SIZE)) {
>  		ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4313,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (EXT4_SB(sb)->rsv_conversion_wq)
>  		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 6acce1f689ab..4c394411bf6f 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -79,6 +79,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
>  			    struct mb_cache_entry **);
>  static void ext4_xattr_rehash(struct ext4_xattr_header *,
>  			      struct ext4_xattr_entry *);
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
>  
>  static const struct xattr_handler * const ext4_xattr_handler_map[] = {
>  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
> @@ -105,13 +106,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  	NULL
>  };
>  
> +#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
> +
>  #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
>  				inode->i_sb->s_fs_info)->s_mb_cache)
>  
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>  			struct inode *inode);
>  
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash);
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -329,14 +340,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  		goto error;
>  	}
>  
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}
> -
>  	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>  		ext4_error(parent->i_sb, "EA inode %lu does not have "
>  			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> @@ -351,6 +354,12 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>  
> +static u32
> +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
> +{
> +	return ext4_chksum(sbi, 0, buffer, size);

Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
that the metadata checksum value also has the fs uuid stamped into it.
That way if we ever encounter a piece of metadata we can positively
confirm that it belongs to this filesystem (vs. a piece of metadata that
came from a previous ext4 that had been written to the disk) or discard
it as being a ghost from an old iteration.  For xattrs I think we were
also baking in either the owning inode number (refcount == 1) or the
block number (refcount > 1) so that there's some redundant parent
pointer information encoded in the checksum too.

Even if you dismiss that, we usually follow the convention of
initializing the crc32c calculation with (~0U), not (0U), to strengthen
crc32c's ability to detect zeroes being injected at the start of the
stream.

--D

> +}
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +367,53 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>  		     size_t size)
>  {
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>  	struct inode *ea_inode;
> -	int ret;
> +	u32 hash, calc_hash;
> +	struct mb_cache_entry *ce;
> +	int err;
>  
> -	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (ret)
> -		return ret;
> +	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +	if (err) {
> +		ea_inode = NULL;
> +		goto out;
> +	}
>  
> -	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -	iput(ea_inode);
> +	if (i_size_read(ea_inode) != size) {
> +		ext4_warning_inode(ea_inode,
> +				   "ea_inode file size=%llu entry size=%zu",
> +				   i_size_read(ea_inode), size);
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
>  
> -	return ret;
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	if (!err) {
> +		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
> +			goto out;
> +
> +		/* Avoid hash calculation if already cached. */
> +		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
> +		if (ce) {
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			goto out;
> +		}
> +
> +		calc_hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), buffer,
> +						  size);
> +		if (hash != calc_hash) {
> +			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
> +					   "does not match calc_hash=%#x",
> +					   hash, calc_hash);
> +			goto out;
> +		}
> +
> +		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +				      ea_inode->i_ino, true /* reusable */);
> +	}
> +out:
> +	iput(ea_inode);
> +	return err;
>  }
>  
>  static int
> @@ -657,6 +702,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +				    inode->i_blkbits);
> +	size_t mask = ~(cluster_size - 1);
> +
> +	return (length + cluster_size - 1) & mask;
> +}
> +
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +	int err;
> +
> +	err = dquot_alloc_inode(inode);
> +	if (err)
> +		return err;
> +	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
> +	if (err)
> +		dquot_free_inode(inode);
> +	return err;
> +}
> +
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
> +	dquot_free_inode(inode);
> +}
> +
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	int credits;
> +	int blocks;
> +
> +	/*
> +	 * 1) Owner inode update
> +	 * 2) Ref count update on old xattr block
> +	 * 3) new xattr block
> +	 * 4) block bitmap update for new xattr block
> +	 * 5) group descriptor for new xattr block
> +	 */
> +	credits = 5;
> +
> +	/* We are done if ea_inode feature is not enabled. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +
> +	/* Data blocks. */
> +	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +	/* Indirection block. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Blocks themselves. */
> +	credits += blocks;
> +
> +	/* Dereference ea_inode holding old xattr value.
> +	 * Old ea_inode, inode map, block bitmap, group descriptor.
> +	 */
> +	credits += 4;
> +
> +	/* Data blocks for old ea_inode. */
> +	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +	/* Indirection block for old ea_inode. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	/* We may need to clone the existing xattr block in which case we need
> +	 * to increment ref counts for existing ea_inodes referenced by it.
> +	 */
> +	if (block_bh) {
> +		struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				/* Ref count update on ea_inode. */
> +				credits += 1;
> +	}
> +	return credits;
> +}
> +
>  int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  			      int credits, struct buffer_head *bh,
>  			      bool dirty, bool block_csum)
> @@ -706,12 +846,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +				       int ref_change)
> +{
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +	struct ext4_iloc iloc;
> +	s64 ref_return;
> +	u32 hash;
> +	int ret;
> +
> +	inode_lock(ea_inode);
> +
> +	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +	if (ret) {
> +		iloc.bh = NULL;
> +		goto out;
> +	}
> +
> +	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
> +					&hash);
> +	if (ret)
> +		goto out;
> +
> +	if (ref_change > 0) {
> +		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 1) {
> +			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			set_nlink(ea_inode, 1);
> +			ext4_orphan_del(handle, ea_inode);
> +
> +			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +					      ea_inode->i_ino,
> +					      true /* reusable */);
> +		}
> +	} else {
> +		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 0) {
> +			WARN_ONCE(ea_inode->i_nlink != 1,
> +				  "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			clear_nlink(ea_inode);
> +			ext4_orphan_add(handle, ea_inode);
> +
> +			mb_cache_entry_delete(ea_inode_cache, hash,
> +					      ea_inode->i_ino);
> +		}
> +	}
> +
> +	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +	iloc.bh = NULL;
> +	if (ret)
> +		ext4_warning_inode(ea_inode,
> +				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +	brelse(iloc.bh);
> +	inode_unlock(ea_inode);
> +	return ret;
> +}
> +
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
> +}
> +
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
> +}
> +
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +					struct ext4_xattr_entry *first)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	struct ext4_xattr_entry *failed_entry;
> +	unsigned int ea_ino;
> +	int err, saved_err;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			goto cleanup;
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "inc ref error %d", err);
> +			iput(ea_inode);
> +			goto cleanup;
> +		}
> +		iput(ea_inode);
> +	}
> +	return 0;
> +
> +cleanup:
> +	saved_err = err;
> +	failed_entry = entry;
> +
> +	for (entry = first; entry != failed_entry;
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err) {
> +			ext4_warning(parent->i_sb,
> +				     "cleanup ea_ino %u iget error %d", ea_ino,
> +				     err);
> +			continue;
> +		}
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err)
> +			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +					   err);
> +		iput(ea_inode);
> +	}
> +	return saved_err;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -			    struct buffer_head *bh,
> -			    struct ext4_xattr_entry *first, bool block_csum,
> -			    struct ext4_xattr_inode_array **ea_inode_array,
> -			    int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +			     struct buffer_head *bh,
> +			     struct ext4_xattr_entry *first, bool block_csum,
> +			     struct ext4_xattr_inode_array **ea_inode_array,
> +			     int extra_credits, bool skip_quota)
>  {
>  	struct inode *ea_inode;
>  	struct ext4_xattr_entry *entry;
> @@ -748,10 +1015,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>  			continue;
>  		}
>  
> -		inode_lock(ea_inode);
> -		clear_nlink(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		if (!skip_quota)
> +			ext4_xattr_inode_free_quota(parent,
> +					      le32_to_cpu(entry->e_value_size));
>  
>  		/*
>  		 * Forget about ea_inode within the same transaction that decrements the ref
> @@ -784,7 +1057,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -			 struct buffer_head *bh)
> +			 struct buffer_head *bh,
> +			 struct ext4_xattr_inode_array **ea_inode_array,
> +			 int extra_credits)
>  {
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>  	u32 hash, ref;
> @@ -807,6 +1082,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
> +
> +		if (ext4_has_feature_ea_inode(inode->i_sb))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +						     BFIRST(bh),
> +						     true /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     true /* skip_quota */);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
>  				 EXT4_FREE_BLOCKS_METADATA |
>  				 EXT4_FREE_BLOCKS_FORGET);
> @@ -947,7 +1230,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -					     struct inode *inode)
> +					     struct inode *inode, u32 hash)
>  {
>  	struct inode *ea_inode = NULL;
>  	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -965,67 +1248,119 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>  		ea_inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(ea_inode);
>  		ext4_xattr_inode_set_class(ea_inode);
> -		ea_inode->i_generation = inode->i_generation;
> -		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -		/*
> -		 * A back-pointer from EA inode to parent inode will be useful
> -		 * for e2fsck.
> -		 */
> -		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>  		unlock_new_inode(ea_inode);
>  		err = ext4_inode_attach_jinode(ea_inode);
> +		if (!err)
> +			err = ext4_xattr_inode_init(handle, ea_inode, hash);
>  		if (err) {
>  			iput(ea_inode);
>  			return ERR_PTR(err);
>  		}
> +
> +		/*
> +		 * Xattr inodes are shared therefore quota charging is performed
> +		 * at a higher level.
> +		 */
> +		dquot_free_inode(ea_inode);
> +		dquot_drop(ea_inode);
> +		inode_lock(ea_inode);
> +		ea_inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(ea_inode);
>  	}
>  
>  	return ea_inode;
>  }
>  
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
>  {
> -	struct inode *ea_inode = NULL;
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
>  	int err;
>  
> -	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (err)
> -		return err;
> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			ea_inode = NULL;
> +			goto next;
> +		}
>  
> -	clear_nlink(ea_inode);
> -	iput(ea_inode);
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
>  
> -	return 0;
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +	next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
>  }
>  
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
>  {
>  	struct inode *ea_inode;
> +	u32 hash;
>  	int err;
>  
> +	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
>  	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>  	if (IS_ERR(ea_inode))
>  		return PTR_ERR(ea_inode);
>  
>  	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
>  
> -	iput(ea_inode);
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);
>  
> -	return err;
> +	*ret_inode = ea_inode;
> +	return 0;
>  }
>  
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1033,11 +1368,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
> +	struct ext4_xattr_entry *here = s->here;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>  	int in_inode = i->in_inode;
> -	int rc;
> +	struct inode *old_ea_inode = NULL;
> +	struct inode *new_ea_inode = NULL;
> +	int ret;
>  
> -	/* Compute min_offs and last. */
> +	/*
> +	 * Optimization for the simple case when old and new values have the
> +	 * same padded sizes. Not applicable if the existing value is stored in
> +	 * an external inode.
> +	 */
> +	if (i->value && !s->not_found && !here->e_value_inum &&
> +	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
> +	    EXT4_XATTR_SIZE(i->value_len)) {
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(i->value_len);
> +
> +		here->e_value_size = cpu_to_le32(i->value_len);
> +		if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +			memset(val, 0, size);
> +		} else {
> +			memcpy(val, i->value, i->value_len);
> +			/* Clear padding bytes. */
> +			memset(val + i->value_len, 0, size - i->value_len);
> +		}
> +		return 0;
> +	}
> +
> +	/* Find out min_offs and last to calculate the free space. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
>  		if (!last->e_value_inum && last->e_value_size) {
> @@ -1048,120 +1409,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (!in_inode &&
> -		    !s->here->e_value_inum && s->here->e_value_size) {
> -			size_t size = le32_to_cpu(s->here->e_value_size);
> +		if (!here->e_value_inum && here->e_value_size) {
> +			size_t size = le32_to_cpu(here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
>  
> -		if (in_inode)
> -			value_len = 0;
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +	}
>  
> -		if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -			return -ENOSPC;
> +	/*
> +	 * Getting access to old and new ea inodes is subject to failures.
> +	 * Finish that work before doing any modifications to the xattr data.
> +	 */
> +	if (!s->not_found && here->e_value_inum) {
> +		ret = ext4_xattr_inode_iget(inode,
> +		 			    le32_to_cpu(here->e_value_inum),
> +		 			    &old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
>  	}
> +	if (i->value && in_inode) {
> +		WARN_ON_ONCE(!i->value_len);
>  
> -	if (i->value && s->not_found) {
> -		/* Insert the new name. */
> -		size_t size = EXT4_XATTR_LEN(name_len);
> -		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -		memmove((void *)s->here + size, s->here, rest);
> -		memset(s->here, 0, size);
> -		s->here->e_name_index = i->name_index;
> -		s->here->e_name_len = name_len;
> -		memcpy(s->here->e_name, i->name, name_len);
> -	} else {
> -		if (!s->here->e_value_inum && s->here->e_value_size &&
> -		    s->here->e_value_offs > 0) {
> -			void *first_val = s->base + min_offs;
> -			size_t offs = le16_to_cpu(s->here->e_value_offs);
> -			void *val = s->base + offs;
> -			size_t size = EXT4_XATTR_SIZE(
> -				le32_to_cpu(s->here->e_value_size));
> -
> -			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -				/* The old and the new value have the same
> -				   size. Just replace. */
> -				s->here->e_value_size =
> -					cpu_to_le32(i->value_len);
> -				if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -					memset(val, 0, size);
> -				} else {
> -					/* Clear pad bytes first. */
> -					memset(val + size - EXT4_XATTR_PAD, 0,
> -					       EXT4_XATTR_PAD);
> -					memcpy(val, i->value, i->value_len);
> -				}
> -				return 0;
> -			}
> +		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +		if (ret)
> +			goto out;
>  
> -			/* Remove the old value. */
> -			memmove(first_val + size, first_val, val - first_val);
> -			memset(first_val, 0, size);
> -			s->here->e_value_size = 0;
> -			s->here->e_value_offs = 0;
> -			min_offs += size;
> -
> -			/* Adjust all value offsets. */
> -			last = s->first;
> -			while (!IS_LAST_ENTRY(last)) {
> -				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (!last->e_value_inum &&
> -				    last->e_value_size && o < offs)
> -					last->e_value_offs =
> -						cpu_to_le16(o + size);
> -				last = EXT4_XATTR_NEXT(last);
> -			}
> +		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +						     i->value_len,
> +						     &new_ea_inode);
> +		if (ret) {
> +			new_ea_inode = NULL;
> +			ext4_xattr_inode_free_quota(inode, i->value_len);
> +			goto out;
>  		}
> -		if (s->here->e_value_inum) {
> -			ext4_xattr_inode_unlink(inode,
> -					    le32_to_cpu(s->here->e_value_inum));
> -			s->here->e_value_inum = 0;
> +	}
> +
> +	if (old_ea_inode) {
> +		/* We are ready to release ref count on the old_ea_inode. */
> +		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +		if (ret) {
> 			/* Release newly acquired ref count on new_ea_inode. */
> +			if (new_ea_inode) {
> +				int err;
> +
> +				err = ext4_xattr_inode_dec_ref(handle,
> +							       new_ea_inode);
> +				if (err)
> +					ext4_warning_inode(new_ea_inode,
> +						  "dec ref new_ea_inode err=%d",
> +						  err);
> +				ext4_xattr_inode_free_quota(inode,
> +							    i->value_len);
> +			}
> +			goto out;
>  		}
> -		if (!i->value) {
> -			/* Remove the old name. */
> -			size_t size = EXT4_XATTR_LEN(name_len);
> -			last = ENTRY((void *)last - size);
> -			memmove(s->here, (void *)s->here + size,
> -				(void *)last - (void *)s->here + sizeof(__u32));
> -			memset(last, 0, size);
> +
> +		ext4_xattr_inode_free_quota(inode,
> +					    le32_to_cpu(here->e_value_size));
> +	}
> +
> +	/* No failures allowed past this point. */
> +
> +	if (!s->not_found && here->e_value_offs) {
> +		/* Remove the old value. */
> +		void *first_val = s->base + min_offs;
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(
> +			le32_to_cpu(here->e_value_size));
> +
> +		memmove(first_val + size, first_val, val - first_val);
> +		memset(first_val, 0, size);
> +		min_offs += size;
> +
> +		/* Adjust all value offsets. */
> +		last = s->first;
> +		while (!IS_LAST_ENTRY(last)) {
> +			size_t o = le16_to_cpu(last->e_value_offs);
> +			if (!last->e_value_inum &&
> +			    last->e_value_size && o < offs)
> +				last->e_value_offs =
> +					cpu_to_le16(o + size);
> +			last = EXT4_XATTR_NEXT(last);
>  		}
>  	}
>  
> +	if (!s->not_found && !i->value) {
> +		/* Remove old name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		last = ENTRY((void *)last - size);
> +		memmove(here, (void *)here + size,
> +			(void *)last - (void *)here + sizeof(__u32));
> +		memset(last, 0, size);
> +	} else if (s->not_found && i->value) {
> +		/* Insert new name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +		memmove((void *)here + size, here, rest);
> +		memset(here, 0, size);
> +		here->e_name_index = i->name_index;
> +		here->e_name_len = name_len;
> +		memcpy(here->e_name, i->name, name_len);
> +	} else {
> +		WARN_ON_ONCE(s->not_found || !i->value);
> +		/* This is an update, reset value info. */
> +		here->e_value_inum = 0;
> +		here->e_value_offs = 0;
> +		here->e_value_size = 0;
> +	}
> +
>  	if (i->value) {
> -		/* Insert the new value. */
> +		/* Insert new value. */
>  		if (in_inode) {
> -			unsigned long ea_ino =
> -				le32_to_cpu(s->here->e_value_inum);
> -			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -						  i->value, i->value_len);
> -			if (rc)
> -				goto out;
> -			s->here->e_value_inum = cpu_to_le32(ea_ino);
> -			s->here->e_value_offs = 0;
> +			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>  		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
> -			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -			s->here->e_value_inum = 0;
> +			here->e_value_offs = cpu_to_le16(min_offs - size);
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> -				/* Clear the pad bytes first. */
> -				memset(val + size - EXT4_XATTR_PAD, 0,
> -				       EXT4_XATTR_PAD);
>  				memcpy(val, i->value, i->value_len);
> +				/* Clear padding bytes. */
> +				memset(val + i->value_len, 0,
> +				       size - i->value_len);
>  			}
>  		}
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> +		here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -
> +	ret = 0;
>  out:
> -	return rc;
> +	iput(old_ea_inode);
> +	iput(new_ea_inode);
> +	return ret;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -1223,6 +1613,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  	struct mb_cache_entry *ce = NULL;
>  	int error = 0;
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +	struct inode *ea_inode = NULL;
> +	size_t old_ea_inode_size = 0;
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> @@ -1277,6 +1669,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			header(s->base)->h_refcount = cpu_to_le32(1);
>  			s->here = ENTRY(s->base + offset);
>  			s->end = s->base + bs->bh->b_size;
> +
> +			/*
> +			 * If existing entry points to an xattr inode, we need
> +			 * to prevent ext4_xattr_set_entry() from decrementing
> +			 * ref count on it because the reference belongs to the
> +			 * original block. In this case, make the entry look
> +			 * like it has an empty value.
> +			 */
> +			if (!s->not_found && s->here->e_value_inum) {
> +				/*
> +				 * Defer quota free call for previous inode
> +				 * until success is guaranteed.
> +				 */
> +				old_ea_inode_size = le32_to_cpu(
> +							s->here->e_value_size);
> +				s->here->e_value_inum = 0;
> +				s->here->e_value_size = 0;
> +			}
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> @@ -1298,6 +1708,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		goto bad_block;
>  	if (error)
>  		goto cleanup;
> +
> +	if (i->value && s->here->e_value_inum) {
> +		unsigned int ea_ino;
> +
> +		/*
> +		 * A ref count on ea_inode has been taken as part of the call to
> +		 * ext4_xattr_set_entry() above. We would like to drop this
> +		 * extra ref but we have to wait until the xattr block is
> +		 * initialized and has its own ref count on the ea_inode.
> +		 */
> +		ea_ino = le32_to_cpu(s->here->e_value_inum);
> +		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +		if (error) {
> +			ea_inode = NULL;
> +			goto cleanup;
> +		}
> +	}
> +
>  	if (!IS_LAST_ENTRY(s->first))
>  		ext4_xattr_rehash(header(s->base), s->here);
>  
> @@ -1408,6 +1836,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  						 EXT4_FREE_BLOCKS_METADATA);
>  				goto cleanup;
>  			}
> +			error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +						      ENTRY(header(s->base)+1));
> +			if (error)
> +				goto getblk_failed;
> +			if (ea_inode) {
> +				/* Drop the extra ref on ea_inode. */
> +				error = ext4_xattr_inode_dec_ref(handle,
> +								 ea_inode);
> +				if (error)
> +					ext4_warning_inode(ea_inode,
> +							   "dec ref error=%d",
> +							   error);
> +				iput(ea_inode);
> +				ea_inode = NULL;
> +			}
> +
>  			lock_buffer(new_bh);
>  			error = ext4_journal_get_create_access(handle, new_bh);
>  			if (error) {
> @@ -1427,15 +1871,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		}
>  	}
>  
> +	if (old_ea_inode_size)
> +		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>  	/* Update the inode. */
>  	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>  
>  	/* Drop the previous xattr block. */
> -	if (bs->bh && bs->bh != new_bh)
> -		ext4_xattr_release_block(handle, inode, bs->bh);
> +	if (bs->bh && bs->bh != new_bh) {
> +		struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +		ext4_xattr_release_block(handle, inode, bs->bh,
> +					 &ea_inode_array,
> +					 0 /* extra_credits */);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +	}
>  	error = 0;
>  
>  cleanup:
> +	if (ea_inode) {
> +		int error2;
> +		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (error2)
> +			ext4_warning_inode(ea_inode, "dec ref error=%d",
> +					   error2);
> +
> +		/* If there was an error, revert the quota charge. */
> +		if (error)
> +			ext4_xattr_inode_free_quota(inode,
> +						    i_size_read(ea_inode));
> +		iput(ea_inode);
> +	}
>  	if (ce)
>  		mb_cache_entry_put(ext4_mb_cache, ce);
>  	brelse(new_bh);
> @@ -1546,6 +2011,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +struct ext4_xattr_ea_info {
> +	__le64 ref_count;	/* number of xattr entry references */
> +	__le32 hash;		/* crc32c hash of xattr data */
> +	__le32 reserved;	/* reserved, must be 0 */
> +};
> +
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash)
> +{
> +	struct ext4_xattr_ea_info ea_info = {
> +		.ref_count = cpu_to_le64(1),
> +		.hash = cpu_to_le32(hash),
> +		.reserved = 0,
> +	};
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
> +}
> +
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash)
> +{
> +	struct ext4_xattr_ea_info ea_info;
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	memcpy(&ea_info,
> +	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
> +	       sizeof(ea_info));
> +
> +	if (hash)
> +		*hash = le32_to_cpu(ea_info.hash);
> +
> +	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
> +	ea_info.ref_count = cpu_to_le64(*ref_return);
> +
> +	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
> +}
> +
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
> +{
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	struct ext4_xattr_ea_info *ea_info;
> +	void *ptr;
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
> +	ea_info = (struct ext4_xattr_ea_info *)ptr;
> +
> +	if (WARN_ON(ea_info->reserved != 0))
> +		return -EFSCORRUPTED;
> +
> +	*hash = le32_to_cpu(ea_info->hash);
> +	return 0;
> +}
> +
>  static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  				 struct ext4_xattr_info *i)
>  {
> @@ -1560,6 +2136,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  	return !memcmp(value, i->value, i->value_len);
>  }
>  
> +struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +	struct buffer_head *bh;
> +	int error;
> +
> +	if (!EXT4_I(inode)->i_file_acl)
> +		return NULL;
> +	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +	if (!bh)
> +		return ERR_PTR(-EIO);
> +	error = ext4_xattr_check_block(inode, bh);
> +	if (error)
> +		return ERR_PTR(error);
> +	return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1602,9 +2194,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  
>  	/* Check journal credits under write lock. */
>  	if (ext4_handle_valid(handle)) {
> +		struct buffer_head *bh;
>  		int credits;
>  
> -		credits = ext4_xattr_set_credits(inode, value_len);
> +		bh = ext4_xattr_get_block(inode);
> +		if (IS_ERR(bh)) {
> +			error = PTR_ERR(bh);
> +			goto cleanup;
> +		}
> +
> +		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +
>  		if (!ext4_handle_has_enough_credits(handle, credits)) {
>  			error = -ENOSPC;
>  			goto cleanup;
> @@ -1640,6 +2241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (flags & XATTR_CREATE)
>  			goto cleanup;
>  	}
> +
>  	if (!value) {
>  		if (!is.s.not_found)
>  			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1708,34 +2310,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	return error;
>  }
>  
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -	struct super_block *sb = inode->i_sb;
> -	int credits;
> -
> -	if (!EXT4_SB(sb)->s_journal)
> -		return 0;
> -
> -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +	struct buffer_head *bh;
> +	int err;
>  
> -	/*
> -	 * In case of inline data, we may push out the data to a block,
> -	 * so we need to reserve credits for this eventuality
> -	 */
> -	if (ext4_has_inline_data(inode))
> -	        credits += ext4_writepage_trans_blocks(inode) + 1;
> +	*credits = 0;
>  
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -					sb->s_blocksize_bits;
> +	if (!EXT4_SB(inode->i_sb)->s_journal)
> +		return 0;
>  
> -		/* For new inode */
> -		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +	down_read(&EXT4_I(inode)->xattr_sem);
>  
> -		/* For data blocks of EA inode */
> -		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	bh = ext4_xattr_get_block(inode);
> +	if (IS_ERR(bh)) {
> +		err = PTR_ERR(bh);
> +	} else {
> +		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +		err = 0;
>  	}
> -	return credits;
> +
> +	up_read(&EXT4_I(inode)->xattr_sem);
> +	return err;
>  }
>  
>  /*
> @@ -1760,7 +2357,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  		return error;
>  
>  retry:
> -	credits = ext4_xattr_set_credits(inode, value_len);
> +	error = ext4_xattr_set_credits(inode, value_len, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2066,10 +2666,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  	return error;
>  }
>  
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2114,21 +2714,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array,
> -			int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
>  {
>  	struct buffer_head *bh = NULL;
>  	struct ext4_xattr_ibody_header *header;
> -	struct ext4_inode *raw_inode;
>  	struct ext4_iloc iloc = { .bh = NULL };
> +	struct ext4_xattr_entry *entry;
>  	int error;
>  
>  	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2140,66 +2738,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  		goto cleanup;
>  	}
>  
> -	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -		goto delete_external_ea;
> -
> -	error = ext4_get_inode_loc(inode, &iloc);
> -	if (error)
> -		goto cleanup;
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>  
> -	error = ext4_journal_get_write_access(handle, iloc.bh);
> -	if (error)
> -		goto cleanup;
> +		error = ext4_get_inode_loc(inode, &iloc);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +			goto cleanup;
> +		}
>  
> -	raw_inode = ext4_raw_inode(&iloc);
> -	header = IHDR(inode, raw_inode);
> -	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -				    false /* block_csum */, ea_inode_array,
> -				    extra_credits);
> +		error = ext4_journal_get_write_access(handle, iloc.bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "write access (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  
> -delete_external_ea:
> -	if (!EXT4_I(inode)->i_file_acl) {
> -		error = 0;
> -		goto cleanup;
> -	}
> -	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -	if (!bh) {
> -		EXT4_ERROR_INODE(inode, "block %llu read error",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EIO;
> -		goto cleanup;
> -	}
> -	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -		EXT4_ERROR_INODE(inode, "bad block %llu",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EFSCORRUPTED;
> -		goto cleanup;
> +		header = IHDR(inode, ext4_raw_inode(&iloc));
> +		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +						     IFIRST(header),
> +						     false /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     false /* skip_quota */);
>  	}
>  
> -	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -		error = ext4_journal_get_write_access(handle, bh);
> -		if (error) {
> -			EXT4_ERROR_INODE(inode, "write access %llu",
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			EXT4_ERROR_INODE(inode, "block %llu read error",
>  					 EXT4_I(inode)->i_file_acl);
> +			error = -EIO;
> +			goto cleanup;
> +		}
> +		error = ext4_xattr_check_block(inode, bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +					 EXT4_I(inode)->i_file_acl, error);
>  			goto cleanup;
>  		}
> -		ext4_xattr_inode_remove_all(handle, inode, bh,
> -					    BFIRST(bh),
> -					    true /* block_csum */,
> -					    ea_inode_array,
> -					    extra_credits);
> -	}
>  
> -	ext4_xattr_release_block(handle, inode, bh);
> -	/* Update i_file_acl within the same transaction that releases block. */
> -	EXT4_I(inode)->i_file_acl = 0;
> -	error = ext4_mark_inode_dirty(handle, inode);
> -	if (error) {
> -		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -				 error);
> -		goto cleanup;
> +		if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +			     entry = EXT4_XATTR_NEXT(entry))
> +				if (entry->e_value_inum)
> +					ext4_xattr_inode_free_quota(inode,
> +					      le32_to_cpu(entry->e_value_size));
> +
> +		}
> +
> +		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +					 extra_credits);
> +		/*
> +		 * Update i_file_acl value in the same transaction that releases
> +		 * block.
> +		 */
> +		EXT4_I(inode)->i_file_acl = 0;
> +		error = ext4_mark_inode_dirty(handle, inode);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  	}
> +	error = 0;
>  cleanup:
>  	brelse(iloc.bh);
>  	brelse(bh);
> @@ -2208,17 +2811,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -	struct inode	*ea_inode;
> -	int		idx = 0;
> +	int idx;
>  
>  	if (ea_inode_array == NULL)
>  		return;
>  
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		ea_inode = ea_inode_array->inodes[idx];
> -		clear_nlink(ea_inode);
> -		iput(ea_inode);
> -	}
> +	for (idx = 0; idx < ea_inode_array->count; ++idx)
> +		iput(ea_inode_array->inodes[idx]);
>  	kfree(ea_inode_array);
>  }
>  
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -70,19 +70,6 @@ struct ext4_xattr_entry {
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
>  /*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
> -/*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
>  */
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +				  int *credits);
>  
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  				   struct ext4_xattr_inode_array **array,
>  				   int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 77a5b99d8f92..7dfdca822ccb 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use a hash of the data as the key and provide a value that may
> + * represent a block or inode number. That's why keys need not be unique
> + * (hashes of different data may be the same). However, the user-provided
> + * value always uniquely identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 27/28] ext4: xattr inode deduplication
@ 2017-06-02  5:41         ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-06-02  5:41 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 03:33:57PM -0700, Tahsin Erdogan wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a
> best-effort sharing scheme. When an xattr inode is read from disk (i.e.
> by a getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2:
>  - make dependency on crc32c dynamic
>  - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
>    they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
> 
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |   22 +-
>  fs/ext4/inode.c |    9 +-
>  fs/ext4/super.c |   25 +-
>  fs/ext4/xattr.c | 1075 +++++++++++++++++++++++++++++++++++++++++++------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 893 insertions(+), 269 deletions(-)
> 
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  	if (error)
>  		return error;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, acl_size);
> +	error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..7ceb1f81e4b8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>  	long s_es_nr_inode;
>  	struct ext4_es_stats s_es_stats;
>  	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;
>  	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>  
>  	/* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>  	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>  
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +	return IS_NOQUOTA(inode) &&
> +	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>  
>  /*
>   * This structure is stuffed into the struct file's private_data field
> @@ -2709,19 +2714,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
>  extern int ext4_register_li_request(struct super_block *sb,
>  				    ext4_group_t first_not_zeroed);
>  
> -static inline int ext4_has_group_desc_csum(struct super_block *sb)
> -{
> -	return ext4_has_feature_gdt_csum(sb) ||
> -	       EXT4_SB(sb)->s_chksum_driver != NULL;
> -}
> -
>  static inline int ext4_has_metadata_csum(struct super_block *sb)
>  {
>  	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
>  		     !EXT4_SB(sb)->s_chksum_driver);
>  
> -	return (EXT4_SB(sb)->s_chksum_driver != NULL);
> +	return ext4_has_feature_metadata_csum(sb) &&
> +	       (EXT4_SB(sb)->s_chksum_driver != NULL);
>  }
> +
> +static inline int ext4_has_group_desc_csum(struct super_block *sb)
> +{
> +	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
> +}
> +
>  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>  {
>  	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 4d6936f0d8a4..6f5872197d6c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	}
>  	brelse(iloc.bh);
>  	ext4_set_inode_flags(inode);
> -	if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +	if (ei->i_flags & EXT4_EA_INODE_FL) {
>  		ext4_xattr_inode_set_class(inode);
> +
> +		inode_lock(inode);
> +		inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(inode);
> +	}
> +
>  	unlock_new_inode(inode);
>  	return inode;
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..9fcd29e21dc7 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>  		invalidate_bdev(sbi->journal_bdev);
>  		ext4_blkdev_remove(sbi);
>  	}
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>  	if (res)
>  		return res;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, len);
> +	res = ext4_xattr_set_credits(inode, len, &credits);
> +	if (res)
> +		return res;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	}
>  
>  	/* Load the checksum driver */
> -	if (ext4_has_feature_metadata_csum(sb)) {
> +	if (ext4_has_feature_metadata_csum(sb) ||
> +	    ext4_has_feature_ea_inode(sb)) {
>  		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
>  		if (IS_ERR(sbi->s_chksum_driver)) {
>  			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
> @@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount_wq;
>  	}
>  
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}
> +
>  	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>  	    (blocksize != PAGE_SIZE)) {
>  		ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4313,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (EXT4_SB(sb)->rsv_conversion_wq)
>  		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 6acce1f689ab..4c394411bf6f 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -79,6 +79,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
>  			    struct mb_cache_entry **);
>  static void ext4_xattr_rehash(struct ext4_xattr_header *,
>  			      struct ext4_xattr_entry *);
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
>  
>  static const struct xattr_handler * const ext4_xattr_handler_map[] = {
>  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
> @@ -105,13 +106,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  	NULL
>  };
>  
> +#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
> +
>  #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
>  				inode->i_sb->s_fs_info)->s_mb_cache)
>  
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>  			struct inode *inode);
>  
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash);
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -329,14 +340,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  		goto error;
>  	}
>  
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}
> -
>  	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>  		ext4_error(parent->i_sb, "EA inode %lu does not have "
>  			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> @@ -351,6 +354,12 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>  
> +static u32
> +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
> +{
> +	return ext4_chksum(sbi, 0, buffer, size);

Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
that the metadata checksum value also has the fs uuid stamped into it.
That way if we ever encounter a piece of metadata we can positively
confirm that it belongs to this filesystem (vs. a piece of metadata that
came from a previous ext4 filesystem that had been written to the disk),
or discard it as being a ghost from an old iteration.  For xattrs I think we were
also baking in either the owning inode number (refcount == 1) or the
block number (refcount > 1) so that there's some redundant parent
pointer information encoded in the checksum too.

Even if you dismiss that, we usually follow the convention of
initializing the crc32c calculation with (~0U), not (0U), to strengthen
crc32c's ability to detect zeroes being injected at the start of the
stream.

--D

> +}
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +367,53 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>  		     size_t size)
>  {
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>  	struct inode *ea_inode;
> -	int ret;
> +	u32 hash, calc_hash;
> +	struct mb_cache_entry *ce;
> +	int err;
>  
> -	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (ret)
> -		return ret;
> +	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +	if (err) {
> +		ea_inode = NULL;
> +		goto out;
> +	}
>  
> -	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -	iput(ea_inode);
> +	if (i_size_read(ea_inode) != size) {
> +		ext4_warning_inode(ea_inode,
> +				   "ea_inode file size=%llu entry size=%zu",
> +				   i_size_read(ea_inode), size);
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
>  
> -	return ret;
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	if (!err) {
> +		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
> +			goto out;
> +
> +		/* Avoid hash calculation if already cached. */
> +		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
> +		if (ce) {
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			goto out;
> +		}
> +
> +		calc_hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), buffer,
> +						  size);
> +		if (hash != calc_hash) {
> +			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
> +					   "does not match calc_hash=%#x",
> +					   hash, calc_hash);
> +			goto out;
> +		}
> +
> +		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +				      ea_inode->i_ino, true /* reusable */);
> +	}
> +out:
> +	iput(ea_inode);
> +	return err;
>  }
>  
>  static int
> @@ -657,6 +702,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +				    inode->i_blkbits);
> +	size_t mask = ~(cluster_size - 1);
> +
> +	return (length + cluster_size - 1) & mask;
> +}
> +
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +	int err;
> +
> +	err = dquot_alloc_inode(inode);
> +	if (err)
> +		return err;
> +	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
> +	if (err)
> +		dquot_free_inode(inode);
> +	return err;
> +}
> +
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
> +	dquot_free_inode(inode);
> +}
> +
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	int credits;
> +	int blocks;
> +
> +	/*
> +	 * 1) Owner inode update
> +	 * 2) Ref count update on old xattr block
> +	 * 3) new xattr block
> +	 * 4) block bitmap update for new xattr block
> +	 * 5) group descriptor for new xattr block
> +	 */
> +	credits = 5;
> +
> +	/* We are done if ea_inode feature is not enabled. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +
> +	/* Data blocks. */
> +	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +	/* Indirection block. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Blocks themselves. */
> +	credits += blocks;
> +
> +	/* Dereference ea_inode holding old xattr value.
> +	 * Old ea_inode, inode map, block bitmap, group descriptor.
> +	 */
> +	credits += 4;
> +
> +	/* Data blocks for old ea_inode. */
> +	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +	/* Indirection block for old ea_inode. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	/* We may need to clone the existing xattr block in which case we need
> +	 * to increment ref counts for existing ea_inodes referenced by it.
> +	 */
> +	if (block_bh) {
> +		struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				/* Ref count update on ea_inode. */
> +				credits += 1;
> +	}
> +	return credits;
> +}
> +
>  int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  			      int credits, struct buffer_head *bh,
>  			      bool dirty, bool block_csum)
> @@ -706,12 +846,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +				       int ref_change)
> +{
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +	struct ext4_iloc iloc;
> +	s64 ref_return;
> +	u32 hash;
> +	int ret;
> +
> +	inode_lock(ea_inode);
> +
> +	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +	if (ret) {
> +		iloc.bh = NULL;
> +		goto out;
> +	}
> +
> +	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
> +					&hash);
> +	if (ret)
> +		goto out;
> +
> +	if (ref_change > 0) {
> +		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 1) {
> +			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			set_nlink(ea_inode, 1);
> +			ext4_orphan_del(handle, ea_inode);
> +
> +			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +					      ea_inode->i_ino,
> +					      true /* reusable */);
> +		}
> +	} else {
> +		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 0) {
> +			WARN_ONCE(ea_inode->i_nlink != 1,
> +				  "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			clear_nlink(ea_inode);
> +			ext4_orphan_add(handle, ea_inode);
> +
> +			mb_cache_entry_delete(ea_inode_cache, hash,
> +					      ea_inode->i_ino);
> +		}
> +	}
> +
> +	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +	iloc.bh = NULL;
> +	if (ret)
> +		ext4_warning_inode(ea_inode,
> +				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +	brelse(iloc.bh);
> +	inode_unlock(ea_inode);
> +	return ret;
> +}
> +
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
> +}
> +
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
> +}
> +
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +					struct ext4_xattr_entry *first)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	struct ext4_xattr_entry *failed_entry;
> +	unsigned int ea_ino;
> +	int err, saved_err;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			goto cleanup;
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "inc ref error %d", err);
> +			iput(ea_inode);
> +			goto cleanup;
> +		}
> +		iput(ea_inode);
> +	}
> +	return 0;
> +
> +cleanup:
> +	saved_err = err;
> +	failed_entry = entry;
> +
> +	for (entry = first; entry != failed_entry;
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err) {
> +			ext4_warning(parent->i_sb,
> +				     "cleanup ea_ino %u iget error %d", ea_ino,
> +				     err);
> +			continue;
> +		}
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err)
> +			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +					   err);
> +		iput(ea_inode);
> +	}
> +	return saved_err;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -			    struct buffer_head *bh,
> -			    struct ext4_xattr_entry *first, bool block_csum,
> -			    struct ext4_xattr_inode_array **ea_inode_array,
> -			    int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +			     struct buffer_head *bh,
> +			     struct ext4_xattr_entry *first, bool block_csum,
> +			     struct ext4_xattr_inode_array **ea_inode_array,
> +			     int extra_credits, bool skip_quota)
>  {
>  	struct inode *ea_inode;
>  	struct ext4_xattr_entry *entry;
> @@ -748,10 +1015,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>  			continue;
>  		}
>  
> -		inode_lock(ea_inode);
> -		clear_nlink(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		if (!skip_quota)
> +			ext4_xattr_inode_free_quota(parent,
> +					      le32_to_cpu(entry->e_value_size));
>  
>  		/*
>  		 * Forget about ea_inode within the same transaction that decrements the ref
> @@ -784,7 +1057,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -			 struct buffer_head *bh)
> +			 struct buffer_head *bh,
> +			 struct ext4_xattr_inode_array **ea_inode_array,
> +			 int extra_credits)
>  {
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>  	u32 hash, ref;
> @@ -807,6 +1082,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
> +
> +		if (ext4_has_feature_ea_inode(inode->i_sb))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +						     BFIRST(bh),
> +						     true /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     true /* skip_quota */);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
>  				 EXT4_FREE_BLOCKS_METADATA |
>  				 EXT4_FREE_BLOCKS_FORGET);
> @@ -947,7 +1230,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -					     struct inode *inode)
> +					     struct inode *inode, u32 hash)
>  {
>  	struct inode *ea_inode = NULL;
>  	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -965,67 +1248,119 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>  		ea_inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(ea_inode);
>  		ext4_xattr_inode_set_class(ea_inode);
> -		ea_inode->i_generation = inode->i_generation;
> -		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -		/*
> -		 * A back-pointer from EA inode to parent inode will be useful
> -		 * for e2fsck.
> -		 */
> -		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>  		unlock_new_inode(ea_inode);
>  		err = ext4_inode_attach_jinode(ea_inode);
> +		if (!err)
> +			err = ext4_xattr_inode_init(handle, ea_inode, hash);
>  		if (err) {
>  			iput(ea_inode);
>  			return ERR_PTR(err);
>  		}
> +
> +		/*
> +		 * Xattr inodes are shared therefore quota charging is performed
> +		 * at a higher level.
> +		 */
> +		dquot_free_inode(ea_inode);
> +		dquot_drop(ea_inode);
> +		inode_lock(ea_inode);
> +		ea_inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(ea_inode);
>  	}
>  
>  	return ea_inode;
>  }
>  
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
>  {
> -	struct inode *ea_inode = NULL;
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
>  	int err;
>  
> -	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (err)
> -		return err;
> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			ea_inode = NULL;
> +			goto next;
> +		}
>  
> -	clear_nlink(ea_inode);
> -	iput(ea_inode);
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
>  
> -	return 0;
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +	next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
>  }
>  
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
>  {
>  	struct inode *ea_inode;
> +	u32 hash;
>  	int err;
>  
> +	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
>  	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>  	if (IS_ERR(ea_inode))
>  		return PTR_ERR(ea_inode);
>  
>  	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
>  
> -	iput(ea_inode);
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);
>  
> -	return err;
> +	*ret_inode = ea_inode;
> +	return 0;
>  }
>  
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1033,11 +1368,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
> +	struct ext4_xattr_entry *here = s->here;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>  	int in_inode = i->in_inode;
> -	int rc;
> +	struct inode *old_ea_inode = NULL;
> +	struct inode *new_ea_inode = NULL;
> +	int ret;
>  
> -	/* Compute min_offs and last. */
> +	/*
> +	 * Optimization for the simple case when old and new values have the
> +	 * same padded sizes. Not applicable if the existing value is stored in
> +	 * an external inode.
> +	 */
> +	if (i->value && !s->not_found && !here->e_value_inum &&
> +	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
> +	    EXT4_XATTR_SIZE(i->value_len)) {
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(i->value_len);
> +
> +		here->e_value_size = cpu_to_le32(i->value_len);
> +		if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +			memset(val, 0, size);
> +		} else {
> +			memcpy(val, i->value, i->value_len);
> +			/* Clear padding bytes. */
> +			memset(val + i->value_len, 0, size - i->value_len);
> +		}
> +		return 0;
> +	}
> +
> +	/* Find out min_offs and last to calculate the free space. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
>  		if (!last->e_value_inum && last->e_value_size) {
> @@ -1048,120 +1409,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (!in_inode &&
> -		    !s->here->e_value_inum && s->here->e_value_size) {
> -			size_t size = le32_to_cpu(s->here->e_value_size);
> +		if (!here->e_value_inum && here->e_value_size) {
> +			size_t size = le32_to_cpu(here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
>  
> -		if (in_inode)
> -			value_len = 0;
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +	}
>  
> -		if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -			return -ENOSPC;
> +	/*
> +	 * Getting access to old and new ea inodes is subject to failures.
> +	 * Finish that work before doing any modifications to the xattr data.
> +	 */
> +	if (!s->not_found && here->e_value_inum) {
> +		ret = ext4_xattr_inode_iget(inode,
> +		 			    le32_to_cpu(here->e_value_inum),
> +		 			    &old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
>  	}
> +	if (i->value && in_inode) {
> +		WARN_ON_ONCE(!i->value_len);
>  
> -	if (i->value && s->not_found) {
> -		/* Insert the new name. */
> -		size_t size = EXT4_XATTR_LEN(name_len);
> -		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -		memmove((void *)s->here + size, s->here, rest);
> -		memset(s->here, 0, size);
> -		s->here->e_name_index = i->name_index;
> -		s->here->e_name_len = name_len;
> -		memcpy(s->here->e_name, i->name, name_len);
> -	} else {
> -		if (!s->here->e_value_inum && s->here->e_value_size &&
> -		    s->here->e_value_offs > 0) {
> -			void *first_val = s->base + min_offs;
> -			size_t offs = le16_to_cpu(s->here->e_value_offs);
> -			void *val = s->base + offs;
> -			size_t size = EXT4_XATTR_SIZE(
> -				le32_to_cpu(s->here->e_value_size));
> -
> -			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -				/* The old and the new value have the same
> -				   size. Just replace. */
> -				s->here->e_value_size =
> -					cpu_to_le32(i->value_len);
> -				if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -					memset(val, 0, size);
> -				} else {
> -					/* Clear pad bytes first. */
> -					memset(val + size - EXT4_XATTR_PAD, 0,
> -					       EXT4_XATTR_PAD);
> -					memcpy(val, i->value, i->value_len);
> -				}
> -				return 0;
> -			}
> +		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +		if (ret)
> +			goto out;
>  
> -			/* Remove the old value. */
> -			memmove(first_val + size, first_val, val - first_val);
> -			memset(first_val, 0, size);
> -			s->here->e_value_size = 0;
> -			s->here->e_value_offs = 0;
> -			min_offs += size;
> -
> -			/* Adjust all value offsets. */
> -			last = s->first;
> -			while (!IS_LAST_ENTRY(last)) {
> -				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (!last->e_value_inum &&
> -				    last->e_value_size && o < offs)
> -					last->e_value_offs =
> -						cpu_to_le16(o + size);
> -				last = EXT4_XATTR_NEXT(last);
> -			}
> +		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +						     i->value_len,
> +						     &new_ea_inode);
> +		if (ret) {
> +			new_ea_inode = NULL;
> +			ext4_xattr_inode_free_quota(inode, i->value_len);
> +			goto out;
>  		}
> -		if (s->here->e_value_inum) {
> -			ext4_xattr_inode_unlink(inode,
> -					    le32_to_cpu(s->here->e_value_inum));
> -			s->here->e_value_inum = 0;
> +	}
> +
> +	if (old_ea_inode) {
> +		/* We are ready to release ref count on the old_ea_inode. */
> +		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +		if (ret) {
> +			/* Release newly required ref count on new_ea_inode. */
> +			if (new_ea_inode) {
> +				int err;
> +
> +				err = ext4_xattr_inode_dec_ref(handle,
> +							       new_ea_inode);
> +				if (err)
> +					ext4_warning_inode(new_ea_inode,
> +						  "dec ref new_ea_inode err=%d",
> +						  err);
> +				ext4_xattr_inode_free_quota(inode,
> +							    i->value_len);
> +			}
> +			goto out;
>  		}
> -		if (!i->value) {
> -			/* Remove the old name. */
> -			size_t size = EXT4_XATTR_LEN(name_len);
> -			last = ENTRY((void *)last - size);
> -			memmove(s->here, (void *)s->here + size,
> -				(void *)last - (void *)s->here + sizeof(__u32));
> -			memset(last, 0, size);
> +
> +		ext4_xattr_inode_free_quota(inode,
> +					    le32_to_cpu(here->e_value_size));
> +	}
> +
> +	/* No failures allowed past this point. */
> +
> +	if (!s->not_found && here->e_value_offs) {
> +		/* Remove the old value. */
> +		void *first_val = s->base + min_offs;
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(
> +			le32_to_cpu(here->e_value_size));
> +
> +		memmove(first_val + size, first_val, val - first_val);
> +		memset(first_val, 0, size);
> +		min_offs += size;
> +
> +		/* Adjust all value offsets. */
> +		last = s->first;
> +		while (!IS_LAST_ENTRY(last)) {
> +			size_t o = le16_to_cpu(last->e_value_offs);
> +			if (!last->e_value_inum &&
> +			    last->e_value_size && o < offs)
> +				last->e_value_offs =
> +					cpu_to_le16(o + size);
> +			last = EXT4_XATTR_NEXT(last);
>  		}
>  	}
>  
> +	if (!s->not_found && !i->value) {
> +		/* Remove old name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		last = ENTRY((void *)last - size);
> +		memmove(here, (void *)here + size,
> +			(void *)last - (void *)here + sizeof(__u32));
> +		memset(last, 0, size);
> +	} else if (s->not_found && i->value) {
> +		/* Insert new name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +		memmove((void *)here + size, here, rest);
> +		memset(here, 0, size);
> +		here->e_name_index = i->name_index;
> +		here->e_name_len = name_len;
> +		memcpy(here->e_name, i->name, name_len);
> +	} else {
> +		WARN_ON_ONCE(s->not_found || !i->value);
> +		/* This is an update, reset value info. */
> +		here->e_value_inum = 0;
> +		here->e_value_offs = 0;
> +		here->e_value_size = 0;
> +	}
> +
>  	if (i->value) {
> -		/* Insert the new value. */
> +		/* Insert new value. */
>  		if (in_inode) {
> -			unsigned long ea_ino =
> -				le32_to_cpu(s->here->e_value_inum);
> -			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -						  i->value, i->value_len);
> -			if (rc)
> -				goto out;
> -			s->here->e_value_inum = cpu_to_le32(ea_ino);
> -			s->here->e_value_offs = 0;
> +			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>  		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
> -			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -			s->here->e_value_inum = 0;
> +			here->e_value_offs = cpu_to_le16(min_offs - size);
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> -				/* Clear the pad bytes first. */
> -				memset(val + size - EXT4_XATTR_PAD, 0,
> -				       EXT4_XATTR_PAD);
>  				memcpy(val, i->value, i->value_len);
> +				/* Clear padding bytes. */
> +				memset(val + i->value_len, 0,
> +				       size - i->value_len);
>  			}
>  		}
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> +		here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -
> +	ret = 0;
>  out:
> -	return rc;
> +	iput(old_ea_inode);
> +	iput(new_ea_inode);
> +	return ret;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -1223,6 +1613,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  	struct mb_cache_entry *ce = NULL;
>  	int error = 0;
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +	struct inode *ea_inode = NULL;
> +	size_t old_ea_inode_size = 0;
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> @@ -1277,6 +1669,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			header(s->base)->h_refcount = cpu_to_le32(1);
>  			s->here = ENTRY(s->base + offset);
>  			s->end = s->base + bs->bh->b_size;
> +
> +			/*
> +			 * If existing entry points to an xattr inode, we need
> +			 * to prevent ext4_xattr_set_entry() from decrementing
> +			 * ref count on it because the reference belongs to the
> +			 * original block. In this case, make the entry look
> +			 * like it has an empty value.
> +			 */
> +			if (!s->not_found && s->here->e_value_inum) {
> +				/*
> +				 * Defer quota free call for previous inode
> +				 * until success is guaranteed.
> +				 */
> +				old_ea_inode_size = le32_to_cpu(
> +							s->here->e_value_size);
> +				s->here->e_value_inum = 0;
> +				s->here->e_value_size = 0;
> +			}
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> @@ -1298,6 +1708,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		goto bad_block;
>  	if (error)
>  		goto cleanup;
> +
> +	if (i->value && s->here->e_value_inum) {
> +		unsigned int ea_ino;
> +
> +		/*
> +		 * A ref count on ea_inode has been taken as part of the call to
> +		 * ext4_xattr_set_entry() above. We would like to drop this
> +		 * extra ref but we have to wait until the xattr block is
> +		 * initialized and has its own ref count on the ea_inode.
> +		 */
> +		ea_ino = le32_to_cpu(s->here->e_value_inum);
> +		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +		if (error) {
> +			ea_inode = NULL;
> +			goto cleanup;
> +		}
> +	}
> +
>  	if (!IS_LAST_ENTRY(s->first))
>  		ext4_xattr_rehash(header(s->base), s->here);
>  
> @@ -1408,6 +1836,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  						 EXT4_FREE_BLOCKS_METADATA);
>  				goto cleanup;
>  			}
> +			error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +						      ENTRY(header(s->base)+1));
> +			if (error)
> +				goto getblk_failed;
> +			if (ea_inode) {
> +				/* Drop the extra ref on ea_inode. */
> +				error = ext4_xattr_inode_dec_ref(handle,
> +								 ea_inode);
> +				if (error)
> +					ext4_warning_inode(ea_inode,
> +							   "dec ref error=%d",
> +							   error);
> +				iput(ea_inode);
> +				ea_inode = NULL;
> +			}
> +
>  			lock_buffer(new_bh);
>  			error = ext4_journal_get_create_access(handle, new_bh);
>  			if (error) {
> @@ -1427,15 +1871,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		}
>  	}
>  
> +	if (old_ea_inode_size)
> +		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>  	/* Update the inode. */
>  	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>  
>  	/* Drop the previous xattr block. */
> -	if (bs->bh && bs->bh != new_bh)
> -		ext4_xattr_release_block(handle, inode, bs->bh);
> +	if (bs->bh && bs->bh != new_bh) {
> +		struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +		ext4_xattr_release_block(handle, inode, bs->bh,
> +					 &ea_inode_array,
> +					 0 /* extra_credits */);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +	}
>  	error = 0;
>  
>  cleanup:
> +	if (ea_inode) {
> +		int error2;
> +		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (error2)
> +			ext4_warning_inode(ea_inode, "dec ref error=%d",
> +					   error2);
> +
> +		/* If there was an error, revert the quota charge. */
> +		if (error)
> +			ext4_xattr_inode_free_quota(inode,
> +						    i_size_read(ea_inode));
> +		iput(ea_inode);
> +	}
>  	if (ce)
>  		mb_cache_entry_put(ext4_mb_cache, ce);
>  	brelse(new_bh);
> @@ -1546,6 +2011,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +struct ext4_xattr_ea_info {
> +	__le64 ref_count;	/* number of xattr entry references */
> +	__le32 hash;		/* crc32c hash of xattr data */
> +	__le32 reserved;	/* reserved, must be 0 */
> +};
> +
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash)
> +{
> +	struct ext4_xattr_ea_info ea_info = {
> +		.ref_count = cpu_to_le64(1),
> +		.hash = cpu_to_le32(hash),
> +		.reserved = 0,
> +	};
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
> +}
> +
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash)
> +{
> +	struct ext4_xattr_ea_info ea_info;
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	memcpy(&ea_info,
> +	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
> +	       sizeof(ea_info));
> +
> +	if (hash)
> +		*hash = le32_to_cpu(ea_info.hash);
> +
> +	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
> +	ea_info.ref_count = cpu_to_le64(*ref_return);
> +
> +	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
> +}
> +
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
> +{
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	struct ext4_xattr_ea_info *ea_info;
> +	void *ptr;
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
> +	ea_info = (struct ext4_xattr_ea_info *)ptr;
> +
> +	if (WARN_ON(ea_info->reserved != 0))
> +		return -EFSCORRUPTED;
> +
> +	*hash = le32_to_cpu(ea_info->hash);
> +	return 0;
> +}
> +
>  static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  				 struct ext4_xattr_info *i)
>  {
> @@ -1560,6 +2136,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  	return !memcmp(value, i->value, i->value_len);
>  }
>  
> +struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +	struct buffer_head *bh;
> +	int error;
> +
> +	if (!EXT4_I(inode)->i_file_acl)
> +		return NULL;
> +	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +	if (!bh)
> +		return ERR_PTR(-EIO);
> +	error = ext4_xattr_check_block(inode, bh);
> +	if (error)
> +		return ERR_PTR(error);
> +	return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1602,9 +2194,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  
>  	/* Check journal credits under write lock. */
>  	if (ext4_handle_valid(handle)) {
> +		struct buffer_head *bh;
>  		int credits;
>  
> -		credits = ext4_xattr_set_credits(inode, value_len);
> +		bh = ext4_xattr_get_block(inode);
> +		if (IS_ERR(bh)) {
> +			error = PTR_ERR(bh);
> +			goto cleanup;
> +		}
> +
> +		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +
>  		if (!ext4_handle_has_enough_credits(handle, credits)) {
>  			error = -ENOSPC;
>  			goto cleanup;
> @@ -1640,6 +2241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (flags & XATTR_CREATE)
>  			goto cleanup;
>  	}
> +
>  	if (!value) {
>  		if (!is.s.not_found)
>  			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1708,34 +2310,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	return error;
>  }
>  
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -	struct super_block *sb = inode->i_sb;
> -	int credits;
> -
> -	if (!EXT4_SB(sb)->s_journal)
> -		return 0;
> -
> -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +	struct buffer_head *bh;
> +	int err;
>  
> -	/*
> -	 * In case of inline data, we may push out the data to a block,
> -	 * so we need to reserve credits for this eventuality
> -	 */
> -	if (ext4_has_inline_data(inode))
> -	        credits += ext4_writepage_trans_blocks(inode) + 1;
> +	*credits = 0;
>  
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -					sb->s_blocksize_bits;
> +	if (!EXT4_SB(inode->i_sb)->s_journal)
> +		return 0;
>  
> -		/* For new inode */
> -		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +	down_read(&EXT4_I(inode)->xattr_sem);
>  
> -		/* For data blocks of EA inode */
> -		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	bh = ext4_xattr_get_block(inode);
> +	if (IS_ERR(bh)) {
> +		err = PTR_ERR(bh);
> +	} else {
> +		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +		err = 0;
>  	}
> -	return credits;
> +
> +	up_read(&EXT4_I(inode)->xattr_sem);
> +	return err;
>  }
>  
>  /*
> @@ -1760,7 +2357,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  		return error;
>  
>  retry:
> -	credits = ext4_xattr_set_credits(inode, value_len);
> +	error = ext4_xattr_set_credits(inode, value_len, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2066,10 +2666,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  	return error;
>  }
>  
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2114,21 +2714,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array,
> -			int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
>  {
>  	struct buffer_head *bh = NULL;
>  	struct ext4_xattr_ibody_header *header;
> -	struct ext4_inode *raw_inode;
>  	struct ext4_iloc iloc = { .bh = NULL };
> +	struct ext4_xattr_entry *entry;
>  	int error;
>  
>  	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2140,66 +2738,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  		goto cleanup;
>  	}
>  
> -	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -		goto delete_external_ea;
> -
> -	error = ext4_get_inode_loc(inode, &iloc);
> -	if (error)
> -		goto cleanup;
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>  
> -	error = ext4_journal_get_write_access(handle, iloc.bh);
> -	if (error)
> -		goto cleanup;
> +		error = ext4_get_inode_loc(inode, &iloc);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +			goto cleanup;
> +		}
>  
> -	raw_inode = ext4_raw_inode(&iloc);
> -	header = IHDR(inode, raw_inode);
> -	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -				    false /* block_csum */, ea_inode_array,
> -				    extra_credits);
> +		error = ext4_journal_get_write_access(handle, iloc.bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "write access (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  
> -delete_external_ea:
> -	if (!EXT4_I(inode)->i_file_acl) {
> -		error = 0;
> -		goto cleanup;
> -	}
> -	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -	if (!bh) {
> -		EXT4_ERROR_INODE(inode, "block %llu read error",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EIO;
> -		goto cleanup;
> -	}
> -	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -		EXT4_ERROR_INODE(inode, "bad block %llu",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EFSCORRUPTED;
> -		goto cleanup;
> +		header = IHDR(inode, ext4_raw_inode(&iloc));
> +		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +						     IFIRST(header),
> +						     false /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     false /* skip_quota */);
>  	}
>  
> -	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -		error = ext4_journal_get_write_access(handle, bh);
> -		if (error) {
> -			EXT4_ERROR_INODE(inode, "write access %llu",
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			EXT4_ERROR_INODE(inode, "block %llu read error",
>  					 EXT4_I(inode)->i_file_acl);
> +			error = -EIO;
> +			goto cleanup;
> +		}
> +		error = ext4_xattr_check_block(inode, bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +					 EXT4_I(inode)->i_file_acl, error);
>  			goto cleanup;
>  		}
> -		ext4_xattr_inode_remove_all(handle, inode, bh,
> -					    BFIRST(bh),
> -					    true /* block_csum */,
> -					    ea_inode_array,
> -					    extra_credits);
> -	}
>  
> -	ext4_xattr_release_block(handle, inode, bh);
> -	/* Update i_file_acl within the same transaction that releases block. */
> -	EXT4_I(inode)->i_file_acl = 0;
> -	error = ext4_mark_inode_dirty(handle, inode);
> -	if (error) {
> -		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -				 error);
> -		goto cleanup;
> +		if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +			     entry = EXT4_XATTR_NEXT(entry))
> +				if (entry->e_value_inum)
> +					ext4_xattr_inode_free_quota(inode,
> +					      le32_to_cpu(entry->e_value_size));
> +
> +		}
> +
> +		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +					 extra_credits);
> +		/*
> +		 * Update i_file_acl value in the same transaction that releases
> +		 * block.
> +		 */
> +		EXT4_I(inode)->i_file_acl = 0;
> +		error = ext4_mark_inode_dirty(handle, inode);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  	}
> +	error = 0;
>  cleanup:
>  	brelse(iloc.bh);
>  	brelse(bh);
> @@ -2208,17 +2811,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -	struct inode	*ea_inode;
> -	int		idx = 0;
> +	int idx;
>  
>  	if (ea_inode_array == NULL)
>  		return;
>  
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		ea_inode = ea_inode_array->inodes[idx];
> -		clear_nlink(ea_inode);
> -		iput(ea_inode);
> -	}
> +	for (idx = 0; idx < ea_inode_array->count; ++idx)
> +		iput(ea_inode_array->inodes[idx]);
>  	kfree(ea_inode_array);
>  }
>  
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -70,19 +70,6 @@ struct ext4_xattr_entry {
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
>  /*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
> -/*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
>  */
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +				  int *credits);
>  
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  				   struct ext4_xattr_inode_array **array,
>  				   int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 77a5b99d8f92..7dfdca822ccb 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use hash of data as a key and provide a value that may represent a
> + * block or inode number. That's why keys need not be unique (hash of different
> + * data may be the same). However user provided value always uniquely
> + * identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH v2 27/28] ext4: xattr inode deduplication
@ 2017-06-02  5:41         ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-06-02  5:41 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Wed, May 31, 2017 at 03:33:57PM -0700, Tahsin Erdogan wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes, each one holding a
> single value. Once written, the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a best
> effort sharing scheme. When an xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2:
>  - make dependency on crc32c dynamic
>  - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
>    they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
> 
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |   22 +-
>  fs/ext4/inode.c |    9 +-
>  fs/ext4/super.c |   25 +-
>  fs/ext4/xattr.c | 1075 +++++++++++++++++++++++++++++++++++++++++++------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 893 insertions(+), 269 deletions(-)
> 
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  	if (error)
>  		return error;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, acl_size);
> +	error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..7ceb1f81e4b8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>  	long s_es_nr_inode;
>  	struct ext4_es_stats s_es_stats;
>  	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;
>  	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>  
>  	/* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>  	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>  
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +	return IS_NOQUOTA(inode) &&
> +	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>  
>  /*
>   * This structure is stuffed into the struct file's private_data field
> @@ -2709,19 +2714,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
>  extern int ext4_register_li_request(struct super_block *sb,
>  				    ext4_group_t first_not_zeroed);
>  
> -static inline int ext4_has_group_desc_csum(struct super_block *sb)
> -{
> -	return ext4_has_feature_gdt_csum(sb) ||
> -	       EXT4_SB(sb)->s_chksum_driver != NULL;
> -}
> -
>  static inline int ext4_has_metadata_csum(struct super_block *sb)
>  {
>  	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
>  		     !EXT4_SB(sb)->s_chksum_driver);
>  
> -	return (EXT4_SB(sb)->s_chksum_driver != NULL);
> +	return ext4_has_feature_metadata_csum(sb) &&
> +	       (EXT4_SB(sb)->s_chksum_driver != NULL);
>  }
> +
> +static inline int ext4_has_group_desc_csum(struct super_block *sb)
> +{
> +	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
> +}
> +
>  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>  {
>  	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 4d6936f0d8a4..6f5872197d6c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	}
>  	brelse(iloc.bh);
>  	ext4_set_inode_flags(inode);
> -	if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +	if (ei->i_flags & EXT4_EA_INODE_FL) {
>  		ext4_xattr_inode_set_class(inode);
> +
> +		inode_lock(inode);
> +		inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(inode);
> +	}
> +
>  	unlock_new_inode(inode);
>  	return inode;
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..9fcd29e21dc7 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>  		invalidate_bdev(sbi->journal_bdev);
>  		ext4_blkdev_remove(sbi);
>  	}
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>  	if (res)
>  		return res;
>  retry:
> -	credits = ext4_xattr_set_credits(inode, len);
> +	res = ext4_xattr_set_credits(inode, len, &credits);
> +	if (res)
> +		return res;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	}
>  
>  	/* Load the checksum driver */
> -	if (ext4_has_feature_metadata_csum(sb)) {
> +	if (ext4_has_feature_metadata_csum(sb) ||
> +	    ext4_has_feature_ea_inode(sb)) {
>  		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
>  		if (IS_ERR(sbi->s_chksum_driver)) {
>  			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
> @@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount_wq;
>  	}
>  
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}
> +
>  	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>  	    (blocksize != PAGE_SIZE)) {
>  		ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4313,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (EXT4_SB(sb)->rsv_conversion_wq)
>  		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +	if (sbi->s_ea_inode_cache) {
> +		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +		sbi->s_ea_inode_cache = NULL;
> +	}
>  	if (sbi->s_mb_cache) {
>  		ext4_xattr_destroy_cache(sbi->s_mb_cache);
>  		sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 6acce1f689ab..4c394411bf6f 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -79,6 +79,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
>  			    struct mb_cache_entry **);
>  static void ext4_xattr_rehash(struct ext4_xattr_header *,
>  			      struct ext4_xattr_entry *);
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
>  
>  static const struct xattr_handler * const ext4_xattr_handler_map[] = {
>  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
> @@ -105,13 +106,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  	NULL
>  };
>  
> +#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
> +
>  #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
>  				inode->i_sb->s_fs_info)->s_mb_cache)
>  
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>  			struct inode *inode);
>  
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash);
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -329,14 +340,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  		goto error;
>  	}
>  
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}
> -
>  	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>  		ext4_error(parent->i_sb, "EA inode %lu does not have "
>  			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> @@ -351,6 +354,12 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>  
> +static u32
> +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
> +{
> +	return ext4_chksum(sbi, 0, buffer, size);

Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
that the metadata checksum value also has the fs uuid stamped into it.
That way if we ever encounter a piece of metadata we can positively
confirm that it belongs to this filesystem (vs. a piece of metadata that
came from a previous ext4 that had been written to the disk) or discard
it as being a ghost from an old iteration.  For xattrs I think we were
also baking in either the owning inode number (refcount == 1) or the
block number (refcount > 1) so that there's some redundant parent
pointer information encoded in the checksum too.

Even if you dismiss that, we usually follow the convention of
initializing the crc32c calculation with (~0U), not (0U), to strengthen
crc32c's ability to detect zeroes being injected at the start of the
stream.

--D

> +}
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +367,53 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>  		     size_t size)
>  {
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>  	struct inode *ea_inode;
> -	int ret;
> +	u32 hash, calc_hash;
> +	struct mb_cache_entry *ce;
> +	int err;
>  
> -	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (ret)
> -		return ret;
> +	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +	if (err) {
> +		ea_inode = NULL;
> +		goto out;
> +	}
>  
> -	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -	iput(ea_inode);
> +	if (i_size_read(ea_inode) != size) {
> +		ext4_warning_inode(ea_inode,
> +				   "ea_inode file size=%llu entry size=%zu",
> +				   i_size_read(ea_inode), size);
> +		err = -EFSCORRUPTED;
> +		goto out;
> +	}
>  
> -	return ret;
> +	err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +	if (!err) {
> +		if (ext4_xattr_read_ea_hash(ea_inode, &hash))
> +			goto out;
> +
> +		/* Avoid hash calculation if already cached. */
> +		ce = mb_cache_entry_get(ea_inode_cache, hash, ea_inode->i_ino);
> +		if (ce) {
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			goto out;
> +		}
> +
> +		calc_hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), buffer,
> +						  size);
> +		if (hash != calc_hash) {
> +			ext4_warning_inode(ea_inode, "EA inode saved hash=%#x "
> +					   "does not match calc_hash=%#x",
> +					   hash, calc_hash);
> +			goto out;
> +		}
> +
> +		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +				      ea_inode->i_ino, true /* reusable */);
> +	}
> +out:
> +	iput(ea_inode);
> +	return err;
>  }
>  
>  static int
> @@ -657,6 +702,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +				    inode->i_blkbits);
> +	size_t mask = ~(cluster_size - 1);
> +
> +	return (length + cluster_size - 1) & mask;
> +}
> +
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +	int err;
> +
> +	err = dquot_alloc_inode(inode);
> +	if (err)
> +		return err;
> +	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
> +	if (err)
> +		dquot_free_inode(inode);
> +	return err;
> +}
> +
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
> +	dquot_free_inode(inode);
> +}
> +
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	int credits;
> +	int blocks;
> +
> +	/*
> +	 * 1) Owner inode update
> +	 * 2) Ref count update on old xattr block
> +	 * 3) new xattr block
> +	 * 4) block bitmap update for new xattr block
> +	 * 5) group descriptor for new xattr block
> +	 */
> +	credits = 5;
> +
> +	/* We are done if ea_inode feature is not enabled. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +
> +	/* Data blocks. */
> +	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +	/* Indirection block. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Blocks themselves. */
> +	credits += blocks;
> +
> +	/* Dereference ea_inode holding old xattr value.
> +	 * Old ea_inode, inode map, block bitmap, group descriptor.
> +	 */
> +	credits += 4;
> +
> +	/* Data blocks for old ea_inode. */
> +	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +	/* Indirection block for old ea_inode. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	/* We may need to clone the existing xattr block in which case we need
> +	 * to increment ref counts for existing ea_inodes referenced by it.
> +	 */
> +	if (block_bh) {
> +		struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				/* Ref count update on ea_inode. */
> +				credits += 1;
> +	}
> +	return credits;
> +}
> +
>  int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  			      int credits, struct buffer_head *bh,
>  			      bool dirty, bool block_csum)
> @@ -706,12 +846,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +				       int ref_change)
> +{
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +	struct ext4_iloc iloc;
> +	s64 ref_return;
> +	u32 hash;
> +	int ret;
> +
> +	inode_lock(ea_inode);
> +
> +	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +	if (ret) {
> +		iloc.bh = NULL;
> +		goto out;
> +	}
> +
> +	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
> +					&hash);
> +	if (ret)
> +		goto out;
> +
> +	if (ref_change > 0) {
> +		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 1) {
> +			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			set_nlink(ea_inode, 1);
> +			ext4_orphan_del(handle, ea_inode);
> +
> +			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +					      ea_inode->i_ino,
> +					      true /* reusable */);
> +		}
> +	} else {
> +		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
> +			  ea_inode->i_ino, ref_return);
> +
> +		if (ref_return == 0) {
> +			WARN_ONCE(ea_inode->i_nlink != 1,
> +				  "EA inode %lu i_nlink=%u",
> +				  ea_inode->i_ino, ea_inode->i_nlink);
> +
> +			clear_nlink(ea_inode);
> +			ext4_orphan_add(handle, ea_inode);
> +
> +			mb_cache_entry_delete(ea_inode_cache, hash,
> +					      ea_inode->i_ino);
> +		}
> +	}
> +
> +	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +	iloc.bh = NULL;
> +	if (ret)
> +		ext4_warning_inode(ea_inode,
> +				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +	brelse(iloc.bh);
> +	inode_unlock(ea_inode);
> +	return ret;
> +}
> +
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
> +}
> +
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
> +}
> +
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +					struct ext4_xattr_entry *first)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	struct ext4_xattr_entry *failed_entry;
> +	unsigned int ea_ino;
> +	int err, saved_err;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			goto cleanup;
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "inc ref error %d", err);
> +			iput(ea_inode);
> +			goto cleanup;
> +		}
> +		iput(ea_inode);
> +	}
> +	return 0;
> +
> +cleanup:
> +	saved_err = err;
> +	failed_entry = entry;
> +
> +	for (entry = first; entry != failed_entry;
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err) {
> +			ext4_warning(parent->i_sb,
> +				     "cleanup ea_ino %u iget error %d", ea_ino,
> +				     err);
> +			continue;
> +		}
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err)
> +			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +					   err);
> +		iput(ea_inode);
> +	}
> +	return saved_err;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -			    struct buffer_head *bh,
> -			    struct ext4_xattr_entry *first, bool block_csum,
> -			    struct ext4_xattr_inode_array **ea_inode_array,
> -			    int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +			     struct buffer_head *bh,
> +			     struct ext4_xattr_entry *first, bool block_csum,
> +			     struct ext4_xattr_inode_array **ea_inode_array,
> +			     int extra_credits, bool skip_quota)
>  {
>  	struct inode *ea_inode;
>  	struct ext4_xattr_entry *entry;
> @@ -748,10 +1015,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>  			continue;
>  		}
>  
> -		inode_lock(ea_inode);
> -		clear_nlink(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> +		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		if (!skip_quota)
> +			ext4_xattr_inode_free_quota(parent,
> +					      le32_to_cpu(entry->e_value_size));
>  
>  		/*
>  		 * Forget about ea_inode within the same transaction that decrements the ref
> @@ -784,7 +1057,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -			 struct buffer_head *bh)
> +			 struct buffer_head *bh,
> +			 struct ext4_xattr_inode_array **ea_inode_array,
> +			 int extra_credits)
>  {
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>  	u32 hash, ref;
> @@ -807,6 +1082,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
> +
> +		if (ext4_has_feature_ea_inode(inode->i_sb))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +						     BFIRST(bh),
> +						     true /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     true /* skip_quota */);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
>  				 EXT4_FREE_BLOCKS_METADATA |
>  				 EXT4_FREE_BLOCKS_FORGET);
> @@ -947,7 +1230,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -					     struct inode *inode)
> +					     struct inode *inode, u32 hash)
>  {
>  	struct inode *ea_inode = NULL;
>  	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -965,67 +1248,119 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>  		ea_inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(ea_inode);
>  		ext4_xattr_inode_set_class(ea_inode);
> -		ea_inode->i_generation = inode->i_generation;
> -		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -		/*
> -		 * A back-pointer from EA inode to parent inode will be useful
> -		 * for e2fsck.
> -		 */
> -		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>  		unlock_new_inode(ea_inode);
>  		err = ext4_inode_attach_jinode(ea_inode);
> +		if (!err)
> +			err = ext4_xattr_inode_init(handle, ea_inode, hash);
>  		if (err) {
>  			iput(ea_inode);
>  			return ERR_PTR(err);
>  		}
> +
> +		/*
> +		 * Xattr inodes are shared therefore quota charging is performed
> +		 * at a higher level.
> +		 */
> +		dquot_free_inode(ea_inode);
> +		dquot_drop(ea_inode);
> +		inode_lock(ea_inode);
> +		ea_inode->i_flags |= S_NOQUOTA;
> +		inode_unlock(ea_inode);
>  	}
>  
>  	return ea_inode;
>  }
>  
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
>  {
> -	struct inode *ea_inode = NULL;
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
>  	int err;
>  
> -	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -	if (err)
> -		return err;
> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			ea_inode = NULL;
> +			goto next;
> +		}
>  
> -	clear_nlink(ea_inode);
> -	iput(ea_inode);
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
>  
> -	return 0;
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +	next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
>  }
>  
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
>  {
>  	struct inode *ea_inode;
> +	u32 hash;
>  	int err;
>  
> +	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
>  	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>  	if (IS_ERR(ea_inode))
>  		return PTR_ERR(ea_inode);
>  
>  	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
>  
> -	iput(ea_inode);
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);
>  
> -	return err;
> +	*ret_inode = ea_inode;
> +	return 0;
>  }
>  
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1033,11 +1368,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  				handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_xattr_entry *last;
> +	struct ext4_xattr_entry *here = s->here;
>  	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>  	int in_inode = i->in_inode;
> -	int rc;
> +	struct inode *old_ea_inode = NULL;
> +	struct inode *new_ea_inode = NULL;
> +	int ret;
>  
> -	/* Compute min_offs and last. */
> +	/*
> +	 * Optimization for the simple case when old and new values have the
> +	 * same padded sizes. Not applicable if the existing value is stored in
> +	 * an external inode.
> +	 */
> +	if (i->value && !s->not_found && !here->e_value_inum &&
> +	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
> +	    EXT4_XATTR_SIZE(i->value_len)) {
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(i->value_len);
> +
> +		here->e_value_size = cpu_to_le32(i->value_len);
> +		if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +			memset(val, 0, size);
> +		} else {
> +			memcpy(val, i->value, i->value_len);
> +			/* Clear padding bytes. */
> +			memset(val + i->value_len, 0, size - i->value_len);
> +		}
> +		return 0;
> +	}
> +
> +	/* Find out min_offs and last to calculate the free space. */
>  	last = s->first;
>  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
>  		if (!last->e_value_inum && last->e_value_size) {
> @@ -1048,120 +1409,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>  	}
>  	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>  	if (!s->not_found) {
> -		if (!in_inode &&
> -		    !s->here->e_value_inum && s->here->e_value_size) {
> -			size_t size = le32_to_cpu(s->here->e_value_size);
> +		if (!here->e_value_inum && here->e_value_size) {
> +			size_t size = le32_to_cpu(here->e_value_size);
>  			free += EXT4_XATTR_SIZE(size);
>  		}
>  		free += EXT4_XATTR_LEN(name_len);
>  	}
>  	if (i->value) {
> -		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
>  
> -		if (in_inode)
> -			value_len = 0;
> +		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +	}
>  
> -		if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -			return -ENOSPC;
> +	/*
> +	 * Getting access to old and new ea inodes is subject to failures.
> +	 * Finish that work before doing any modifications to the xattr data.
> +	 */
> +	if (!s->not_found && here->e_value_inum) {
> +		ret = ext4_xattr_inode_iget(inode,
> +		 			    le32_to_cpu(here->e_value_inum),
> +		 			    &old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
>  	}
> +	if (i->value && in_inode) {
> +		WARN_ON_ONCE(!i->value_len);
>  
> -	if (i->value && s->not_found) {
> -		/* Insert the new name. */
> -		size_t size = EXT4_XATTR_LEN(name_len);
> -		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -		memmove((void *)s->here + size, s->here, rest);
> -		memset(s->here, 0, size);
> -		s->here->e_name_index = i->name_index;
> -		s->here->e_name_len = name_len;
> -		memcpy(s->here->e_name, i->name, name_len);
> -	} else {
> -		if (!s->here->e_value_inum && s->here->e_value_size &&
> -		    s->here->e_value_offs > 0) {
> -			void *first_val = s->base + min_offs;
> -			size_t offs = le16_to_cpu(s->here->e_value_offs);
> -			void *val = s->base + offs;
> -			size_t size = EXT4_XATTR_SIZE(
> -				le32_to_cpu(s->here->e_value_size));
> -
> -			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -				/* The old and the new value have the same
> -				   size. Just replace. */
> -				s->here->e_value_size =
> -					cpu_to_le32(i->value_len);
> -				if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -					memset(val, 0, size);
> -				} else {
> -					/* Clear pad bytes first. */
> -					memset(val + size - EXT4_XATTR_PAD, 0,
> -					       EXT4_XATTR_PAD);
> -					memcpy(val, i->value, i->value_len);
> -				}
> -				return 0;
> -			}
> +		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +		if (ret)
> +			goto out;
>  
> -			/* Remove the old value. */
> -			memmove(first_val + size, first_val, val - first_val);
> -			memset(first_val, 0, size);
> -			s->here->e_value_size = 0;
> -			s->here->e_value_offs = 0;
> -			min_offs += size;
> -
> -			/* Adjust all value offsets. */
> -			last = s->first;
> -			while (!IS_LAST_ENTRY(last)) {
> -				size_t o = le16_to_cpu(last->e_value_offs);
> -				if (!last->e_value_inum &&
> -				    last->e_value_size && o < offs)
> -					last->e_value_offs =
> -						cpu_to_le16(o + size);
> -				last = EXT4_XATTR_NEXT(last);
> -			}
> +		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +						     i->value_len,
> +						     &new_ea_inode);
> +		if (ret) {
> +			new_ea_inode = NULL;
> +			ext4_xattr_inode_free_quota(inode, i->value_len);
> +			goto out;
>  		}
> -		if (s->here->e_value_inum) {
> -			ext4_xattr_inode_unlink(inode,
> -					    le32_to_cpu(s->here->e_value_inum));
> -			s->here->e_value_inum = 0;
> +	}
> +
> +	if (old_ea_inode) {
> +		/* We are ready to release ref count on the old_ea_inode. */
> +		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +		if (ret) {
> +			/* Release newly required ref count on new_ea_inode. */
> +			if (new_ea_inode) {
> +				int err;
> +
> +				err = ext4_xattr_inode_dec_ref(handle,
> +							       new_ea_inode);
> +				if (err)
> +					ext4_warning_inode(new_ea_inode,
> +						  "dec ref new_ea_inode err=%d",
> +						  err);
> +				ext4_xattr_inode_free_quota(inode,
> +							    i->value_len);
> +			}
> +			goto out;
>  		}
> -		if (!i->value) {
> -			/* Remove the old name. */
> -			size_t size = EXT4_XATTR_LEN(name_len);
> -			last = ENTRY((void *)last - size);
> -			memmove(s->here, (void *)s->here + size,
> -				(void *)last - (void *)s->here + sizeof(__u32));
> -			memset(last, 0, size);
> +
> +		ext4_xattr_inode_free_quota(inode,
> +					    le32_to_cpu(here->e_value_size));
> +	}
> +
> +	/* No failures allowed past this point. */
> +
> +	if (!s->not_found && here->e_value_offs) {
> +		/* Remove the old value. */
> +		void *first_val = s->base + min_offs;
> +		size_t offs = le16_to_cpu(here->e_value_offs);
> +		void *val = s->base + offs;
> +		size_t size = EXT4_XATTR_SIZE(
> +			le32_to_cpu(here->e_value_size));
> +
> +		memmove(first_val + size, first_val, val - first_val);
> +		memset(first_val, 0, size);
> +		min_offs += size;
> +
> +		/* Adjust all value offsets. */
> +		last = s->first;
> +		while (!IS_LAST_ENTRY(last)) {
> +			size_t o = le16_to_cpu(last->e_value_offs);
> +			if (!last->e_value_inum &&
> +			    last->e_value_size && o < offs)
> +				last->e_value_offs =
> +					cpu_to_le16(o + size);
> +			last = EXT4_XATTR_NEXT(last);
>  		}
>  	}
>  
> +	if (!s->not_found && !i->value) {
> +		/* Remove old name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		last = ENTRY((void *)last - size);
> +		memmove(here, (void *)here + size,
> +			(void *)last - (void *)here + sizeof(__u32));
> +		memset(last, 0, size);
> +	} else if (s->not_found && i->value) {
> +		/* Insert new name. */
> +		size_t size = EXT4_XATTR_LEN(name_len);
> +		size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +		memmove((void *)here + size, here, rest);
> +		memset(here, 0, size);
> +		here->e_name_index = i->name_index;
> +		here->e_name_len = name_len;
> +		memcpy(here->e_name, i->name, name_len);
> +	} else {
> +		WARN_ON_ONCE(s->not_found || !i->value);
> +		/* This is an update, reset value info. */
> +		here->e_value_inum = 0;
> +		here->e_value_offs = 0;
> +		here->e_value_size = 0;
> +	}
> +
>  	if (i->value) {
> -		/* Insert the new value. */
> +		/* Insert new value. */
>  		if (in_inode) {
> -			unsigned long ea_ino =
> -				le32_to_cpu(s->here->e_value_inum);
> -			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -						  i->value, i->value_len);
> -			if (rc)
> -				goto out;
> -			s->here->e_value_inum = cpu_to_le32(ea_ino);
> -			s->here->e_value_offs = 0;
> +			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>  		} else if (i->value_len) {
>  			size_t size = EXT4_XATTR_SIZE(i->value_len);
>  			void *val = s->base + min_offs - size;
> -			s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -			s->here->e_value_inum = 0;
> +			here->e_value_offs = cpu_to_le16(min_offs - size);
>  			if (i->value == EXT4_ZERO_XATTR_VALUE) {
>  				memset(val, 0, size);
>  			} else {
> -				/* Clear the pad bytes first. */
> -				memset(val + size - EXT4_XATTR_PAD, 0,
> -				       EXT4_XATTR_PAD);
>  				memcpy(val, i->value, i->value_len);
> +				/* Clear padding bytes. */
> +				memset(val + i->value_len, 0,
> +				       size - i->value_len);
>  			}
>  		}
> -		s->here->e_value_size = cpu_to_le32(i->value_len);
> +		here->e_value_size = cpu_to_le32(i->value_len);
>  	}
> -
> +	ret = 0;
>  out:
> -	return rc;
> +	iput(old_ea_inode);
> +	iput(new_ea_inode);
> +	return ret;
>  }
>  
>  struct ext4_xattr_block_find {
> @@ -1223,6 +1613,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  	struct mb_cache_entry *ce = NULL;
>  	int error = 0;
>  	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +	struct inode *ea_inode = NULL;
> +	size_t old_ea_inode_size = 0;
>  
>  #define header(x) ((struct ext4_xattr_header *)(x))
>  
> @@ -1277,6 +1669,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			header(s->base)->h_refcount = cpu_to_le32(1);
>  			s->here = ENTRY(s->base + offset);
>  			s->end = s->base + bs->bh->b_size;
> +
> +			/*
> +			 * If existing entry points to an xattr inode, we need
> +			 * to prevent ext4_xattr_set_entry() from decrementing
> +			 * ref count on it because the reference belongs to the
> +			 * original block. In this case, make the entry look
> +			 * like it has an empty value.
> +			 */
> +			if (!s->not_found && s->here->e_value_inum) {
> +				/*
> +				 * Defer quota free call for previous inode
> +				 * until success is guaranteed.
> +				 */
> +				old_ea_inode_size = le32_to_cpu(
> +							s->here->e_value_size);
> +				s->here->e_value_inum = 0;
> +				s->here->e_value_size = 0;
> +			}
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> @@ -1298,6 +1708,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		goto bad_block;
>  	if (error)
>  		goto cleanup;
> +
> +	if (i->value && s->here->e_value_inum) {
> +		unsigned int ea_ino;
> +
> +		/*
> +		 * A ref count on ea_inode has been taken as part of the call to
> +		 * ext4_xattr_set_entry() above. We would like to drop this
> +		 * extra ref but we have to wait until the xattr block is
> +		 * initialized and has its own ref count on the ea_inode.
> +		 */
> +		ea_ino = le32_to_cpu(s->here->e_value_inum);
> +		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +		if (error) {
> +			ea_inode = NULL;
> +			goto cleanup;
> +		}
> +	}
> +
>  	if (!IS_LAST_ENTRY(s->first))
>  		ext4_xattr_rehash(header(s->base), s->here);
>  
> @@ -1408,6 +1836,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  						 EXT4_FREE_BLOCKS_METADATA);
>  				goto cleanup;
>  			}
> +			error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +						      ENTRY(header(s->base)+1));
> +			if (error)
> +				goto getblk_failed;
> +			if (ea_inode) {
> +				/* Drop the extra ref on ea_inode. */
> +				error = ext4_xattr_inode_dec_ref(handle,
> +								 ea_inode);
> +				if (error)
> +					ext4_warning_inode(ea_inode,
> +							   "dec ref error=%d",
> +							   error);
> +				iput(ea_inode);
> +				ea_inode = NULL;
> +			}
> +
>  			lock_buffer(new_bh);
>  			error = ext4_journal_get_create_access(handle, new_bh);
>  			if (error) {
> @@ -1427,15 +1871,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  		}
>  	}
>  
> +	if (old_ea_inode_size)
> +		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>  	/* Update the inode. */
>  	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>  
>  	/* Drop the previous xattr block. */
> -	if (bs->bh && bs->bh != new_bh)
> -		ext4_xattr_release_block(handle, inode, bs->bh);
> +	if (bs->bh && bs->bh != new_bh) {
> +		struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +		ext4_xattr_release_block(handle, inode, bs->bh,
> +					 &ea_inode_array,
> +					 0 /* extra_credits */);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +	}
>  	error = 0;
>  
>  cleanup:
> +	if (ea_inode) {
> +		int error2;
> +		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		if (error2)
> +			ext4_warning_inode(ea_inode, "dec ref error=%d",
> +					   error2);
> +
> +		/* If there was an error, revert the quota charge. */
> +		if (error)
> +			ext4_xattr_inode_free_quota(inode,
> +						    i_size_read(ea_inode));
> +		iput(ea_inode);
> +	}
>  	if (ce)
>  		mb_cache_entry_put(ext4_mb_cache, ce);
>  	brelse(new_bh);
> @@ -1546,6 +2011,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
>  	return 0;
>  }
>  
> +struct ext4_xattr_ea_info {
> +	__le64 ref_count;	/* number of xattr entry references */
> +	__le32 hash;		/* crc32c hash of xattr data */
> +	__le32 reserved;	/* reserved, must be 0 */
> +};
> +
> +static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
> +				 u32 hash)
> +{
> +	struct ext4_xattr_ea_info ea_info = {
> +		.ref_count = cpu_to_le64(1),
> +		.hash = cpu_to_le32(hash),
> +		.reserved = 0,
> +	};
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
> +}
> +
> +static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
> +				     u64 *ref_return, u32 *hash)
> +{
> +	struct ext4_xattr_ea_info ea_info;
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +		.value = &ea_info,
> +		.value_len = sizeof(ea_info),
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	memcpy(&ea_info,
> +	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
> +	       sizeof(ea_info));
> +
> +	if (hash)
> +		*hash = le32_to_cpu(ea_info.hash);
> +
> +	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
> +	ea_info.ref_count = cpu_to_le64(*ref_return);
> +
> +	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
> +}
> +
> +static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
> +{
> +	struct ext4_xattr_info i = {
> +                .name_index = EXT4_XATTR_INDEX_SYSTEM,
> +		.name = EXT4_XATTR_SYSTEM_EA_INFO,
> +	};
> +	struct ext4_xattr_ibody_find is = {
> +		.s = { .not_found = -ENODATA, },
> +	};
> +	struct ext4_xattr_ea_info *ea_info;
> +	void *ptr;
> +	int err;
> +
> +	err = ext4_get_inode_loc(ea_inode, &is.iloc);
> +	if (err)
> +		return err;
> +
> +	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
> +	if (err)
> +		return err;
> +
> +	if (WARN_ON(is.s.not_found) ||
> +	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
> +		return -EFSCORRUPTED;
> +
> +	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
> +	ea_info = (struct ext4_xattr_ea_info *)ptr;
> +
> +	if (WARN_ON(ea_info->reserved != 0))
> +		return -EFSCORRUPTED;
> +
> +	*hash = le32_to_cpu(ea_info->hash);
> +	return 0;
> +}
> +
>  static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  				 struct ext4_xattr_info *i)
>  {
> @@ -1560,6 +2136,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>  	return !memcmp(value, i->value, i->value_len);
>  }
>  
> +struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +	struct buffer_head *bh;
> +	int error;
> +
> +	if (!EXT4_I(inode)->i_file_acl)
> +		return NULL;
> +	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +	if (!bh)
> +		return ERR_PTR(-EIO);
> +	error = ext4_xattr_check_block(inode, bh);
> +	if (error)
> +		return ERR_PTR(error);
> +	return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1602,9 +2194,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  
>  	/* Check journal credits under write lock. */
>  	if (ext4_handle_valid(handle)) {
> +		struct buffer_head *bh;
>  		int credits;
>  
> -		credits = ext4_xattr_set_credits(inode, value_len);
> +		bh = ext4_xattr_get_block(inode);
> +		if (IS_ERR(bh)) {
> +			error = PTR_ERR(bh);
> +			goto cleanup;
> +		}
> +
> +		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +
>  		if (!ext4_handle_has_enough_credits(handle, credits)) {
>  			error = -ENOSPC;
>  			goto cleanup;
> @@ -1640,6 +2241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  		if (flags & XATTR_CREATE)
>  			goto cleanup;
>  	}
> +
>  	if (!value) {
>  		if (!is.s.not_found)
>  			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1708,34 +2310,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>  	return error;
>  }
>  
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -	struct super_block *sb = inode->i_sb;
> -	int credits;
> -
> -	if (!EXT4_SB(sb)->s_journal)
> -		return 0;
> -
> -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +	struct buffer_head *bh;
> +	int err;
>  
> -	/*
> -	 * In case of inline data, we may push out the data to a block,
> -	 * so we need to reserve credits for this eventuality
> -	 */
> -	if (ext4_has_inline_data(inode))
> -	        credits += ext4_writepage_trans_blocks(inode) + 1;
> +	*credits = 0;
>  
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -					sb->s_blocksize_bits;
> +	if (!EXT4_SB(inode->i_sb)->s_journal)
> +		return 0;
>  
> -		/* For new inode */
> -		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +	down_read(&EXT4_I(inode)->xattr_sem);
>  
> -		/* For data blocks of EA inode */
> -		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +	bh = ext4_xattr_get_block(inode);
> +	if (IS_ERR(bh)) {
> +		err = PTR_ERR(bh);
> +	} else {
> +		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +		brelse(bh);
> +		err = 0;
>  	}
> -	return credits;
> +
> +	up_read(&EXT4_I(inode)->xattr_sem);
> +	return err;
>  }
>  
>  /*
> @@ -1760,7 +2357,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>  		return error;
>  
>  retry:
> -	credits = ext4_xattr_set_credits(inode, value_len);
> +	error = ext4_xattr_set_credits(inode, value_len, &credits);
> +	if (error)
> +		return error;
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2066,10 +2666,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  	return error;
>  }
>  
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2114,21 +2714,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array,
> -			int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
>  {
>  	struct buffer_head *bh = NULL;
>  	struct ext4_xattr_ibody_header *header;
> -	struct ext4_inode *raw_inode;
>  	struct ext4_iloc iloc = { .bh = NULL };
> +	struct ext4_xattr_entry *entry;
>  	int error;
>  
>  	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2140,66 +2738,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  		goto cleanup;
>  	}
>  
> -	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -		goto delete_external_ea;
> -
> -	error = ext4_get_inode_loc(inode, &iloc);
> -	if (error)
> -		goto cleanup;
> +	if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>  
> -	error = ext4_journal_get_write_access(handle, iloc.bh);
> -	if (error)
> -		goto cleanup;
> +		error = ext4_get_inode_loc(inode, &iloc);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +			goto cleanup;
> +		}
>  
> -	raw_inode = ext4_raw_inode(&iloc);
> -	header = IHDR(inode, raw_inode);
> -	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -				    false /* block_csum */, ea_inode_array,
> -				    extra_credits);
> +		error = ext4_journal_get_write_access(handle, iloc.bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "write access (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  
> -delete_external_ea:
> -	if (!EXT4_I(inode)->i_file_acl) {
> -		error = 0;
> -		goto cleanup;
> -	}
> -	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -	if (!bh) {
> -		EXT4_ERROR_INODE(inode, "block %llu read error",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EIO;
> -		goto cleanup;
> -	}
> -	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -		EXT4_ERROR_INODE(inode, "bad block %llu",
> -				 EXT4_I(inode)->i_file_acl);
> -		error = -EFSCORRUPTED;
> -		goto cleanup;
> +		header = IHDR(inode, ext4_raw_inode(&iloc));
> +		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +						     IFIRST(header),
> +						     false /* block_csum */,
> +						     ea_inode_array,
> +						     extra_credits,
> +						     false /* skip_quota */);
>  	}
>  
> -	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -		error = ext4_journal_get_write_access(handle, bh);
> -		if (error) {
> -			EXT4_ERROR_INODE(inode, "write access %llu",
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			EXT4_ERROR_INODE(inode, "block %llu read error",
>  					 EXT4_I(inode)->i_file_acl);
> +			error = -EIO;
> +			goto cleanup;
> +		}
> +		error = ext4_xattr_check_block(inode, bh);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +					 EXT4_I(inode)->i_file_acl, error);
>  			goto cleanup;
>  		}
> -		ext4_xattr_inode_remove_all(handle, inode, bh,
> -					    BFIRST(bh),
> -					    true /* block_csum */,
> -					    ea_inode_array,
> -					    extra_credits);
> -	}
>  
> -	ext4_xattr_release_block(handle, inode, bh);
> -	/* Update i_file_acl within the same transaction that releases block. */
> -	EXT4_I(inode)->i_file_acl = 0;
> -	error = ext4_mark_inode_dirty(handle, inode);
> -	if (error) {
> -		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -				 error);
> -		goto cleanup;
> +		if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +			     entry = EXT4_XATTR_NEXT(entry))
> +				if (entry->e_value_inum)
> +					ext4_xattr_inode_free_quota(inode,
> +					      le32_to_cpu(entry->e_value_size));
> +
> +		}
> +
> +		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +					 extra_credits);
> +		/*
> +		 * Update i_file_acl value in the same transaction that releases
> +		 * block.
> +		 */
> +		EXT4_I(inode)->i_file_acl = 0;
> +		error = ext4_mark_inode_dirty(handle, inode);
> +		if (error) {
> +			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +					 error);
> +			goto cleanup;
> +		}
>  	}
> +	error = 0;
>  cleanup:
>  	brelse(iloc.bh);
>  	brelse(bh);
> @@ -2208,17 +2811,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -	struct inode	*ea_inode;
> -	int		idx = 0;
> +	int idx;
>  
>  	if (ea_inode_array == NULL)
>  		return;
>  
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		ea_inode = ea_inode_array->inodes[idx];
> -		clear_nlink(ea_inode);
> -		iput(ea_inode);
> -	}
> +	for (idx = 0; idx < ea_inode_array->count; ++idx)
> +		iput(ea_inode_array->inodes[idx]);
>  	kfree(ea_inode_array);
>  }
>  
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -70,19 +70,6 @@ struct ext4_xattr_entry {
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>  
>  /*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
> -/*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
>  */
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +				  int *credits);
>  
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>  				   struct ext4_xattr_inode_array **array,
>  				   int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 77a5b99d8f92..7dfdca822ccb 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use hash of data as a key and provide a value that may represent a
> + * block or inode number. That's why keys need not be unique (hash of different
> + * data may be the same). However user provided value always uniquely
> + * identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> -- 
> 2.13.0.219.gdb65acc882-goog
> 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 27/28] ext4: xattr inode deduplication
  2017-06-02  5:41         ` Darrick J. Wong
  (?)
  (?)
@ 2017-06-02 12:46         ` Tahsin Erdogan
  2017-06-02 17:59             ` Darrick J. Wong
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-02 12:46 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

> Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
> that the metadata checksum value also has the fs uuid stamped into it.

I have thought about using sbi->s_csum_seed and was a little hesitant
because it involves adding more complexity to e2fsprogs to handle
cases like changing uuid or turning off metadata_csum. After thinking
more about this, I think it is doable.

> Even if you dismiss that, we usually follow the convention of
> initializing the crc32c calculation with (~0U), not (0U), to strengthen
> crc32c's ability to detect zeroes being injected at the start of the
> stream.

Agreed, using ~0 is definitely better than 0.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 27/28] ext4: xattr inode deduplication
  2017-06-02 12:46         ` Tahsin Erdogan
  2017-06-02 17:59             ` Darrick J. Wong
@ 2017-06-02 17:59             ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-06-02 17:59 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Fri, Jun 02, 2017 at 05:46:22AM -0700, Tahsin Erdogan wrote:
> > Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
> > that the metadata checksum value also has the fs uuid stamped into it.
> 
> I have thought about using sbi->s_csum_seed and was a little hesitant
> because it involves adding more complexity to e2fsprogs to handle
> cases like changing uuid or turning off metadata_csum. After thinking
> more about this, I think it is doable.

e2fsprogs already has code to walk the fs to rewrite/remove checksums,
so it shouldn't be too much effort to tap into that to rewrite the
ea_info hash.

> > Even if you dismiss that, we usually follow the convention of
> > initializing the crc32c calculation with (~0U), not (0U), to strengthen
> > crc32c's ability to detect zeroes being injected at the start of the
> > stream.
> 
> Agreed, using ~0 is definitely better than 0.

--D

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 27/28] ext4: xattr inode deduplication
@ 2017-06-02 17:59             ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-06-02 17:59 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Fri, Jun 02, 2017 at 05:46:22AM -0700, Tahsin Erdogan wrote:
> > Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
> > that the metadata checksum value also has the fs uuid stamped into it.
> 
> I have thought about using sbi->s_csum_seed and was a little hesitant
> because it involves adding more complexity to e2fsprogs to handle
> cases like changing uuid or turning off metadata_csum. After thinking
> more about this, I think it is doable.

e2fsprogs already has code to walk the fs to rewrite/remove checksums,
so it shouldn't be too much effort to tap into that to rewrite the
ea_info hash.

> > Even if you dismiss that, we usually follow the convention of
> > initializing the crc32c calculation with (~0U), not (0U), to strengthen
> > crc32c's ability to detect zeroes being injected at the start of the
> > stream.
> 
> Agreed, using ~0 is definitely better than 0.

--D

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH v2 27/28] ext4: xattr inode deduplication
@ 2017-06-02 17:59             ` Darrick J. Wong
  0 siblings, 0 replies; 100+ messages in thread
From: Darrick J. Wong @ 2017-06-02 17:59 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Deepa Dinamani, Dave Kleikamp, jfs-discussion, Theodore Ts'o,
	linux-kernel, reiserfs-devel, Jens Axboe, linux-fsdevel,
	Mike Christie, Andreas Dilger, Alexander Viro, Jan Kara,
	Fabian Frederick, linux-ext4, ocfs2-devel

On Fri, Jun 02, 2017 at 05:46:22AM -0700, Tahsin Erdogan wrote:
> > Hmm... normally we'd supply sbi->s_csum_seed as the second argument so
> > that the metadata checksum value also has the fs uuid stamped into it.
> 
> I have thought about using sbi->s_csum_seed and was a little hesitant
> because it involves adding more complexity to e2fsprogs to handle
> cases like changing uuid or turning off metadata_csum. After thinking
> more about this, I think it is doable.

e2fsprogs already has code to walk the fs to rewrite/remove checksums,
so it shouldn't be too much effort to tap into that to rewrite the
ea_info hash.

> > Even if you dismiss that, we usually follow the convention of
> > initializing the crc32c calculation with (~0U), not (0U), to strengthen
> > crc32c's ability to detect zeroes being injected at the start of the
> > stream.
> 
> Agreed, using ~0 is definitely better than 0.

--D

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH v3 27/28] ext4: xattr inode deduplication
  2017-06-02 17:59             ` Darrick J. Wong
  (?)
  (?)
@ 2017-06-02 23:35             ` Tahsin Erdogan
  2017-06-14 14:34               ` [PATCH v4 " Tahsin Erdogan
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-02 23:35 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Ext4 now supports xattr values that are up to 64k in size (vfs limit).
Large xattr values are stored in external inodes, each one holding a
single value. Once written, the data blocks of these inodes are immutable.

Real-world use cases are expected to have a lot of value duplication,
such as inherited ACLs. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a
best-effort sharing scheme. When an xattr inode is read from disk (i.e.
on a getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal, the ref count is decremented, and if it reaches zero, the inode
is deleted.

The quota charging for such inodes is manually managed. Every reference
holder is charged the full size as if there were no sharing happening.
This is consistent with how xattr blocks are also charged.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v3:
 - use s_csum_seed for hash calculations when available
 - return error on stored vs calculated hash mismatch
 
v2:
 - make dependency on crc32c dynamic
 - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
   they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver

 fs/ext4/acl.c   |    5 +-
 fs/ext4/ext4.h  |   22 +-
 fs/ext4/inode.c |    9 +-
 fs/ext4/super.c |   25 +-
 fs/ext4/xattr.c | 1081 ++++++++++++++++++++++++++++++++++++++++++-------------
 fs/ext4/xattr.h |   17 +-
 fs/mbcache.c    |    9 +-
 7 files changed, 896 insertions(+), 272 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 74f7ac539e00..8db03e5c78bc 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (error)
 		return error;
 retry:
-	credits = ext4_xattr_set_credits(inode, acl_size);
+	error = ext4_xattr_set_credits(inode, acl_size, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d79d8d7bee88..7ceb1f81e4b8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1517,6 +1517,7 @@ struct ext4_sb_info {
 	long s_es_nr_inode;
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
+	struct mb_cache *s_ea_inode_cache;
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
@@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
 	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
 }
 
-#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+	return IS_NOQUOTA(inode) &&
+	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
 
 /*
  * This structure is stuffed into the struct file's private_data field
@@ -2709,19 +2714,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
 extern int ext4_register_li_request(struct super_block *sb,
 				    ext4_group_t first_not_zeroed);
 
-static inline int ext4_has_group_desc_csum(struct super_block *sb)
-{
-	return ext4_has_feature_gdt_csum(sb) ||
-	       EXT4_SB(sb)->s_chksum_driver != NULL;
-}
-
 static inline int ext4_has_metadata_csum(struct super_block *sb)
 {
 	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
 		     !EXT4_SB(sb)->s_chksum_driver);
 
-	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+	return ext4_has_feature_metadata_csum(sb) &&
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+}
+
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d6936f0d8a4..6f5872197d6c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-	if (ei->i_flags & EXT4_EA_INODE_FL)
+
+	if (ei->i_flags & EXT4_EA_INODE_FL) {
 		ext4_xattr_inode_set_class(inode);
+
+		inode_lock(inode);
+		inode->i_flags |= S_NOQUOTA;
+		inode_unlock(inode);
+	}
+
 	unlock_new_inode(inode);
 	return inode;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b02a23ec92ca..9fcd29e21dc7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
@@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	credits = ext4_xattr_set_credits(inode, len);
+	res = ext4_xattr_set_credits(inode, len, &credits);
+	if (res)
+		return res;
+
 	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Load the checksum driver */
-	if (ext4_has_feature_metadata_csum(sb)) {
+	if (ext4_has_feature_metadata_csum(sb) ||
+	    ext4_has_feature_ea_inode(sb)) {
 		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(sbi->s_chksum_driver)) {
 			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_ea_inode(sb)) {
+		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+		if (!sbi->s_ea_inode_cache) {
+			ext4_msg(sb, KERN_ERR,
+				 "Failed to create an s_ea_inode_cache");
+			goto failed_mount_wq;
+		}
+	}
+
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
@@ -4296,6 +4313,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6acce1f689ab..2a2d2c58e0fb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -79,6 +79,7 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
 			    struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
 			      struct ext4_xattr_entry *);
+static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash);
 
 static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
@@ -105,13 +106,23 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
+#define EXT4_XATTR_SYSTEM_EA_INFO  "eai"
+
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_ea_inode_cache)
+
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
 
+static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
+				 u32 hash);
+static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
+				     u64 *ref_return, u32 *hash);
+
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -280,15 +291,25 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 	return cmp ? -ENODATA : 0;
 }
 
+static u32
+ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
+{
+	return ext4_chksum(sbi, sbi->s_csum_seed ?: ~0, buffer, size);
+}
+
 /*
  * Read the EA value from an inode.
  */
-static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
+static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size,
+				 u32 *ret_hash)
 {
 	unsigned long block = 0;
 	struct buffer_head *bh = NULL;
 	int blocksize = ea_inode->i_sb->s_blocksize;
 	size_t csize, copied = 0;
+	void *copy_pos = buf;
+	u32 calc_hash, stored_hash;
+	int err;
 
 	while (copied < size) {
 		csize = (size - copied) > blocksize ? blocksize : size - copied;
@@ -298,13 +319,29 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 		if (!bh)
 			return -EFSCORRUPTED;
 
-		memcpy(buf, bh->b_data, csize);
+		memcpy(copy_pos, bh->b_data, csize);
 		brelse(bh);
 
-		buf += csize;
+		copy_pos += csize;
 		block += 1;
 		copied += csize;
 	}
+
+	calc_hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buf, size);
+
+	/* Verify stored hash matches calculated hash. */
+	err = ext4_xattr_read_ea_hash(ea_inode, &stored_hash);
+	if (err)
+		return err;
+
+	if (calc_hash != stored_hash) {
+		ext4_warning_inode(ea_inode,
+			"EA inode calc_hash=%#x does not match stored_hash=%#x",
+			calc_hash, stored_hash);
+		return -EFSCORRUPTED;
+	}
+	if (ret_hash)
+		*ret_hash = calc_hash;
 	return 0;
 }
 
@@ -329,14 +366,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 		goto error;
 	}
 
-	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
-	    inode->i_generation != parent->i_generation) {
-		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
-			   "to parent is invalid.", ea_ino);
-		err = -EINVAL;
-		goto error;
-	}
-
 	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
 		ext4_error(parent->i_sb, "EA inode %lu does not have "
 			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
@@ -358,17 +387,33 @@ static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
 		     size_t size)
 {
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
 	struct inode *ea_inode;
-	int ret;
+	u32 hash;
+	int err;
 
-	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (ret)
-		return ret;
+	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+	if (err) {
+		ea_inode = NULL;
+		goto out;
+	}
 
-	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
-	iput(ea_inode);
+	if (i_size_read(ea_inode) != size) {
+		ext4_warning_inode(ea_inode,
+				   "ea_inode file size=%llu entry size=%zu",
+				   i_size_read(ea_inode), size);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
 
-	return ret;
+	err = ext4_xattr_inode_read(ea_inode, buffer, size, &hash);
+	if (!err) {
+		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+				      ea_inode->i_ino, true /* reusable */);
+	}
+out:
+	iput(ea_inode);
+	return err;
 }
 
 static int
@@ -657,6 +702,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+static inline size_t round_up_cluster(struct inode *inode, size_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
+				    inode->i_blkbits);
+	size_t mask = ~(cluster_size - 1);
+
+	return (length + cluster_size - 1) & mask;
+}
+
+static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
+{
+	int err;
+
+	err = dquot_alloc_inode(inode);
+	if (err)
+		return err;
+	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
+	if (err)
+		dquot_free_inode(inode);
+	return err;
+}
+
+static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+{
+	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
+	dquot_free_inode(inode);
+}
+
+static int __ext4_xattr_set_credits(struct super_block *sb,
+				    struct buffer_head *block_bh,
+				    size_t value_len)
+{
+	int credits;
+	int blocks;
+
+	/*
+	 * 1) Owner inode update
+	 * 2) Ref count update on old xattr block
+	 * 3) new xattr block
+	 * 4) block bitmap update for new xattr block
+	 * 5) group descriptor for new xattr block
+	 */
+	credits = 5;
+
+	/* We are done if ea_inode feature is not enabled. */
+	if (!ext4_has_feature_ea_inode(sb))
+		return credits;
+
+	/* New ea_inode, inode map, block bitmap, group descriptor. */
+	credits += 4;
+
+	/* Data blocks. */
+	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	/* Indirection block. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Blocks themselves. */
+	credits += blocks;
+
+	/* Dereference ea_inode holding old xattr value.
+	 * Old ea_inode, inode map, block bitmap, group descriptor.
+	 */
+	credits += 4;
+
+	/* Data blocks for old ea_inode. */
+	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
+
+	/* Indirection block for old ea_inode. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Quota updates. */
+	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
+
+	/* We may need to clone the existing xattr block in which case we need
+	 * to increment ref counts for existing ea_inodes referenced by it.
+	 */
+	if (block_bh) {
+		struct ext4_xattr_entry *entry = BFIRST(block_bh);
+
+		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				/* Ref count update on ea_inode. */
+				credits += 1;
+	}
+	return credits;
+}
+
 int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 			      int credits, struct buffer_head *bh,
 			      bool dirty, bool block_csum)
@@ -706,12 +846,139 @@ int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+				       int ref_change)
+{
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+	struct ext4_iloc iloc;
+	s64 ref_return;
+	u32 hash;
+	int ret;
+
+	inode_lock(ea_inode);
+
+	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
+	if (ret) {
+		iloc.bh = NULL;
+		goto out;
+	}
+
+	ret = ext4_xattr_update_ea_info(ea_inode, ref_change, &ref_return,
+					&hash);
+	if (ret)
+		goto out;
+
+	if (ref_change > 0) {
+		WARN_ONCE(ref_return <= 0, "EA inode %lu ref_return=%lld",
+			  ea_inode->i_ino, ref_return);
+
+		if (ref_return == 1) {
+			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			set_nlink(ea_inode, 1);
+			ext4_orphan_del(handle, ea_inode);
+
+			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+					      ea_inode->i_ino,
+					      true /* reusable */);
+		}
+	} else {
+		WARN_ONCE(ref_return < 0, "EA inode %lu ref_return=%lld",
+			  ea_inode->i_ino, ref_return);
+
+		if (ref_return == 0) {
+			WARN_ONCE(ea_inode->i_nlink != 1,
+				  "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			clear_nlink(ea_inode);
+			ext4_orphan_add(handle, ea_inode);
+
+			mb_cache_entry_delete(ea_inode_cache, hash,
+					      ea_inode->i_ino);
+		}
+	}
+
+	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
+	iloc.bh = NULL;
+	if (ret)
+		ext4_warning_inode(ea_inode,
+				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
+out:
+	brelse(iloc.bh);
+	inode_unlock(ea_inode);
+	return ret;
+}
+
+static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
+}
+
+static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
+}
+
+static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
+					struct ext4_xattr_entry *first)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_entry *failed_entry;
+	unsigned int ea_ino;
+	int err, saved_err;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			goto cleanup;
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "inc ref error %d", err);
+			iput(ea_inode);
+			goto cleanup;
+		}
+		iput(ea_inode);
+	}
+	return 0;
+
+cleanup:
+	saved_err = err;
+	failed_entry = entry;
+
+	for (entry = first; entry != failed_entry;
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err) {
+			ext4_warning(parent->i_sb,
+				     "cleanup ea_ino %u iget error %d", ea_ino,
+				     err);
+			continue;
+		}
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err)
+			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
+					   err);
+		iput(ea_inode);
+	}
+	return saved_err;
+}
+
 static void
-ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
-			    struct buffer_head *bh,
-			    struct ext4_xattr_entry *first, bool block_csum,
-			    struct ext4_xattr_inode_array **ea_inode_array,
-			    int extra_credits)
+ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
+			     struct buffer_head *bh,
+			     struct ext4_xattr_entry *first, bool block_csum,
+			     struct ext4_xattr_inode_array **ea_inode_array,
+			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
@@ -748,10 +1015,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
 			continue;
 		}
 
-		inode_lock(ea_inode);
-		clear_nlink(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
+					   err);
+			continue;
+		}
+
+		if (!skip_quota)
+			ext4_xattr_inode_free_quota(parent,
+					      le32_to_cpu(entry->e_value_size));
 
 		/*
 		 * Forget about ea_inode within the same transaction that decrements the ref
@@ -784,7 +1057,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
  */
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
-			 struct buffer_head *bh)
+			 struct buffer_head *bh,
+			 struct ext4_xattr_inode_array **ea_inode_array,
+			 int extra_credits)
 {
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 	u32 hash, ref;
@@ -807,6 +1082,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
+
+		if (ext4_has_feature_ea_inode(inode->i_sb))
+			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
+						     BFIRST(bh),
+						     true /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -947,7 +1230,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
  * Create an inode to store the value of a large EA.
  */
 static struct inode *ext4_xattr_inode_create(handle_t *handle,
-					     struct inode *inode)
+					     struct inode *inode, u32 hash)
 {
 	struct inode *ea_inode = NULL;
 	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
@@ -965,67 +1248,119 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		ea_inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(ea_inode);
 		ext4_xattr_inode_set_class(ea_inode);
-		ea_inode->i_generation = inode->i_generation;
-		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
-
-		/*
-		 * A back-pointer from EA inode to parent inode will be useful
-		 * for e2fsck.
-		 */
-		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
 		unlock_new_inode(ea_inode);
 		err = ext4_inode_attach_jinode(ea_inode);
+		if (!err)
+			err = ext4_xattr_inode_init(handle, ea_inode, hash);
 		if (err) {
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
+
+		/*
+		 * Xattr inodes are shared therefore quota charging is performed
+		 * at a higher level.
+		 */
+		dquot_free_inode(ea_inode);
+		dquot_drop(ea_inode);
+		inode_lock(ea_inode);
+		ea_inode->i_flags |= S_NOQUOTA;
+		inode_unlock(ea_inode);
 	}
 
 	return ea_inode;
 }
 
-/*
- * Unlink the inode storing the value of the EA.
- */
-int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+static struct inode *
+ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
+			    size_t value_len, u32 hash)
 {
-	struct inode *ea_inode = NULL;
+	struct inode *ea_inode;
+	struct mb_cache_entry *ce;
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+	void *ea_data = NULL;
 	int err;
 
-	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (err)
-		return err;
+	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
+	while (ce) {
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		if (IS_ERR(ea_inode)) {
+			ea_inode = NULL;
+			goto next;
+		}
 
-	clear_nlink(ea_inode);
-	iput(ea_inode);
+		if (is_bad_inode(ea_inode) ||
+		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
+		    i_size_read(ea_inode) != value_len)
+			goto next;
 
-	return 0;
+		if (!ea_data)
+			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+
+		if (!ea_data) {
+			iput(ea_inode);
+			return NULL;
+		}
+
+		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len, NULL);
+		if (unlikely(err))
+			goto next;
+
+		if (!memcmp(value, ea_data, value_len)) {
+			mb_cache_entry_touch(ea_inode_cache, ce);
+			mb_cache_entry_put(ea_inode_cache, ce);
+			kvfree(ea_data);
+			return ea_inode;
+		}
+	next:
+		iput(ea_inode);
+		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
+	}
+	kvfree(ea_data);
+	return NULL;
 }
 
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
-				unsigned long *ea_ino, const void *value,
-				size_t value_len)
+static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+					  const void *value, size_t value_len,
+					  struct inode **ret_inode)
 {
 	struct inode *ea_inode;
+	u32 hash;
 	int err;
 
+	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
+	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
+	if (ea_inode) {
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			iput(ea_inode);
+			return err;
+		}
+
+		*ret_inode = ea_inode;
+		return 0;
+	}
+
 	/* Create an inode for the EA value */
-	ea_inode = ext4_xattr_inode_create(handle, inode);
+	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
 	if (IS_ERR(ea_inode))
 		return PTR_ERR(ea_inode);
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
-	if (err)
-		clear_nlink(ea_inode);
-	else
-		*ea_ino = ea_inode->i_ino;
+	if (err) {
+		ext4_xattr_inode_dec_ref(handle, ea_inode);
+		iput(ea_inode);
+		return err;
+	}
 
-	iput(ea_inode);
+	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+			      ea_inode->i_ino, true /* reusable */);
 
-	return err;
+	*ret_inode = ea_inode;
+	return 0;
 }
 
 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
@@ -1033,11 +1368,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
+	struct ext4_xattr_entry *here = s->here;
 	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
 	int in_inode = i->in_inode;
-	int rc;
+	struct inode *old_ea_inode = NULL;
+	struct inode *new_ea_inode = NULL;
+	int ret;
 
-	/* Compute min_offs and last. */
+	/*
+	 * Optimization for the simple case when old and new values have the
+	 * same padded sizes. Not applicable if the existing value is stored in
+	 * an external inode.
+	 */
+	if (i->value && !s->not_found && !here->e_value_inum &&
+	    EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) ==
+	    EXT4_XATTR_SIZE(i->value_len)) {
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+		size_t size = EXT4_XATTR_SIZE(i->value_len);
+
+		here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value == EXT4_ZERO_XATTR_VALUE) {
+			memset(val, 0, size);
+		} else {
+			memcpy(val, i->value, i->value_len);
+			/* Clear padding bytes. */
+			memset(val + i->value_len, 0, size - i->value_len);
+		}
+		return 0;
+	}
+
+	/* Find out min_offs and last to calculate the free space. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
 		if (!last->e_value_inum && last->e_value_size) {
@@ -1048,120 +1409,149 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	}
 	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
 	if (!s->not_found) {
-		if (!in_inode &&
-		    !s->here->e_value_inum && s->here->e_value_size) {
-			size_t size = le32_to_cpu(s->here->e_value_size);
+		if (!here->e_value_inum && here->e_value_size) {
+			size_t size = le32_to_cpu(here->e_value_size);
 			free += EXT4_XATTR_SIZE(size);
 		}
 		free += EXT4_XATTR_LEN(name_len);
 	}
 	if (i->value) {
-		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+		size_t value_len = in_inode ? 0 : EXT4_XATTR_SIZE(i->value_len);
 
-		if (in_inode)
-			value_len = 0;
+		if (free < EXT4_XATTR_LEN(name_len) + value_len) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
 
-		if (free < EXT4_XATTR_LEN(name_len) + value_len)
-			return -ENOSPC;
+	/*
+	 * Getting access to old and new ea inodes is subject to failures.
+	 * Finish that work before doing any modifications to the xattr data.
+	 */
+	if (!s->not_found && here->e_value_inum) {
+		ret = ext4_xattr_inode_iget(inode,
+		 			    le32_to_cpu(here->e_value_inum),
+		 			    &old_ea_inode);
+		if (ret) {
+			old_ea_inode = NULL;
+			goto out;
+		}
 	}
+	if (i->value && in_inode) {
+		WARN_ON_ONCE(!i->value_len);
 
-	if (i->value && s->not_found) {
-		/* Insert the new name. */
-		size_t size = EXT4_XATTR_LEN(name_len);
-		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
-		memmove((void *)s->here + size, s->here, rest);
-		memset(s->here, 0, size);
-		s->here->e_name_index = i->name_index;
-		s->here->e_name_len = name_len;
-		memcpy(s->here->e_name, i->name, name_len);
-	} else {
-		if (!s->here->e_value_inum && s->here->e_value_size &&
-		    s->here->e_value_offs > 0) {
-			void *first_val = s->base + min_offs;
-			size_t offs = le16_to_cpu(s->here->e_value_offs);
-			void *val = s->base + offs;
-			size_t size = EXT4_XATTR_SIZE(
-				le32_to_cpu(s->here->e_value_size));
-
-			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
-				/* The old and the new value have the same
-				   size. Just replace. */
-				s->here->e_value_size =
-					cpu_to_le32(i->value_len);
-				if (i->value == EXT4_ZERO_XATTR_VALUE) {
-					memset(val, 0, size);
-				} else {
-					/* Clear pad bytes first. */
-					memset(val + size - EXT4_XATTR_PAD, 0,
-					       EXT4_XATTR_PAD);
-					memcpy(val, i->value, i->value_len);
-				}
-				return 0;
-			}
+		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
+		if (ret)
+			goto out;
 
-			/* Remove the old value. */
-			memmove(first_val + size, first_val, val - first_val);
-			memset(first_val, 0, size);
-			s->here->e_value_size = 0;
-			s->here->e_value_offs = 0;
-			min_offs += size;
-
-			/* Adjust all value offsets. */
-			last = s->first;
-			while (!IS_LAST_ENTRY(last)) {
-				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_inum &&
-				    last->e_value_size && o < offs)
-					last->e_value_offs =
-						cpu_to_le16(o + size);
-				last = EXT4_XATTR_NEXT(last);
-			}
+		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
+						     i->value_len,
+						     &new_ea_inode);
+		if (ret) {
+			new_ea_inode = NULL;
+			ext4_xattr_inode_free_quota(inode, i->value_len);
+			goto out;
 		}
-		if (s->here->e_value_inum) {
-			ext4_xattr_inode_unlink(inode,
-					    le32_to_cpu(s->here->e_value_inum));
-			s->here->e_value_inum = 0;
+	}
+
+	if (old_ea_inode) {
+		/* We are ready to release ref count on the old_ea_inode. */
+		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
+		if (ret) {
+			/* Release newly required ref count on new_ea_inode. */
+			if (new_ea_inode) {
+				int err;
+
+				err = ext4_xattr_inode_dec_ref(handle,
+							       new_ea_inode);
+				if (err)
+					ext4_warning_inode(new_ea_inode,
+						  "dec ref new_ea_inode err=%d",
+						  err);
+				ext4_xattr_inode_free_quota(inode,
+							    i->value_len);
+			}
+			goto out;
 		}
-		if (!i->value) {
-			/* Remove the old name. */
-			size_t size = EXT4_XATTR_LEN(name_len);
-			last = ENTRY((void *)last - size);
-			memmove(s->here, (void *)s->here + size,
-				(void *)last - (void *)s->here + sizeof(__u32));
-			memset(last, 0, size);
+
+		ext4_xattr_inode_free_quota(inode,
+					    le32_to_cpu(here->e_value_size));
+	}
+
+	/* No failures allowed past this point. */
+
+	if (!s->not_found && here->e_value_offs) {
+		/* Remove the old value. */
+		void *first_val = s->base + min_offs;
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+		size_t size = EXT4_XATTR_SIZE(
+			le32_to_cpu(here->e_value_size));
+
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		last = s->first;
+		while (!IS_LAST_ENTRY(last)) {
+			size_t o = le16_to_cpu(last->e_value_offs);
+			if (!last->e_value_inum &&
+			    last->e_value_size && o < offs)
+				last->e_value_offs =
+					cpu_to_le16(o + size);
+			last = EXT4_XATTR_NEXT(last);
 		}
 	}
 
+	if (!s->not_found && !i->value) {
+		/* Remove old name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		last = ENTRY((void *)last - size);
+		memmove(here, (void *)here + size,
+			(void *)last - (void *)here + sizeof(__u32));
+		memset(last, 0, size);
+	} else if (s->not_found && i->value) {
+		/* Insert new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)here + sizeof(__u32);
+		memmove((void *)here + size, here, rest);
+		memset(here, 0, size);
+		here->e_name_index = i->name_index;
+		here->e_name_len = name_len;
+		memcpy(here->e_name, i->name, name_len);
+	} else {
+		WARN_ON_ONCE(s->not_found || !i->value);
+		/* This is an update, reset value info. */
+		here->e_value_inum = 0;
+		here->e_value_offs = 0;
+		here->e_value_size = 0;
+	}
+
 	if (i->value) {
-		/* Insert the new value. */
+		/* Insert new value. */
 		if (in_inode) {
-			unsigned long ea_ino =
-				le32_to_cpu(s->here->e_value_inum);
-			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
-						  i->value, i->value_len);
-			if (rc)
-				goto out;
-			s->here->e_value_inum = cpu_to_le32(ea_ino);
-			s->here->e_value_offs = 0;
+			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
 		} else if (i->value_len) {
 			size_t size = EXT4_XATTR_SIZE(i->value_len);
 			void *val = s->base + min_offs - size;
-			s->here->e_value_offs = cpu_to_le16(min_offs - size);
-			s->here->e_value_inum = 0;
+			here->e_value_offs = cpu_to_le16(min_offs - size);
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
 				memset(val, 0, size);
 			} else {
-				/* Clear the pad bytes first. */
-				memset(val + size - EXT4_XATTR_PAD, 0,
-				       EXT4_XATTR_PAD);
 				memcpy(val, i->value, i->value_len);
+				/* Clear padding bytes. */
+				memset(val + i->value_len, 0,
+				       size - i->value_len);
 			}
 		}
-		s->here->e_value_size = cpu_to_le32(i->value_len);
+		here->e_value_size = cpu_to_le32(i->value_len);
 	}
-
+	ret = 0;
 out:
-	return rc;
+	iput(old_ea_inode);
+	iput(new_ea_inode);
+	return ret;
 }
 
 struct ext4_xattr_block_find {
@@ -1223,6 +1613,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+	struct inode *ea_inode = NULL;
+	size_t old_ea_inode_size = 0;
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
@@ -1277,6 +1669,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			header(s->base)->h_refcount = cpu_to_le32(1);
 			s->here = ENTRY(s->base + offset);
 			s->end = s->base + bs->bh->b_size;
+
+			/*
+			 * If existing entry points to an xattr inode, we need
+			 * to prevent ext4_xattr_set_entry() from decrementing
+			 * ref count on it because the reference belongs to the
+			 * original block. In this case, make the entry look
+			 * like it has an empty value.
+			 */
+			if (!s->not_found && s->here->e_value_inum) {
+				/*
+				 * Defer quota free call for previous inode
+				 * until success is guaranteed.
+				 */
+				old_ea_inode_size = le32_to_cpu(
+							s->here->e_value_size);
+				s->here->e_value_inum = 0;
+				s->here->e_value_size = 0;
+			}
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
@@ -1298,6 +1708,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		goto bad_block;
 	if (error)
 		goto cleanup;
+
+	if (i->value && s->here->e_value_inum) {
+		unsigned int ea_ino;
+
+		/*
+		 * A ref count on ea_inode has been taken as part of the call to
+		 * ext4_xattr_set_entry() above. We would like to drop this
+		 * extra ref but we have to wait until the xattr block is
+		 * initialized and has its own ref count on the ea_inode.
+		 */
+		ea_ino = le32_to_cpu(s->here->e_value_inum);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error) {
+			ea_inode = NULL;
+			goto cleanup;
+		}
+	}
+
 	if (!IS_LAST_ENTRY(s->first))
 		ext4_xattr_rehash(header(s->base), s->here);
 
@@ -1408,6 +1836,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						 EXT4_FREE_BLOCKS_METADATA);
 				goto cleanup;
 			}
+			error = ext4_xattr_inode_inc_ref_all(handle, inode,
+						      ENTRY(header(s->base)+1));
+			if (error)
+				goto getblk_failed;
+			if (ea_inode) {
+				/* Drop the extra ref on ea_inode. */
+				error = ext4_xattr_inode_dec_ref(handle,
+								 ea_inode);
+				if (error)
+					ext4_warning_inode(ea_inode,
+							   "dec ref error=%d",
+							   error);
+				iput(ea_inode);
+				ea_inode = NULL;
+			}
+
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, new_bh);
 			if (error) {
@@ -1427,15 +1871,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if (old_ea_inode_size)
+		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+
 	/* Update the inode. */
 	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 
 	/* Drop the previous xattr block. */
-	if (bs->bh && bs->bh != new_bh)
-		ext4_xattr_release_block(handle, inode, bs->bh);
+	if (bs->bh && bs->bh != new_bh) {
+		struct ext4_xattr_inode_array *ea_inode_array = NULL;
+		ext4_xattr_release_block(handle, inode, bs->bh,
+					 &ea_inode_array,
+					 0 /* extra_credits */);
+		ext4_xattr_inode_array_free(ea_inode_array);
+	}
 	error = 0;
 
 cleanup:
+	if (ea_inode) {
+		int error2;
+		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (error2)
+			ext4_warning_inode(ea_inode, "dec ref error=%d",
+					   error2);
+
+		/* If there was an error, revert the quota charge. */
+		if (error)
+			ext4_xattr_inode_free_quota(inode,
+						    i_size_read(ea_inode));
+		iput(ea_inode);
+	}
 	if (ce)
 		mb_cache_entry_put(ext4_mb_cache, ce);
 	brelse(new_bh);
@@ -1546,6 +2011,117 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+struct ext4_xattr_ea_info {
+	__le64 ref_count;	/* number of xattr entry references */
+	__le32 hash;		/* crc32c hash of xattr data */
+	__le32 reserved;	/* reserved, must be 0 */
+};
+
+static int ext4_xattr_inode_init(handle_t *handle, struct inode *ea_inode,
+				 u32 hash)
+{
+	struct ext4_xattr_ea_info ea_info = {
+		.ref_count = cpu_to_le64(1),
+		.hash = cpu_to_le32(hash),
+		.reserved = 0,
+	};
+	struct ext4_xattr_info i = {
+                .name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+		.value = &ea_info,
+		.value_len = sizeof(ea_info),
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		return err;
+
+	return ext4_xattr_ibody_set(handle, ea_inode, &i, &is);
+}
+
+static int ext4_xattr_update_ea_info(struct inode *ea_inode, int ref_change,
+				     u64 *ref_return, u32 *hash)
+{
+	struct ext4_xattr_ea_info ea_info;
+	struct ext4_xattr_info i = {
+                .name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+		.value = &ea_info,
+		.value_len = sizeof(ea_info),
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		return err;
+
+	if (WARN_ON(is.s.not_found) ||
+	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(ea_info)))
+		return -EFSCORRUPTED;
+
+	memcpy(&ea_info,
+	       ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs),
+	       sizeof(ea_info));
+
+	if (hash)
+		*hash = le32_to_cpu(ea_info.hash);
+
+	*ref_return = le64_to_cpu(ea_info.ref_count) + ref_change;
+	ea_info.ref_count = cpu_to_le64(*ref_return);
+
+	return ext4_xattr_set_entry(&i, &is.s, NULL, ea_inode);
+}
+
+static int ext4_xattr_read_ea_hash(struct inode *ea_inode, u32 *hash)
+{
+	struct ext4_xattr_info i = {
+                .name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_EA_INFO,
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_ea_info *ea_info;
+	void *ptr;
+	int err;
+
+	err = ext4_get_inode_loc(ea_inode, &is.iloc);
+	if (err)
+		return err;
+
+	err = ext4_xattr_ibody_find(ea_inode, &i, &is);
+	if (err)
+		return err;
+
+	if (WARN_ON(is.s.not_found) ||
+	    WARN_ON(le32_to_cpu(is.s.here->e_value_size) != sizeof(*ea_info)))
+		return -EFSCORRUPTED;
+
+	ptr = ((void *)is.s.base) + le16_to_cpu(is.s.here->e_value_offs);
+	ea_info = (struct ext4_xattr_ea_info *)ptr;
+
+	if (WARN_ON(ea_info->reserved != 0))
+		return -EFSCORRUPTED;
+
+	*hash = le32_to_cpu(ea_info->hash);
+	return 0;
+}
+
 static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 				 struct ext4_xattr_info *i)
 {
@@ -1560,6 +2136,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 	return !memcmp(value, i->value, i->value_len);
 }
 
+struct buffer_head *ext4_xattr_get_block(struct inode *inode)
+{
+	struct buffer_head *bh;
+	int error;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return NULL;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		return ERR_PTR(-EIO);
+	error = ext4_xattr_check_block(inode, bh);
+	if (error)
+		return ERR_PTR(error);
+	return bh;
+}
+
 /*
  * ext4_xattr_set_handle()
  *
@@ -1602,9 +2194,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	/* Check journal credits under write lock. */
 	if (ext4_handle_valid(handle)) {
+		struct buffer_head *bh;
 		int credits;
 
-		credits = ext4_xattr_set_credits(inode, value_len);
+		bh = ext4_xattr_get_block(inode);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			goto cleanup;
+		}
+
+		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
 			error = -ENOSPC;
 			goto cleanup;
@@ -1640,6 +2241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
+
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1708,34 +2310,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
-int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
 {
-	struct super_block *sb = inode->i_sb;
-	int credits;
-
-	if (!EXT4_SB(sb)->s_journal)
-		return 0;
-
-	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+	struct buffer_head *bh;
+	int err;
 
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-	        credits += ext4_writepage_trans_blocks(inode) + 1;
+	*credits = 0;
 
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
+	if (!EXT4_SB(inode->i_sb)->s_journal)
+		return 0;
 
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+	down_read(&EXT4_I(inode)->xattr_sem);
 
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	bh = ext4_xattr_get_block(inode);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+	} else {
+		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+		err = 0;
 	}
-	return credits;
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return err;
 }
 
 /*
@@ -1760,7 +2357,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 		return error;
 
 retry:
-	credits = ext4_xattr_set_credits(inode, value_len);
+	error = ext4_xattr_set_credits(inode, value_len, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -2066,10 +2666,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-
 #define EIA_INCR 16 /* must be 2^n */
 #define EIA_MASK (EIA_INCR - 1)
-/* Add the large xattr @inode into @ea_inode_array for later deletion.
+
+/* Add the large xattr @inode into @ea_inode_array for deferred iput().
  * If @ea_inode_array is new or full it will be grown and the old
  * contents copied over.
  */
@@ -2114,21 +2714,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * ext4_xattr_delete_inode()
  *
  * Free extended attribute resources associated with this inode. Traverse
- * all entries and unlink any xattr inodes associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode. If an orphan inode is deleted it will also delete any
- * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
- * to ensure they belong to the parent inode and were not deleted already.
+ * all entries and decrement reference on any xattr inodes associated with this
+ * inode. This is called immediately before an inode is freed. We have exclusive
+ * access to the inode. If an orphan inode is deleted it will also release its
+ * references on xattr block and xattr inodes.
  */
-int
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array,
-			int extra_credits)
+int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc = { .bh = NULL };
+	struct ext4_xattr_entry *entry;
 	int error;
 
 	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2140,66 +2738,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
-		goto delete_external_ea;
-
-	error = ext4_get_inode_loc(inode, &iloc);
-	if (error)
-		goto cleanup;
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 
-	error = ext4_journal_get_write_access(handle, iloc.bh);
-	if (error)
-		goto cleanup;
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
+			goto cleanup;
+		}
 
-	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
-				    false /* block_csum */, ea_inode_array,
-				    extra_credits);
+		error = ext4_journal_get_write_access(handle, iloc.bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "write access (error %d)",
+					 error);
+			goto cleanup;
+		}
 
-delete_external_ea:
-	if (!EXT4_I(inode)->i_file_acl) {
-		error = 0;
-		goto cleanup;
-	}
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh) {
-		EXT4_ERROR_INODE(inode, "block %llu read error",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		EXT4_ERROR_INODE(inode, "bad block %llu",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EFSCORRUPTED;
-		goto cleanup;
+		header = IHDR(inode, ext4_raw_inode(&iloc));
+		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
+						     IFIRST(header),
+						     false /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     false /* skip_quota */);
 	}
 
-	if (ext4_has_feature_ea_inode(inode->i_sb)) {
-		error = ext4_journal_get_write_access(handle, bh);
-		if (error) {
-			EXT4_ERROR_INODE(inode, "write access %llu",
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %llu read error",
 					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		error = ext4_xattr_check_block(inode, bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
+					 EXT4_I(inode)->i_file_acl, error);
 			goto cleanup;
 		}
-		ext4_xattr_inode_remove_all(handle, inode, bh,
-					    BFIRST(bh),
-					    true /* block_csum */,
-					    ea_inode_array,
-					    extra_credits);
-	}
 
-	ext4_xattr_release_block(handle, inode, bh);
-	/* Update i_file_acl within the same transaction that releases block. */
-	EXT4_I(inode)->i_file_acl = 0;
-	error = ext4_mark_inode_dirty(handle, inode);
-	if (error) {
-		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
-				 error);
-		goto cleanup;
+		if (ext4_has_feature_ea_inode(inode->i_sb)) {
+			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+			     entry = EXT4_XATTR_NEXT(entry))
+				if (entry->e_value_inum)
+					ext4_xattr_inode_free_quota(inode,
+					      le32_to_cpu(entry->e_value_size));
+
+		}
+
+		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+					 extra_credits);
+		/*
+		 * Update i_file_acl value in the same transaction that releases
+		 * block.
+		 */
+		EXT4_I(inode)->i_file_acl = 0;
+		error = ext4_mark_inode_dirty(handle, inode);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+					 error);
+			goto cleanup;
+		}
 	}
+	error = 0;
 cleanup:
 	brelse(iloc.bh);
 	brelse(bh);
@@ -2208,17 +2811,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 {
-	struct inode	*ea_inode;
-	int		idx = 0;
+	int idx;
 
 	if (ea_inode_array == NULL)
 		return;
 
-	for (; idx < ea_inode_array->count; ++idx) {
-		ea_inode = ea_inode_array->inodes[idx];
-		clear_nlink(ea_inode);
-		iput(ea_inode);
-	}
+	for (idx = 0; idx < ea_inode_array->count; ++idx)
+		iput(ea_inode_array->inodes[idx]);
 	kfree(ea_inode_array);
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b2005a2716d9..67616cb9a059 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -69,19 +69,6 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
-/*
- * Link EA inode back to parent one using i_mtime field.
- * Extra integer type conversion added to ignore higher
- * bits in i_mtime.tv_sec which might be set by ext4_get()
- */
-#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
-do {                                                  \
-      (inode)->i_mtime.tv_sec = inum;                 \
-} while(0)
-
-#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
-((__u32)(inode)->i_mtime.tv_sec)
-
 /*
  * The minimum size of EA value when you start storing it in an external inode
  * size of block - size of header - size of 1 entry - 4 null bytes
@@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+				  int *credits);
 
-extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 77a5b99d8f92..7dfdca822ccb 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -13,10 +13,11 @@
  * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
- * They use hash of a block contents as a key and block number as a value.
- * That's why keys need not be unique (different xattr blocks may end up having
- * the same hash). However block number always uniquely identifies a cache
- * entry.
+ * Ext4 also uses it for deduplication of xattr values stored in inodes.
+ * They use hash of data as a key and provide a value that may represent a
+ * block or inode number. That's why keys need not be unique (hash of different
+ * data may be the same). However user provided value always uniquely
+ * identifies a cache entry.
  *
  * We provide functions for creation and removal of entries, search by key,
  * and a special "delete entry with given key-value pair" operation. Fixed
-- 
2.13.0.506.g27d5fe0cd-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks
  2017-05-31 16:12     ` Darrick J. Wong
                       ` (2 preceding siblings ...)
  (?)
@ 2017-06-05 22:08     ` Andreas Dilger
  -1 siblings, 0 replies; 100+ messages in thread
From: Andreas Dilger @ 2017-06-05 22:08 UTC (permalink / raw)
  To: Darrick J. Wong, Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, linux-ext4, lkml, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 3467 bytes --]

On May 31, 2017, at 10:12 AM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> 
> On Wed, May 31, 2017 at 01:14:56AM -0700, Tahsin Erdogan wrote:
>> ea_inode contents are treated as metadata, that's why it is journaled
>> during initial writes. Failing to call revoke during freeing could cause
>> user data to be overwritten with original ea_inode contents during journal
>> replay.
>> 
>> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
>> ---
>> fs/ext4/extents.c  | 3 ++-
>> fs/ext4/indirect.c | 3 ++-
>> 2 files changed, 4 insertions(+), 2 deletions(-)
>> 
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index 3e36508610b7..e0a8425ff74d 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
>> 
>> static inline int get_default_free_blocks_flags(struct inode *inode)
>> {
>> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
>> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
>> 		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
>> 	else if (ext4_should_journal_data(inode))
>> 		return EXT4_FREE_BLOCKS_FORGET;
>> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
>> index bc15c2c17633..7ffa290cbb8e 100644
>> --- a/fs/ext4/indirect.c
>> +++ b/fs/ext4/indirect.c
>> @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
>> 	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
>> 	int	err;
>> 
>> -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
>> +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
>> +	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
> 
> I appreciate the thoroughness of doing this even for blockmapped
> ea_inode files, and I'm not complaining about this hunk at all. :)
> 
> However, please consider requiring the extents feature + format as a
> prerequisite for ea_inodes.  ext4 has traditionally been very ...
> permissive about supporting a diverse range of feature options, but the
> cost of that diversity is that the feature support matrix that the
> community has to support is already untestably large.
> 
> I think it would be wise not to support !extents && ea_inode,
> particularly since blockmaps aren't protected by metadata_csum and so in
> the long run it's probably best to minimize the introduction of new
> blockmap files (on ext4 anyway).

Sorry, I have to disagree on this one.

The Lustre code ONLY uses the xattr inode with non-extent (indirect)
mapped inodes on the metadata target.  This is because the MDT only
stores inodes and directories (and a handful of regular files) that
don't benefit from extents at all, but rather are hurt because directory
blocks are typically allocated incrementally over time and result in
fragmented block numbers.  In this case, extents can increase the dir
size by 3x over indirect mapped files without any benefit.

Since the MDT only holds inodes, it never needs to be larger than 16TB
(by default only 2KB/inode gives a max MDT size of 8TB) so there is no
need for more than 2^32 blocks in the filesystem.

Cheers, Andreas

> 
> --D
> 
>> 		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
>> 	else if (ext4_should_journal_data(inode))
>> 		flags |= EXT4_FREE_BLOCKS_FORGET;
>> --
>> 2.13.0.219.gdb65acc882-goog


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH v2 26/28] ext4: cleanup transaction restarts during inode deletion
  2017-05-31  8:15 ` [PATCH 26/28] ext4: cleanup transaction restarts during inode deletion Tahsin Erdogan
@ 2017-06-14 14:17   ` Tahsin Erdogan
  2017-06-15  0:11       ` [Ocfs2-devel] " Andreas Dilger
  0 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-14 14:17 UTC (permalink / raw)
  To: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

During inode deletion, the journal credits that will be needed are hard
to determine, which is why we have journal extend/restart calls in
several places. Whenever a transaction is restarted, the filesystem must
be in a consistent state because there is no atomicity guarantee beyond
a restart call.

Add an ext4_xattr_ensure_credits() helper function that takes care of
the journal extend/restart logic. It also handles getting jbd2 write
access and the dirty metadata calls. This function is called on every
iteration of handling an ea_inode reference.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2: made ext4_xattr_ensure_credits() static

 fs/ext4/inode.c |  66 ++++-----------
 fs/ext4/xattr.c | 257 ++++++++++++++++++++++++++++++++++++--------------------
 fs/ext4/xattr.h |   3 +-
 3 files changed, 183 insertions(+), 143 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cf91532765a4..4d6936f0d8a4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode)
 	 */
 	sb_start_intwrite(inode->i_sb);
 
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
+	if (!IS_NOQUOTA(inode))
+		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				 ext4_blocks_for_truncate(inode)+extra_credits);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode)
 		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
+
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
-
-	/*
-	 * Delete xattr inode before deleting the main inode.
-	 */
-	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
-	if (err) {
-		ext4_warning(inode->i_sb,
-			     "couldn't delete inode's xattr (err %d)", err);
-		goto stop_handle;
-	}
-
-	if (!IS_NOQUOTA(inode))
-		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
-
-	if (!ext4_handle_has_enough_credits(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits)) {
-		err = ext4_journal_extend(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits);
-		if (err > 0)
-			err = ext4_journal_restart(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits);
-		if (err != 0) {
-			ext4_warning(inode->i_sb,
-				     "couldn't extend journal (err %d)", err);
-			goto stop_handle;
-		}
-	}
-
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode)
 		}
 	}
 
-	/*
-	 * ext4_ext_truncate() doesn't reserve any slop when it
-	 * restarts journal transactions; therefore there may not be
-	 * enough credits left in the handle to remove the inode from
-	 * the orphan list and set the dtime field.
-	 */
-	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
-		err = ext4_journal_extend(handle, extra_credits);
-		if (err > 0)
-			err = ext4_journal_restart(handle, extra_credits);
-		if (err != 0) {
-			ext4_warning(inode->i_sb,
-				     "couldn't extend journal (err %d)", err);
-		stop_handle:
-			ext4_journal_stop(handle);
-			ext4_orphan_del(NULL, inode);
-			sb_end_intwrite(inode->i_sb);
-			goto no_delete;
-		}
+	/* Remove xattr references. */
+	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
+				      extra_credits);
+	if (err) {
+		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
+	stop_handle:
+		ext4_journal_stop(handle);
+		ext4_orphan_del(NULL, inode);
+		sb_end_intwrite(inode->i_sb);
+		ext4_xattr_inode_array_free(ea_inode_array);
+		goto no_delete;
 	}
 
 	/*
@@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
-
 	ext4_journal_stop(handle);
 	sb_end_intwrite(inode->i_sb);
 	ext4_xattr_inode_array_free(ea_inode_array);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3ee7e2f68476..abc7d5f84e5f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+static int
+ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
+			struct inode *inode);
+
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -653,6 +657,127 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
+				     int credits, struct buffer_head *bh,
+				     bool dirty, bool block_csum)
+{
+	int error;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	if (handle->h_buffer_credits >= credits)
+		return 0;
+
+	error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
+	if (!error)
+		return 0;
+	if (error < 0) {
+		ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
+		return error;
+	}
+
+	if (bh && dirty) {
+		if (block_csum)
+			ext4_xattr_block_csum_set(inode, bh);
+		error = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (error) {
+			ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+				     error);
+			return error;
+		}
+	}
+
+	error = ext4_journal_restart(handle, credits);
+	if (error) {
+		ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
+		return error;
+	}
+
+	if (bh) {
+		error = ext4_journal_get_write_access(handle, bh);
+		if (error) {
+			ext4_warning(inode->i_sb,
+				     "Get write access failed (error %d)",
+				     error);
+			return error;
+		}
+	}
+	return 0;
+}
+
+static void
+ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
+			    struct buffer_head *bh,
+			    struct ext4_xattr_entry *first, bool block_csum,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	bool dirty = false;
+	unsigned int ea_ino;
+	int err;
+	int credits;
+
+	/* One credit for dec ref on ea_inode, one for orphan list addition, */
+	credits = 2 + extra_credits;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			continue;
+
+		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode,
+					   "Expand inode array err=%d", err);
+			iput(ea_inode);
+			continue;
+		}
+
+		err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
+						dirty, block_csum);
+		if (err) {
+			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
+					   err);
+			continue;
+		}
+
+		inode_lock(ea_inode);
+		clear_nlink(ea_inode);
+		ext4_orphan_add(handle, ea_inode);
+		inode_unlock(ea_inode);
+
+		/*
+		 * Forget about ea_inode within the same transaction that decrements the ref
+		 * count. This avoids duplicate decrements in case the rest of the work
+		 * spills over to subsequent transactions.
+		 */
+		entry->e_value_inum = 0;
+		entry->e_value_size = 0;
+
+		dirty = true;
+	}
+
+	if (dirty) {
+		/*
+		 * Note that we are deliberately skipping csum calculation for
+		 * the final update because we do not expect any journal
+		 * restarts until xattr block is freed.
+		 */
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			ext4_warning_inode(parent,
+					   "handle dirty metadata err=%d", err);
+	}
+}
+
 /*
  * Release the xattr block BH: If the reference count is > 1, decrement it;
  * otherwise free the block.
@@ -1985,42 +2110,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 	return 0;
 }
 
-/**
- * Add xattr inode to orphan list
- */
-static int
-ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
-			    struct ext4_xattr_inode_array *ea_inode_array)
-{
-	int idx = 0, error = 0;
-	struct inode *ea_inode;
-
-	if (ea_inode_array == NULL)
-		return 0;
-
-	for (; idx < ea_inode_array->count; ++idx) {
-		if (!ext4_handle_has_enough_credits(handle, credits)) {
-			error = ext4_journal_extend(handle, credits);
-			if (error > 0)
-				error = ext4_journal_restart(handle, credits);
-
-			if (error != 0) {
-				ext4_warning(inode->i_sb,
-					"couldn't extend journal "
-					"(err %d)", error);
-				return error;
-			}
-		}
-		ea_inode = ea_inode_array->inodes[idx];
-		inode_lock(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
-		/* the inode's i_count will be released by caller */
-	}
-
-	return 0;
-}
-
 /*
  * ext4_xattr_delete_inode()
  *
@@ -2033,16 +2122,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
  */
 int
 ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array)
+			struct ext4_xattr_inode_array **ea_inode_array,
+			int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
-	struct ext4_iloc iloc;
-	struct ext4_xattr_entry *entry;
-	struct inode *ea_inode;
-	unsigned int ea_ino;
-	int credits = 3, error = 0;
+	struct ext4_iloc iloc = { .bh = NULL };
+	int error;
+
+	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
+					  NULL /* bh */,
+					  false /* dirty */,
+					  false /* block_csum */);
+	if (error) {
+		EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
+		goto cleanup;
+	}
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		goto delete_external_ea;
@@ -2050,31 +2146,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
 		goto cleanup;
+
+	error = ext4_journal_get_write_access(handle, iloc.bh);
+	if (error)
+		goto cleanup;
+
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
-	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
-	     entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_inum)
-			continue;
-		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-		if (error)
-			continue;
-		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (error) {
-			iput(ea_inode);
-			brelse(iloc.bh);
-			goto cleanup;
-		}
-		entry->e_value_inum = 0;
-	}
-	brelse(iloc.bh);
+	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
+				    false /* block_csum */, ea_inode_array,
+				    extra_credits);
 
 delete_external_ea:
 	if (!EXT4_I(inode)->i_file_acl) {
-		/* add xattr inode to orphan list */
-		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-						    *ea_inode_array);
+		error = 0;
 		goto cleanup;
 	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
@@ -2092,46 +2177,32 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
-	     entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_inum)
-			continue;
-		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-		if (error)
-			continue;
-		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (error)
-			goto cleanup;
-		entry->e_value_inum = 0;
-	}
-
-	/* add xattr inode to orphan list */
-	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-					*ea_inode_array);
-	if (error)
-		goto cleanup;
-
-	if (!IS_NOQUOTA(inode))
-		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
-
-	if (!ext4_handle_has_enough_credits(handle, credits)) {
-		error = ext4_journal_extend(handle, credits);
-		if (error > 0)
-			error = ext4_journal_restart(handle, credits);
+	if (ext4_has_feature_ea_inode(inode->i_sb)) {
+		error = ext4_journal_get_write_access(handle, bh);
 		if (error) {
-			ext4_warning(inode->i_sb,
-				"couldn't extend journal (err %d)", error);
+			EXT4_ERROR_INODE(inode, "write access %llu",
+					 EXT4_I(inode)->i_file_acl);
 			goto cleanup;
 		}
+		ext4_xattr_inode_remove_all(handle, inode, bh,
+					    BFIRST(bh),
+					    true /* block_csum */,
+					    ea_inode_array,
+					    extra_credits);
 	}
 
 	ext4_xattr_release_block(handle, inode, bh);
+	/* Update i_file_acl within the same transaction that releases block. */
 	EXT4_I(inode)->i_file_acl = 0;
-
+	error = ext4_mark_inode_dirty(handle, inode);
+	if (error) {
+		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+				 error);
+		goto cleanup;
+	}
 cleanup:
+	brelse(iloc.bh);
 	brelse(bh);
-
 	return error;
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index adf761518a73..b2005a2716d9 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
 
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_inode_array **array);
+				   struct ext4_xattr_inode_array **array,
+				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
-- 
2.13.1.508.gb3defc5cc-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v4 27/28] ext4: xattr inode deduplication
  2017-06-02 23:35             ` [PATCH v3 " Tahsin Erdogan
@ 2017-06-14 14:34               ` Tahsin Erdogan
  2017-06-14 23:26                 ` Andreas Dilger
  0 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-14 14:34 UTC (permalink / raw)
  To: Darrick J . Wong, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4
  Cc: linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel, Tahsin Erdogan

Ext4 now supports xattr values that are up to 64k in size (vfs limit).
Large xattr values are stored in external inodes, each one holding a
single value. Once written, the data blocks of these inodes are immutable.

Real-world use cases are expected to have a lot of value duplication,
such as inherited ACLs. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a
best-effort sharing scheme. When an xattr inode is read from disk (i.e.
getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal, the ref count is decremented, and if it reaches zero, the inode
is deleted.

The quota charging for such inodes is manually managed. Every reference
holder is charged the full size as if there was no sharing happening.
This is consistent with how xattr blocks are also charged.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v4:
 - eliminated xattr entry in the xattr inode to avoid complexity and
   recursion in xattr update path. Now the ref count and hash are stored
   in i_[c/m/a]time.tv_sec fields.
 - some clean up in ext4_xattr_set_entry() to reduce code duplication and
   complexity

v3:
 - use s_csum_seed for hash calculations when available
 - return error on stored vs calculated hash mismatch
 
v2:
 - make dependency on crc32c dynamic
 - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
   they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver

 fs/ext4/acl.c   |    5 +-
 fs/ext4/ext4.h  |   22 +-
 fs/ext4/inode.c |    9 +-
 fs/ext4/super.c |   25 +-
 fs/ext4/xattr.c | 1000 +++++++++++++++++++++++++++++++++++++++++--------------
 fs/ext4/xattr.h |   17 +-
 fs/mbcache.c    |    9 +-
 7 files changed, 806 insertions(+), 281 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 74f7ac539e00..8db03e5c78bc 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (error)
 		return error;
 retry:
-	credits = ext4_xattr_set_credits(inode, acl_size);
+	error = ext4_xattr_set_credits(inode, acl_size, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d79d8d7bee88..7ceb1f81e4b8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1517,6 +1517,7 @@ struct ext4_sb_info {
 	long s_es_nr_inode;
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
+	struct mb_cache *s_ea_inode_cache;
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
@@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
 	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
 }
 
-#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+	return IS_NOQUOTA(inode) &&
+	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
 
 /*
  * This structure is stuffed into the struct file's private_data field
@@ -2709,19 +2714,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
 extern int ext4_register_li_request(struct super_block *sb,
 				    ext4_group_t first_not_zeroed);
 
-static inline int ext4_has_group_desc_csum(struct super_block *sb)
-{
-	return ext4_has_feature_gdt_csum(sb) ||
-	       EXT4_SB(sb)->s_chksum_driver != NULL;
-}
-
 static inline int ext4_has_metadata_csum(struct super_block *sb)
 {
 	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
 		     !EXT4_SB(sb)->s_chksum_driver);
 
-	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+	return ext4_has_feature_metadata_csum(sb) &&
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+}
+
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d6936f0d8a4..6f5872197d6c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4843,8 +4843,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-	if (ei->i_flags & EXT4_EA_INODE_FL)
+
+	if (ei->i_flags & EXT4_EA_INODE_FL) {
 		ext4_xattr_inode_set_class(inode);
+
+		inode_lock(inode);
+		inode->i_flags |= S_NOQUOTA;
+		inode_unlock(inode);
+	}
+
 	unlock_new_inode(inode);
 	return inode;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b02a23ec92ca..9fcd29e21dc7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
@@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	credits = ext4_xattr_set_credits(inode, len);
+	res = ext4_xattr_set_credits(inode, len, &credits);
+	if (res)
+		return res;
+
 	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Load the checksum driver */
-	if (ext4_has_feature_metadata_csum(sb)) {
+	if (ext4_has_feature_metadata_csum(sb) ||
+	    ext4_has_feature_ea_inode(sb)) {
 		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(sbi->s_chksum_driver)) {
 			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_ea_inode(sb)) {
+		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+		if (!sbi->s_ea_inode_cache) {
+			ext4_msg(sb, KERN_ERR,
+				 "Failed to create an s_ea_inode_cache");
+			goto failed_mount_wq;
+		}
+	}
+
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
@@ -4296,6 +4313,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index abc7d5f84e5f..2f9bcafd9aed 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_ea_inode_cache)
+
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
@@ -280,6 +283,34 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 	return cmp ? -ENODATA : 0;
 }
 
+static u32
+ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
+{
+	return ext4_chksum(sbi, sbi->s_csum_seed ?: ~0, buffer, size);
+}
+
+static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
+{
+	return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+	       ((u32)ea_inode->i_mtime.tv_sec);
+}
+
+static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
+{
+	ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+	ea_inode->i_mtime.tv_sec = (u32)ref_count;
+}
+
+static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
+{
+	return (u32)ea_inode->i_atime.tv_sec;
+}
+
+static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
+{
+	ea_inode->i_atime.tv_sec = hash;
+}
+
 /*
  * Read the EA value from an inode.
  */
@@ -289,6 +320,8 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 	struct buffer_head *bh = NULL;
 	int blocksize = ea_inode->i_sb->s_blocksize;
 	size_t csize, copied = 0;
+	void *copy_pos = buf;
+	u32 calc_hash, stored_hash;
 
 	while (copied < size) {
 		csize = (size - copied) > blocksize ? blocksize : size - copied;
@@ -298,13 +331,24 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 		if (!bh)
 			return -EFSCORRUPTED;
 
-		memcpy(buf, bh->b_data, csize);
+		memcpy(copy_pos, bh->b_data, csize);
 		brelse(bh);
 
-		buf += csize;
+		copy_pos += csize;
 		block += 1;
 		copied += csize;
 	}
+
+	calc_hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buf, size);
+
+	/* Verify stored hash matches calculated hash. */
+	stored_hash = ext4_xattr_inode_get_hash(ea_inode);
+	if (calc_hash != stored_hash) {
+		ext4_warning_inode(ea_inode,
+			"EA inode calc_hash=%#x does not match stored_hash=%#x",
+			calc_hash, stored_hash);
+		return -EFSCORRUPTED;
+	}
 	return 0;
 }
 
@@ -329,14 +373,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 		goto error;
 	}
 
-	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
-	    inode->i_generation != parent->i_generation) {
-		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
-			   "to parent is invalid.", ea_ino);
-		err = -EINVAL;
-		goto error;
-	}
-
 	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
 		ext4_error(parent->i_sb, "EA inode %lu does not have "
 			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
@@ -358,17 +394,34 @@ static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
 		     size_t size)
 {
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
 	struct inode *ea_inode;
-	int ret;
+	u32 hash;
+	int err;
 
-	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (ret)
-		return ret;
+	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+	if (err) {
+		ea_inode = NULL;
+		goto out;
+	}
 
-	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
-	iput(ea_inode);
+	if (i_size_read(ea_inode) != size) {
+		ext4_warning_inode(ea_inode,
+				   "ea_inode file size=%llu entry size=%zu",
+				   i_size_read(ea_inode), size);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
 
-	return ret;
+	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	if (!err) {
+		hash = ext4_xattr_inode_get_hash(ea_inode);
+		mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+				      ea_inode->i_ino, true /* reusable */);
+	}
+out:
+	iput(ea_inode);
+	return err;
 }
 
 static int
@@ -657,6 +710,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+static inline size_t round_up_cluster(struct inode *inode, size_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
+				    inode->i_blkbits);
+	size_t mask = ~(cluster_size - 1);
+
+	return (length + cluster_size - 1) & mask;
+}
+
+static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
+{
+	int err;
+
+	err = dquot_alloc_inode(inode);
+	if (err)
+		return err;
+	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
+	if (err)
+		dquot_free_inode(inode);
+	return err;
+}
+
+static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+{
+	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
+	dquot_free_inode(inode);
+}
+
+static int __ext4_xattr_set_credits(struct super_block *sb,
+				    struct buffer_head *block_bh,
+				    size_t value_len)
+{
+	int credits;
+	int blocks;
+
+	/*
+	 * 1) Owner inode update
+	 * 2) Ref count update on old xattr block
+	 * 3) new xattr block
+	 * 4) block bitmap update for new xattr block
+	 * 5) group descriptor for new xattr block
+	 */
+	credits = 5;
+
+	/* We are done if ea_inode feature is not enabled. */
+	if (!ext4_has_feature_ea_inode(sb))
+		return credits;
+
+	/* New ea_inode, inode map, block bitmap, group descriptor. */
+	credits += 4;
+
+	/* Data blocks. */
+	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	/* Indirection block. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Blocks themselves. */
+	credits += blocks;
+
+	/* Dereference ea_inode holding old xattr value.
+	 * Old ea_inode, inode map, block bitmap, group descriptor.
+	 */
+	credits += 4;
+
+	/* Data blocks for old ea_inode. */
+	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
+
+	/* Indirection block for old ea_inode. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Quota updates. */
+	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
+
+	/* We may need to clone the existing xattr block in which case we need
+	 * to increment ref counts for existing ea_inodes referenced by it.
+	 */
+	if (block_bh) {
+		struct ext4_xattr_entry *entry = BFIRST(block_bh);
+
+		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				/* Ref count update on ea_inode. */
+				credits += 1;
+	}
+	return credits;
+}
+
 static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 				     int credits, struct buffer_head *bh,
 				     bool dirty, bool block_csum)
@@ -706,12 +854,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+				       int ref_change)
+{
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+	struct ext4_iloc iloc;
+	s64 ref_count;
+	u32 hash;
+	int ret;
+
+	inode_lock(ea_inode);
+
+	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
+	if (ret) {
+		iloc.bh = NULL;
+		goto out;
+	}
+
+	ref_count = ext4_xattr_inode_get_ref(ea_inode);
+	ref_count += ref_change;
+	ext4_xattr_inode_set_ref(ea_inode, ref_count);
+
+	if (ref_change > 0) {
+		WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
+			  ea_inode->i_ino, ref_count);
+
+		if (ref_count == 1) {
+			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			set_nlink(ea_inode, 1);
+			ext4_orphan_del(handle, ea_inode);
+
+			hash = ext4_xattr_inode_get_hash(ea_inode);
+			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+					      ea_inode->i_ino,
+					      true /* reusable */);
+		}
+	} else {
+		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
+			  ea_inode->i_ino, ref_count);
+
+		if (ref_count == 0) {
+			WARN_ONCE(ea_inode->i_nlink != 1,
+				  "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			clear_nlink(ea_inode);
+			ext4_orphan_add(handle, ea_inode);
+
+			hash = ext4_xattr_inode_get_hash(ea_inode);
+			mb_cache_entry_delete(ea_inode_cache, hash,
+					      ea_inode->i_ino);
+		}
+	}
+
+	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
+	iloc.bh = NULL;
+	if (ret)
+		ext4_warning_inode(ea_inode,
+				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
+out:
+	brelse(iloc.bh);
+	inode_unlock(ea_inode);
+	return ret;
+}
+
+static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
+}
+
+static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
+}
+
+static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
+					struct ext4_xattr_entry *first)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_entry *failed_entry;
+	unsigned int ea_ino;
+	int err, saved_err;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			goto cleanup;
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "inc ref error %d", err);
+			iput(ea_inode);
+			goto cleanup;
+		}
+		iput(ea_inode);
+	}
+	return 0;
+
+cleanup:
+	saved_err = err;
+	failed_entry = entry;
+
+	for (entry = first; entry != failed_entry;
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err) {
+			ext4_warning(parent->i_sb,
+				     "cleanup ea_ino %u iget error %d", ea_ino,
+				     err);
+			continue;
+		}
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err)
+			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
+					   err);
+		iput(ea_inode);
+	}
+	return saved_err;
+}
+
 static void
-ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
-			    struct buffer_head *bh,
-			    struct ext4_xattr_entry *first, bool block_csum,
-			    struct ext4_xattr_inode_array **ea_inode_array,
-			    int extra_credits)
+ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
+			     struct buffer_head *bh,
+			     struct ext4_xattr_entry *first, bool block_csum,
+			     struct ext4_xattr_inode_array **ea_inode_array,
+			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
@@ -748,10 +1024,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
 			continue;
 		}
 
-		inode_lock(ea_inode);
-		clear_nlink(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
+					   err);
+			continue;
+		}
+
+		if (!skip_quota)
+			ext4_xattr_inode_free_quota(parent,
+					      le32_to_cpu(entry->e_value_size));
 
 		/*
 		 * Forget about ea_inode within the same transaction that decrements the ref
@@ -784,7 +1066,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
  */
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
-			 struct buffer_head *bh)
+			 struct buffer_head *bh,
+			 struct ext4_xattr_inode_array **ea_inode_array,
+			 int extra_credits)
 {
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 	u32 hash, ref;
@@ -807,6 +1091,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
+
+		if (ext4_has_feature_ea_inode(inode->i_sb))
+			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
+						     BFIRST(bh),
+						     true /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -947,7 +1239,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
  * Create an inode to store the value of a large EA.
  */
 static struct inode *ext4_xattr_inode_create(handle_t *handle,
-					     struct inode *inode)
+					     struct inode *inode, u32 hash)
 {
 	struct inode *ea_inode = NULL;
 	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
@@ -965,67 +1257,121 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		ea_inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(ea_inode);
 		ext4_xattr_inode_set_class(ea_inode);
-		ea_inode->i_generation = inode->i_generation;
-		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
-
-		/*
-		 * A back-pointer from EA inode to parent inode will be useful
-		 * for e2fsck.
-		 */
-		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
 		unlock_new_inode(ea_inode);
-		err = ext4_inode_attach_jinode(ea_inode);
+		ext4_xattr_inode_set_ref(ea_inode, 1);
+		ext4_xattr_inode_set_hash(ea_inode, hash);
+		err = ext4_mark_inode_dirty(handle, ea_inode);
+		if (!err)
+			err = ext4_inode_attach_jinode(ea_inode);
 		if (err) {
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
+
+		/*
+		 * Xattr inodes are shared therefore quota charging is performed
+		 * at a higher level.
+		 */
+		dquot_free_inode(ea_inode);
+		dquot_drop(ea_inode);
+		inode_lock(ea_inode);
+		ea_inode->i_flags |= S_NOQUOTA;
+		inode_unlock(ea_inode);
 	}
 
 	return ea_inode;
 }
 
-/*
- * Unlink the inode storing the value of the EA.
- */
-int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+static struct inode *
+ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
+			    size_t value_len, u32 hash)
 {
-	struct inode *ea_inode = NULL;
+	struct inode *ea_inode;
+	struct mb_cache_entry *ce;
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+	void *ea_data = NULL;
 	int err;
 
-	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (err)
-		return err;
+	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
+	while (ce) {
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		if (IS_ERR(ea_inode)) {
+			ea_inode = NULL;
+			goto next;
+		}
 
-	clear_nlink(ea_inode);
-	iput(ea_inode);
+		if (is_bad_inode(ea_inode) ||
+		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
+		    i_size_read(ea_inode) != value_len)
+			goto next;
 
-	return 0;
+		if (!ea_data)
+			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+
+		if (!ea_data) {
+			iput(ea_inode);
+			return NULL;
+		}
+
+		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
+		if (unlikely(err))
+			goto next;
+
+		if (!memcmp(value, ea_data, value_len)) {
+			mb_cache_entry_touch(ea_inode_cache, ce);
+			mb_cache_entry_put(ea_inode_cache, ce);
+			kvfree(ea_data);
+			return ea_inode;
+		}
+next:
+		iput(ea_inode);
+		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
+	}
+	kvfree(ea_data);
+	return NULL;
 }
 
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
-				unsigned long *ea_ino, const void *value,
-				size_t value_len)
+static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+					  const void *value, size_t value_len,
+					  struct inode **ret_inode)
 {
 	struct inode *ea_inode;
+	u32 hash;
 	int err;
 
+	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
+	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
+	if (ea_inode) {
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			iput(ea_inode);
+			return err;
+		}
+
+		*ret_inode = ea_inode;
+		return 0;
+	}
+
 	/* Create an inode for the EA value */
-	ea_inode = ext4_xattr_inode_create(handle, inode);
+	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
 	if (IS_ERR(ea_inode))
 		return PTR_ERR(ea_inode);
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
-	if (err)
-		clear_nlink(ea_inode);
-	else
-		*ea_ino = ea_inode->i_ino;
+	if (err) {
+		ext4_xattr_inode_dec_ref(handle, ea_inode);
+		iput(ea_inode);
+		return err;
+	}
 
-	iput(ea_inode);
+	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+			      ea_inode->i_ino, true /* reusable */);
 
-	return err;
+	*ret_inode = ea_inode;
+	return 0;
 }
 
 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
@@ -1033,9 +1379,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
-	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+	struct ext4_xattr_entry *here = s->here;
+	size_t min_offs = s->end - s->base, name_len = strlen(i->name);
 	int in_inode = i->in_inode;
-	int rc;
+	struct inode *old_ea_inode = NULL;
+	struct inode *new_ea_inode = NULL;
+	size_t old_size, new_size;
+	int ret;
+
+	/* Space used by old and new values. */
+	old_size = (!s->not_found && !here->e_value_inum) ?
+			EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
+	new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
+
+	/*
+	 * Optimization for the simple case when old and new values have the
+	 * same padded sizes. Not applicable if external inodes are involved.
+	 */
+	if (new_size && new_size == old_size) {
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+
+		here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value == EXT4_ZERO_XATTR_VALUE) {
+			memset(val, 0, new_size);
+		} else {
+			memcpy(val, i->value, i->value_len);
+			/* Clear padding bytes. */
+			memset(val + i->value_len, 0, new_size - i->value_len);
+		}
+		return 0;
+	}
 
 	/* Compute min_offs and last. */
 	last = s->first;
@@ -1046,122 +1420,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				min_offs = offs;
 		}
 	}
-	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
-	if (!s->not_found) {
-		if (!in_inode &&
-		    !s->here->e_value_inum && s->here->e_value_size) {
-			size_t size = le32_to_cpu(s->here->e_value_size);
-			free += EXT4_XATTR_SIZE(size);
-		}
-		free += EXT4_XATTR_LEN(name_len);
-	}
+
+	/* Check whether we have enough space. */
 	if (i->value) {
-		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+		size_t free;
 
-		if (in_inode)
-			value_len = 0;
+		free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+		if (!s->not_found)
+			free += EXT4_XATTR_LEN(name_len) + old_size;
 
-		if (free < EXT4_XATTR_LEN(name_len) + value_len)
-			return -ENOSPC;
+		if (free < EXT4_XATTR_LEN(name_len) + new_size) {
+			ret = -ENOSPC;
+			goto out;
+		}
 	}
 
-	if (i->value && s->not_found) {
-		/* Insert the new name. */
-		size_t size = EXT4_XATTR_LEN(name_len);
-		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
-		memmove((void *)s->here + size, s->here, rest);
-		memset(s->here, 0, size);
-		s->here->e_name_index = i->name_index;
-		s->here->e_name_len = name_len;
-		memcpy(s->here->e_name, i->name, name_len);
-	} else {
-		if (!s->here->e_value_inum && s->here->e_value_size &&
-		    s->here->e_value_offs > 0) {
-			void *first_val = s->base + min_offs;
-			size_t offs = le16_to_cpu(s->here->e_value_offs);
-			void *val = s->base + offs;
-			size_t size = EXT4_XATTR_SIZE(
-				le32_to_cpu(s->here->e_value_size));
-
-			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
-				/* The old and the new value have the same
-				   size. Just replace. */
-				s->here->e_value_size =
-					cpu_to_le32(i->value_len);
-				if (i->value == EXT4_ZERO_XATTR_VALUE) {
-					memset(val, 0, size);
-				} else {
-					/* Clear pad bytes first. */
-					memset(val + size - EXT4_XATTR_PAD, 0,
-					       EXT4_XATTR_PAD);
-					memcpy(val, i->value, i->value_len);
-				}
-				return 0;
-			}
+	/*
+	 * Getting access to old and new ea inodes is subject to failures.
+	 * Finish that work before doing any modifications to the xattr data.
+	 */
+	if (!s->not_found && here->e_value_inum) {
+		ret = ext4_xattr_inode_iget(inode,
+					    le32_to_cpu(here->e_value_inum),
+					    &old_ea_inode);
+		if (ret) {
+			old_ea_inode = NULL;
+			goto out;
+		}
+	}
+	if (i->value && in_inode) {
+		WARN_ON_ONCE(!i->value_len);
 
-			/* Remove the old value. */
-			memmove(first_val + size, first_val, val - first_val);
-			memset(first_val, 0, size);
-			s->here->e_value_size = 0;
-			s->here->e_value_offs = 0;
-			min_offs += size;
-
-			/* Adjust all value offsets. */
-			last = s->first;
-			while (!IS_LAST_ENTRY(last)) {
-				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_inum &&
-				    last->e_value_size && o < offs)
-					last->e_value_offs =
-						cpu_to_le16(o + size);
-				last = EXT4_XATTR_NEXT(last);
-			}
+		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
+		if (ret)
+			goto out;
+
+		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
+						     i->value_len,
+						     &new_ea_inode);
+		if (ret) {
+			new_ea_inode = NULL;
+			ext4_xattr_inode_free_quota(inode, i->value_len);
+			goto out;
 		}
-		if (s->here->e_value_inum) {
-			ext4_xattr_inode_unlink(inode,
-					    le32_to_cpu(s->here->e_value_inum));
-			s->here->e_value_inum = 0;
+	}
+
+	if (old_ea_inode) {
+		/* We are ready to release ref count on the old_ea_inode. */
+		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
+		if (ret) {
+			/* Release newly required ref count on new_ea_inode. */
+			if (new_ea_inode) {
+				int err;
+
+				err = ext4_xattr_inode_dec_ref(handle,
+							       new_ea_inode);
+				if (err)
+					ext4_warning_inode(new_ea_inode,
+						  "dec ref new_ea_inode err=%d",
+						  err);
+				ext4_xattr_inode_free_quota(inode,
+							    i->value_len);
+			}
+			goto out;
 		}
-		if (!i->value) {
-			/* Remove the old name. */
-			size_t size = EXT4_XATTR_LEN(name_len);
-			last = ENTRY((void *)last - size);
-			memmove(s->here, (void *)s->here + size,
-				(void *)last - (void *)s->here + sizeof(__u32));
-			memset(last, 0, size);
+
+		ext4_xattr_inode_free_quota(inode,
+					    le32_to_cpu(here->e_value_size));
+	}
+
+	/* No failures allowed past this point. */
+
+	if (!s->not_found && here->e_value_offs) {
+		/* Remove the old value. */
+		void *first_val = s->base + min_offs;
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+
+		memmove(first_val + old_size, first_val, val - first_val);
+		memset(first_val, 0, old_size);
+		min_offs += old_size;
+
+		/* Adjust all value offsets. */
+		last = s->first;
+		while (!IS_LAST_ENTRY(last)) {
+			size_t o = le16_to_cpu(last->e_value_offs);
+
+			if (!last->e_value_inum &&
+			    last->e_value_size && o < offs)
+				last->e_value_offs = cpu_to_le16(o + old_size);
+			last = EXT4_XATTR_NEXT(last);
 		}
 	}
 
+	if (!i->value) {
+		/* Remove old name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+
+		last = ENTRY((void *)last - size);
+		memmove(here, (void *)here + size,
+			(void *)last - (void *)here + sizeof(__u32));
+		memset(last, 0, size);
+	} else if (s->not_found) {
+		/* Insert new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)here + sizeof(__u32);
+
+		memmove((void *)here + size, here, rest);
+		memset(here, 0, size);
+		here->e_name_index = i->name_index;
+		here->e_name_len = name_len;
+		memcpy(here->e_name, i->name, name_len);
+	} else {
+		/* This is an update, reset value info. */
+		here->e_value_inum = 0;
+		here->e_value_offs = 0;
+		here->e_value_size = 0;
+	}
+
 	if (i->value) {
-		/* Insert the new value. */
+		/* Insert new value. */
 		if (in_inode) {
-			unsigned long ea_ino =
-				le32_to_cpu(s->here->e_value_inum);
-			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
-						  i->value, i->value_len);
-			if (rc)
-				goto out;
-			s->here->e_value_inum = cpu_to_le32(ea_ino);
-			s->here->e_value_offs = 0;
+			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
 		} else if (i->value_len) {
-			size_t size = EXT4_XATTR_SIZE(i->value_len);
-			void *val = s->base + min_offs - size;
-			s->here->e_value_offs = cpu_to_le16(min_offs - size);
-			s->here->e_value_inum = 0;
+			void *val = s->base + min_offs - new_size;
+
+			here->e_value_offs = cpu_to_le16(min_offs - new_size);
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
-				memset(val, 0, size);
+				memset(val, 0, new_size);
 			} else {
-				/* Clear the pad bytes first. */
-				memset(val + size - EXT4_XATTR_PAD, 0,
-				       EXT4_XATTR_PAD);
 				memcpy(val, i->value, i->value_len);
+				/* Clear padding bytes. */
+				memset(val + i->value_len, 0,
+				       new_size - i->value_len);
 			}
 		}
-		s->here->e_value_size = cpu_to_le32(i->value_len);
+		here->e_value_size = cpu_to_le32(i->value_len);
 	}
-
+	ret = 0;
 out:
-	return rc;
+	iput(old_ea_inode);
+	iput(new_ea_inode);
+	return ret;
 }
 
 struct ext4_xattr_block_find {
@@ -1223,6 +1623,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+	struct inode *ea_inode = NULL;
+	size_t old_ea_inode_size = 0;
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
@@ -1277,6 +1679,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			header(s->base)->h_refcount = cpu_to_le32(1);
 			s->here = ENTRY(s->base + offset);
 			s->end = s->base + bs->bh->b_size;
+
+			/*
+			 * If existing entry points to an xattr inode, we need
+			 * to prevent ext4_xattr_set_entry() from decrementing
+			 * ref count on it because the reference belongs to the
+			 * original block. In this case, make the entry look
+			 * like it has an empty value.
+			 */
+			if (!s->not_found && s->here->e_value_inum) {
+				/*
+				 * Defer quota free call for previous inode
+				 * until success is guaranteed.
+				 */
+				old_ea_inode_size = le32_to_cpu(
+							s->here->e_value_size);
+				s->here->e_value_inum = 0;
+				s->here->e_value_size = 0;
+			}
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
@@ -1298,6 +1718,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		goto bad_block;
 	if (error)
 		goto cleanup;
+
+	if (i->value && s->here->e_value_inum) {
+		unsigned int ea_ino;
+
+		/*
+		 * A ref count on ea_inode has been taken as part of the call to
+		 * ext4_xattr_set_entry() above. We would like to drop this
+		 * extra ref but we have to wait until the xattr block is
+		 * initialized and has its own ref count on the ea_inode.
+		 */
+		ea_ino = le32_to_cpu(s->here->e_value_inum);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error) {
+			ea_inode = NULL;
+			goto cleanup;
+		}
+	}
+
 	if (!IS_LAST_ENTRY(s->first))
 		ext4_xattr_rehash(header(s->base), s->here);
 
@@ -1408,6 +1846,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						 EXT4_FREE_BLOCKS_METADATA);
 				goto cleanup;
 			}
+			error = ext4_xattr_inode_inc_ref_all(handle, inode,
+						      ENTRY(header(s->base)+1));
+			if (error)
+				goto getblk_failed;
+			if (ea_inode) {
+				/* Drop the extra ref on ea_inode. */
+				error = ext4_xattr_inode_dec_ref(handle,
+								 ea_inode);
+				if (error)
+					ext4_warning_inode(ea_inode,
+							   "dec ref error=%d",
+							   error);
+				iput(ea_inode);
+				ea_inode = NULL;
+			}
+
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, new_bh);
 			if (error) {
@@ -1427,15 +1881,38 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if (old_ea_inode_size)
+		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+
 	/* Update the inode. */
 	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 
 	/* Drop the previous xattr block. */
-	if (bs->bh && bs->bh != new_bh)
-		ext4_xattr_release_block(handle, inode, bs->bh);
+	if (bs->bh && bs->bh != new_bh) {
+		struct ext4_xattr_inode_array *ea_inode_array = NULL;
+
+		ext4_xattr_release_block(handle, inode, bs->bh,
+					 &ea_inode_array,
+					 0 /* extra_credits */);
+		ext4_xattr_inode_array_free(ea_inode_array);
+	}
 	error = 0;
 
 cleanup:
+	if (ea_inode) {
+		int error2;
+
+		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (error2)
+			ext4_warning_inode(ea_inode, "dec ref error=%d",
+					   error2);
+
+		/* If there was an error, revert the quota charge. */
+		if (error)
+			ext4_xattr_inode_free_quota(inode,
+						    i_size_read(ea_inode));
+		iput(ea_inode);
+	}
 	if (ce)
 		mb_cache_entry_put(ext4_mb_cache, ce);
 	brelse(new_bh);
@@ -1560,6 +2037,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 	return !memcmp(value, i->value, i->value_len);
 }
 
+static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
+{
+	struct buffer_head *bh;
+	int error;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return NULL;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		return ERR_PTR(-EIO);
+	error = ext4_xattr_check_block(inode, bh);
+	if (error)
+		return ERR_PTR(error);
+	return bh;
+}
+
 /*
  * ext4_xattr_set_handle()
  *
@@ -1602,9 +2095,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	/* Check journal credits under write lock. */
 	if (ext4_handle_valid(handle)) {
+		struct buffer_head *bh;
 		int credits;
 
-		credits = ext4_xattr_set_credits(inode, value_len);
+		bh = ext4_xattr_get_block(inode);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			goto cleanup;
+		}
+
+		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
 			error = -ENOSPC;
 			goto cleanup;
@@ -1640,6 +2142,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
+
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1708,34 +2211,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
-int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
 {
-	struct super_block *sb = inode->i_sb;
-	int credits;
-
-	if (!EXT4_SB(sb)->s_journal)
-		return 0;
-
-	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+	struct buffer_head *bh;
+	int err;
 
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-	        credits += ext4_writepage_trans_blocks(inode) + 1;
+	*credits = 0;
 
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
+	if (!EXT4_SB(inode->i_sb)->s_journal)
+		return 0;
 
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+	down_read(&EXT4_I(inode)->xattr_sem);
 
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	bh = ext4_xattr_get_block(inode);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+	} else {
+		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+		err = 0;
 	}
-	return credits;
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return err;
 }
 
 /*
@@ -1760,7 +2258,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 		return error;
 
 retry:
-	credits = ext4_xattr_set_credits(inode, value_len);
+	error = ext4_xattr_set_credits(inode, value_len, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -2066,10 +2567,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-
 #define EIA_INCR 16 /* must be 2^n */
 #define EIA_MASK (EIA_INCR - 1)
-/* Add the large xattr @inode into @ea_inode_array for later deletion.
+
+/* Add the large xattr @inode into @ea_inode_array for deferred iput().
  * If @ea_inode_array is new or full it will be grown and the old
  * contents copied over.
  */
@@ -2114,21 +2615,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * ext4_xattr_delete_inode()
  *
  * Free extended attribute resources associated with this inode. Traverse
- * all entries and unlink any xattr inodes associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode. If an orphan inode is deleted it will also delete any
- * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
- * to ensure they belong to the parent inode and were not deleted already.
+ * all entries and decrement reference on any xattr inodes associated with this
+ * inode. This is called immediately before an inode is freed. We have exclusive
+ * access to the inode. If an orphan inode is deleted it will also release its
+ * references on xattr block and xattr inodes.
  */
-int
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array,
-			int extra_credits)
+int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc = { .bh = NULL };
+	struct ext4_xattr_entry *entry;
 	int error;
 
 	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2140,66 +2639,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
-		goto delete_external_ea;
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 
-	error = ext4_get_inode_loc(inode, &iloc);
-	if (error)
-		goto cleanup;
-
-	error = ext4_journal_get_write_access(handle, iloc.bh);
-	if (error)
-		goto cleanup;
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
+			goto cleanup;
+		}
 
-	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
-				    false /* block_csum */, ea_inode_array,
-				    extra_credits);
+		error = ext4_journal_get_write_access(handle, iloc.bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "write access (error %d)",
+					 error);
+			goto cleanup;
+		}
 
-delete_external_ea:
-	if (!EXT4_I(inode)->i_file_acl) {
-		error = 0;
-		goto cleanup;
-	}
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh) {
-		EXT4_ERROR_INODE(inode, "block %llu read error",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		EXT4_ERROR_INODE(inode, "bad block %llu",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EFSCORRUPTED;
-		goto cleanup;
+		header = IHDR(inode, ext4_raw_inode(&iloc));
+		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
+						     IFIRST(header),
+						     false /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     false /* skip_quota */);
 	}
 
-	if (ext4_has_feature_ea_inode(inode->i_sb)) {
-		error = ext4_journal_get_write_access(handle, bh);
-		if (error) {
-			EXT4_ERROR_INODE(inode, "write access %llu",
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %llu read error",
 					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		error = ext4_xattr_check_block(inode, bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
+					 EXT4_I(inode)->i_file_acl, error);
 			goto cleanup;
 		}
-		ext4_xattr_inode_remove_all(handle, inode, bh,
-					    BFIRST(bh),
-					    true /* block_csum */,
-					    ea_inode_array,
-					    extra_credits);
-	}
 
-	ext4_xattr_release_block(handle, inode, bh);
-	/* Update i_file_acl within the same transaction that releases block. */
-	EXT4_I(inode)->i_file_acl = 0;
-	error = ext4_mark_inode_dirty(handle, inode);
-	if (error) {
-		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
-				 error);
-		goto cleanup;
+		if (ext4_has_feature_ea_inode(inode->i_sb)) {
+			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+			     entry = EXT4_XATTR_NEXT(entry))
+				if (entry->e_value_inum)
+					ext4_xattr_inode_free_quota(inode,
+					      le32_to_cpu(entry->e_value_size));
+
+		}
+
+		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+					 extra_credits);
+		/*
+		 * Update i_file_acl value in the same transaction that releases
+		 * block.
+		 */
+		EXT4_I(inode)->i_file_acl = 0;
+		error = ext4_mark_inode_dirty(handle, inode);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+					 error);
+			goto cleanup;
+		}
 	}
+	error = 0;
 cleanup:
 	brelse(iloc.bh);
 	brelse(bh);
@@ -2208,17 +2712,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 {
-	struct inode	*ea_inode;
-	int		idx = 0;
+	int idx;
 
 	if (ea_inode_array == NULL)
 		return;
 
-	for (; idx < ea_inode_array->count; ++idx) {
-		ea_inode = ea_inode_array->inodes[idx];
-		clear_nlink(ea_inode);
-		iput(ea_inode);
-	}
+	for (idx = 0; idx < ea_inode_array->count; ++idx)
+		iput(ea_inode_array->inodes[idx]);
 	kfree(ea_inode_array);
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b2005a2716d9..67616cb9a059 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -69,19 +69,6 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
-/*
- * Link EA inode back to parent one using i_mtime field.
- * Extra integer type conversion added to ignore higher
- * bits in i_mtime.tv_sec which might be set by ext4_get()
- */
-#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
-do {                                                  \
-      (inode)->i_mtime.tv_sec = inum;                 \
-} while(0)
-
-#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
-((__u32)(inode)->i_mtime.tv_sec)
-
 /*
  * The minimum size of EA value when you start storing it in an external inode
  * size of block - size of header - size of 1 entry - 4 null bytes
@@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+				  int *credits);
 
-extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 77a5b99d8f92..7dfdca822ccb 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -13,10 +13,11 @@
  * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
- * They use hash of a block contents as a key and block number as a value.
- * That's why keys need not be unique (different xattr blocks may end up having
- * the same hash). However block number always uniquely identifies a cache
- * entry.
+ * Ext4 also uses it for deduplication of xattr values stored in inodes.
+ * They use hash of data as a key and provide a value that may represent a
+ * block or inode number. That's why keys need not be unique (hash of different
+ * data may be the same). However user provided value always uniquely
+ * identifies a cache entry.
  *
  * We provide functions for creation and removal of entries, search by key,
  * and a special "delete entry with given key-value pair" operation. Fixed
-- 
2.13.1.508.gb3defc5cc-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH v4 27/28] ext4: xattr inode deduplication
  2017-06-14 14:34               ` [PATCH v4 " Tahsin Erdogan
@ 2017-06-14 23:26                 ` Andreas Dilger
  2017-06-20  9:07                   ` [PATCH v5 " Tahsin Erdogan
  0 siblings, 1 reply; 100+ messages in thread
From: Andreas Dilger @ 2017-06-14 23:26 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Darrick J . Wong, Jan Kara, Theodore Ts'o, linux-ext4

[-- Attachment #1: Type: text/plain, Size: 12726 bytes --]

[reduced CC list to linux-ext4]

On Jun 14, 2017, at 8:34 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a best
> effort sharing scheme. When a xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v4:
> - eliminated xattr entry in the xattr inode to avoid complexity and
>   recursion in xattr update path. Now the ref count and hash are stored
>   in i_[c/m/a]time.tv_sec fields.
> - some clean up in ext4_xattr_set_entry() to reduce code duplication and
>   complexity
> 
> v3:
> - use s_csum_seed for hash calculations when available
> - return error on stored vs calculated hash mismatch
> 
> v2:
> - make dependency on crc32c dynamic
> - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
>   they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
> 
> fs/ext4/acl.c   |    5 +-
> fs/ext4/ext4.h  |   22 +-
> fs/ext4/inode.c |    9 +-
> fs/ext4/super.c |   25 +-
> fs/ext4/xattr.c | 1000 +++++++++++++++++++++++++++++++++++++++++--------------
> fs/ext4/xattr.h |   17 +-
> fs/mbcache.c    |    9 +-
> 7 files changed, 806 insertions(+), 281 deletions(-)
> 
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..9fcd29e21dc7 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -4067,6 +4075,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> 		goto failed_mount_wq;
> 	}
> 
> +	if (ext4_has_feature_ea_inode(sb)) {
> +		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +		if (!sbi->s_ea_inode_cache) {
> +			ext4_msg(sb, KERN_ERR,
> +				 "Failed to create an s_ea_inode_cache");
> +			goto failed_mount_wq;
> +		}
> +	}

It would be preferable to allow a mount option like "no_mbcache" to disable
the use of shared xattrs.  In the Lustre case at least, there will never be
shared large xattrs, and we've had a bunch of performance issues with mbcache
due to lock contention among many server threads doing concurrent lookups and
inserting many thousands of unique entries into the cache.

> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index abc7d5f84e5f..2f9bcafd9aed 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -280,6 +283,34 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
> 	return cmp ? -ENODATA : 0;
> }
> 
> +static u32
> +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
> +{
> +	return ext4_chksum(sbi, sbi->s_csum_seed ?: ~0, buffer, size);
> +}

This should follow the existing convention of always using s_csum_seed to seed
the checksum, and change ext4_fill_super() to initialize s_csum_seed to ~0 if
ext4_has_metadata_csum() is false, or always use the same value regardless of
whether ext4_has_metadata_csum() is set or not.

> +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
> +{
> +	return ((u64)ea_inode->i_ctime.tv_sec << 32) |
> +	       ((u32)ea_inode->i_mtime.tv_sec);
> +}

If it is really necessary to have more than 2^32 references on a single shared
inode then it would be better to avoid the re-use of i_mtime, which breaks
the backref for unshared xattrs, and using i_size isn't enough of a guarantee
that this is the correct parent inode in case of on-disk corruption.

If you think that > 2^32 references to a single xattr is really needed, you
can use i_ctime_extra, since this will almost certainly only be used on ext4
filesystems with 256-byte or larger inodes.  It is highly unlikely that there
are filesystems with multi-billions of shared xattrs that are ext2-formatted.

This allows for a period of transition between existing single-user xattr inodes
(which use i_mtime for the parent back-ref) and the shared xattr inodes, instead
of requiring a full e2fsck when upgrading from an older version of xattr inodes
to a new kernel, and then doing the same if there is a need to downgrade again.

> +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
> +{
> +	ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
> +	ea_inode->i_mtime.tv_sec = (u32)ref_count;
> +}
> +
> +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
> +{
> +	return (u32)ea_inode->i_atime.tv_sec;
> +}
> +
> +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
> +{
> +	ea_inode->i_atime.tv_sec = hash;
> +}
> +
> /*
>  * Read the EA value from an inode.
>  */
> @@ -298,13 +331,24 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
> 		if (!bh)
> 			return -EFSCORRUPTED;
> 
> -		memcpy(buf, bh->b_data, csize);
> +		memcpy(copy_pos, bh->b_data, csize);
> 		brelse(bh);
> 
> -		buf += csize;
> +		copy_pos += csize;
> 		block += 1;
> 		copied += csize;
> 	}
> +
> +	calc_hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buf, size);
> +
> +	/* Verify stored hash matches calculated hash. */
> +	stored_hash = ext4_xattr_inode_get_hash(ea_inode);
> +	if (calc_hash != stored_hash) {
> +		ext4_warning_inode(ea_inode,
> +			"EA inode calc_hash=%#x does not match stored_hash=%#x",
> +			calc_hash, stored_hash);
> +		return -EFSCORRUPTED;
> +	}

Should this be contingent on the ext4_has_metadata_csum() feature being enabled, or
alternatively check if EXT4_XATTR_INODE_GET_PARENT() and i_generation match before
returning an error?  This will allow a smooth transition from existing filesystems
that do not store the hash, but have only a single-use xattr inode with a parent
backref.

> @@ -329,14 +373,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
> 		goto error;
> 	}
> 
> -	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -	    inode->i_generation != parent->i_generation) {
> -		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -			   "to parent is invalid.", ea_ino);
> -		err = -EINVAL;
> -		goto error;
> -	}

This check here should be moved up to ext4_xattr_inode_read().

> +static int __ext4_xattr_set_credits(struct super_block *sb,

> +				    struct buffer_head *block_bh,
> +				    size_t value_len)
> +{
> +	int credits;
> +	int blocks;
> +
> +	/*
> +	 * 1) Owner inode update
> +	 * 2) Ref count update on old xattr block
> +	 * 3) new xattr block
> +	 * 4) block bitmap update for new xattr block
> +	 * 5) group descriptor for new xattr block
> +	 */
> +	credits = 5;
> +
> +	/* We are done if ea_inode feature is not enabled. */
> +	if (!ext4_has_feature_ea_inode(sb))
> +		return credits;
> +
> +	/* New ea_inode, inode map, block bitmap, group descriptor. */
> +	credits += 4;
> +
> +	/* Data blocks. */
> +	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +	/* Indirection block. */
> +	blocks += 1;

Strictly speaking, this is only needed "if (blocks > EXT4_NDIR_BLOCKS)".

> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Blocks themselves. */
> +	credits += blocks;
> +
> +	/* Dereference ea_inode holding old xattr value.
> +	 * Old ea_inode, inode map, block bitmap, group descriptor.
> +	 */
> +	credits += 4;
> +
> +	/* Data blocks for old ea_inode. */
> +	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +	/* Indirection block for old ea_inode. */
> +	blocks += 1;
> +
> +	/* Block bitmap and group descriptor updates for each block. */
> +	credits += blocks * 2;
> +
> +	/* Quota updates. */
> +	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +	/* We may need to clone the existing xattr block in which case we need
> +	 * to increment ref counts for existing ea_inodes referenced by it.
> +	 */

Just to clarify here, in the case of cloning an existing xattr block, are the
refcounts being _incremented_ or _decremented_ on the existing ea_inodes?  I'm
trying to figure out if we really need to have credits for both old and new
xattr inodes, as well as these additional credits.  Since this is reserving
about 110 blocks for every setxattr, this can add significant pressure on the
journal if there are lots of threads creating files and/or setting xattrs.

> +	if (block_bh) {
> +		struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				/* Ref count update on ea_inode. */
> +				credits += 1;
> +	}
> +	return credits;
> +}
> 
> @@ -965,67 +1257,121 @@ static struct inode *ext4_xattr_inode_create(handle_t	+static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +			    size_t value_len, u32 hash)
> {
> +	struct inode *ea_inode;
> +	struct mb_cache_entry *ce;
> +	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +	void *ea_data = NULL;
> 	int err;

This function should just return NULL if ea_inode_cache is NULL (e.g. in
the case of "no_mbcache" mount option).

> +	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +	while (ce) {
> +		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +		if (IS_ERR(ea_inode)) {
> +			ea_inode = NULL;
> +			goto next;
> +		}
> 
> +		if (is_bad_inode(ea_inode) ||
> +		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> +		    i_size_read(ea_inode) != value_len)
> +			goto next;
> 
> +		if (!ea_data)
> +			ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +
> +		if (!ea_data) {
> +			iput(ea_inode);
> +			return NULL;
> +		}
> +
> +		err = ext4_xattr_inode_read(ea_inode, ea_data, value_len);
> +		if (unlikely(err))
> +			goto next;
> +
> +		if (!memcmp(value, ea_data, value_len)) {
> +			mb_cache_entry_touch(ea_inode_cache, ce);
> +			mb_cache_entry_put(ea_inode_cache, ce);
> +			kvfree(ea_data);
> +			return ea_inode;
> +		}
> +next:
> +		iput(ea_inode);
> +		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +	}
> +	kvfree(ea_data);
> +	return NULL;
> }
> 
> /*
>  * Add value of the EA in an inode.
>  */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -				unsigned long *ea_ino, const void *value,
> -				size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +					  const void *value, size_t value_len,
> +					  struct inode **ret_inode)
> {
> 	struct inode *ea_inode;
> +	u32 hash;
> 	int err;
> 
> +	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
> +	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +	if (ea_inode) {
> +		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +		if (err) {
> +			iput(ea_inode);
> +			return err;
> +		}
> +
> +		*ret_inode = ea_inode;
> +		return 0;
> +	}
> +
> 	/* Create an inode for the EA value */
> -	ea_inode = ext4_xattr_inode_create(handle, inode);
> +	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
> 	if (IS_ERR(ea_inode))
> 		return PTR_ERR(ea_inode);
> 
> 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -	if (err)
> -		clear_nlink(ea_inode);
> -	else
> -		*ea_ino = ea_inode->i_ino;
> +	if (err) {
> +		ext4_xattr_inode_dec_ref(handle, ea_inode);
> +		iput(ea_inode);
> +		return err;
> +	}
> 
> -	iput(ea_inode);
> +	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +			      ea_inode->i_ino, true /* reusable */);

Should skip mb_cache if EA_INODE_CACHE(inode) is NULL, or have a wrapper
like ext4_xattr_inode_cache_insert() to match ext4_xattr_inode_cache_find()
that does the same.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 26/28] ext4: cleanup transaction restarts during inode deletion
  2017-06-14 14:17   ` [PATCH v2 " Tahsin Erdogan
@ 2017-06-15  0:11       ` Andreas Dilger
  0 siblings, 0 replies; 100+ messages in thread
From: Andreas Dilger @ 2017-06-15  0:11 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Dave Kleikamp, Alexander Viro,
	Mark Fasheh, Joel Becker, Jens Axboe, Deepa Dinamani,
	Mike Christie, Fabian Frederick, linux-ext4, lkml,
	jfs-discussion, linux-fsdevel, ocfs2-devel, reiserfs-devel

[-- Attachment #1: Type: text/plain, Size: 15153 bytes --]

On Jun 14, 2017, at 8:17 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> During inode deletion, journal credits that will be needed are hard to
> determine, that is why we have journal extend/restart calls in several
> places. Whenever a transaction is restarted, filesystem must be in a
> consistent state because there is no atomicity guarantee beyond a
> restart call.
> 
> Add ext4_xattr_ensure_credits() helper function which takes care of
> journal extend/restart logic. It also handles getting jbd2 write access
> and dirty metadata calls. This function is called at every iteration of
> handling an ea_inode reference.

Another option that might be less complex is to just add the xattr inodes
to the orphan list in the main transaction (which should be a fixed number
of credits), and then truncate/unlink the xattr inodes after the main
transaction has completed rather than making the transactions arbitrarily
large.  At one point we even had a separate unlink thread to handle this
in the background to reduce the unlink latency for very large files, which
also avoids issues with nested transactions.

Cheers, Andreas

> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2: made ext4_xattr_ensure_credits() static
> 
> fs/ext4/inode.c |  66 ++++-----------
> fs/ext4/xattr.c | 257 ++++++++++++++++++++++++++++++++++++--------------------
> fs/ext4/xattr.h |   3 +-
> 3 files changed, 183 insertions(+), 143 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cf91532765a4..4d6936f0d8a4 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode)
> 	 */
> 	sb_start_intwrite(inode->i_sb);
> 
> -	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
> +	if (!IS_NOQUOTA(inode))
> +		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
> +				 ext4_blocks_for_truncate(inode)+extra_credits);
> 	if (IS_ERR(handle)) {
> 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
> 		/*
> @@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode)
> 		sb_end_intwrite(inode->i_sb);
> 		goto no_delete;
> 	}
> +
> 	if (IS_SYNC(inode))
> 		ext4_handle_sync(handle);
> -
> -	/*
> -	 * Delete xattr inode before deleting the main inode.
> -	 */
> -	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
> -	if (err) {
> -		ext4_warning(inode->i_sb,
> -			     "couldn't delete inode's xattr (err %d)", err);
> -		goto stop_handle;
> -	}
> -
> -	if (!IS_NOQUOTA(inode))
> -		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> -
> -	if (!ext4_handle_has_enough_credits(handle,
> -			ext4_blocks_for_truncate(inode) + extra_credits)) {
> -		err = ext4_journal_extend(handle,
> -			ext4_blocks_for_truncate(inode) + extra_credits);
> -		if (err > 0)
> -			err = ext4_journal_restart(handle,
> -			ext4_blocks_for_truncate(inode) + extra_credits);
> -		if (err != 0) {
> -			ext4_warning(inode->i_sb,
> -				     "couldn't extend journal (err %d)", err);
> -			goto stop_handle;
> -		}
> -	}
> -
> 	inode->i_size = 0;
> 	err = ext4_mark_inode_dirty(handle, inode);
> 	if (err) {
> @@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode)
> 		}
> 	}
> 
> -	/*
> -	 * ext4_ext_truncate() doesn't reserve any slop when it
> -	 * restarts journal transactions; therefore there may not be
> -	 * enough credits left in the handle to remove the inode from
> -	 * the orphan list and set the dtime field.
> -	 */
> -	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
> -		err = ext4_journal_extend(handle, extra_credits);
> -		if (err > 0)
> -			err = ext4_journal_restart(handle, extra_credits);
> -		if (err != 0) {
> -			ext4_warning(inode->i_sb,
> -				     "couldn't extend journal (err %d)", err);
> -		stop_handle:
> -			ext4_journal_stop(handle);
> -			ext4_orphan_del(NULL, inode);
> -			sb_end_intwrite(inode->i_sb);
> -			goto no_delete;
> -		}
> +	/* Remove xattr references. */
> +	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
> +				      extra_credits);
> +	if (err) {
> +		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
> +	stop_handle:
> +		ext4_journal_stop(handle);
> +		ext4_orphan_del(NULL, inode);
> +		sb_end_intwrite(inode->i_sb);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +		goto no_delete;
> 	}
> 
> 	/*
> @@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode)
> 		ext4_clear_inode(inode);
> 	else
> 		ext4_free_inode(handle, inode);
> -
> 	ext4_journal_stop(handle);
> 	sb_end_intwrite(inode->i_sb);
> 	ext4_xattr_inode_array_free(ea_inode_array);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 3ee7e2f68476..abc7d5f84e5f 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
> #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
> 				inode->i_sb->s_fs_info)->s_mb_cache)
> 
> +static int
> +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
> +			struct inode *inode);
> +
> #ifdef CONFIG_LOCKDEP
> void ext4_xattr_inode_set_class(struct inode *ea_inode)
> {
> @@ -653,6 +657,127 @@ static void ext4_xattr_update_super_block(handle_t *handle,
> 	}
> }
> 
> +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
> +				     int credits, struct buffer_head *bh,
> +				     bool dirty, bool block_csum)
> +{
> +	int error;
> +
> +	if (!ext4_handle_valid(handle))
> +		return 0;
> +
> +	if (handle->h_buffer_credits >= credits)
> +		return 0;
> +
> +	error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
> +	if (!error)
> +		return 0;
> +	if (error < 0) {
> +		ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
> +		return error;
> +	}
> +
> +	if (bh && dirty) {
> +		if (block_csum)
> +			ext4_xattr_block_csum_set(inode, bh);
> +		error = ext4_handle_dirty_metadata(handle, NULL, bh);
> +		if (error) {
> +			ext4_warning(inode->i_sb, "Handle metadata (error %d)",
> +				     error);
> +			return error;
> +		}
> +	}
> +
> +	error = ext4_journal_restart(handle, credits);
> +	if (error) {
> +		ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
> +		return error;
> +	}
> +
> +	if (bh) {
> +		error = ext4_journal_get_write_access(handle, bh);
> +		if (error) {
> +			ext4_warning(inode->i_sb,
> +				     "Get write access failed (error %d)",
> +				     error);
> +			return error;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void
> +ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> +			    struct buffer_head *bh,
> +			    struct ext4_xattr_entry *first, bool block_csum,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	bool dirty = false;
> +	unsigned int ea_ino;
> +	int err;
> +	int credits;
> +
> +	/* One credit for dec ref on ea_inode, one for orphan list addition, */
> +	credits = 2 + extra_credits;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			continue;
> +
> +		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode,
> +					   "Expand inode array err=%d", err);
> +			iput(ea_inode);
> +			continue;
> +		}
> +
> +		err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
> +						dirty, block_csum);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		inode_lock(ea_inode);
> +		clear_nlink(ea_inode);
> +		ext4_orphan_add(handle, ea_inode);
> +		inode_unlock(ea_inode);
> +
> +		/*
> +		 * Forget about ea_inode within the same transaction that decrements the ref
> +		 * count. This avoids duplicate decrements in case the rest of the work
> +		 * spills over to subsequent transactions.
> +		 */
> +		entry->e_value_inum = 0;
> +		entry->e_value_size = 0;
> +
> +		dirty = true;
> +	}
> +
> +	if (dirty) {
> +		/*
> +		 * Note that we are deliberately skipping csum calculation for
> +		 * the final update because we do not expect any journal
> +		 * restarts until xattr block is freed.
> +		 */
> +
> +		err = ext4_handle_dirty_metadata(handle, NULL, bh);
> +		if (err)
> +			ext4_warning_inode(parent,
> +					   "handle dirty metadata err=%d", err);
> +	}
> +}
> +
> /*
>  * Release the xattr block BH: If the reference count is > 1, decrement it;
>  * otherwise free the block.
> @@ -1985,42 +2110,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
> 	return 0;
> }
> 
> -/**
> - * Add xattr inode to orphan list
> - */
> -static int
> -ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
> -			    struct ext4_xattr_inode_array *ea_inode_array)
> -{
> -	int idx = 0, error = 0;
> -	struct inode *ea_inode;
> -
> -	if (ea_inode_array == NULL)
> -		return 0;
> -
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		if (!ext4_handle_has_enough_credits(handle, credits)) {
> -			error = ext4_journal_extend(handle, credits);
> -			if (error > 0)
> -				error = ext4_journal_restart(handle, credits);
> -
> -			if (error != 0) {
> -				ext4_warning(inode->i_sb,
> -					"couldn't extend journal "
> -					"(err %d)", error);
> -				return error;
> -			}
> -		}
> -		ea_inode = ea_inode_array->inodes[idx];
> -		inode_lock(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> -		/* the inode's i_count will be released by caller */
> -	}
> -
> -	return 0;
> -}
> -
> /*
>  * ext4_xattr_delete_inode()
>  *
> @@ -2033,16 +2122,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
>  */
> int
> ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array)
> +			struct ext4_xattr_inode_array **ea_inode_array,
> +			int extra_credits)
> {
> 	struct buffer_head *bh = NULL;
> 	struct ext4_xattr_ibody_header *header;
> 	struct ext4_inode *raw_inode;
> -	struct ext4_iloc iloc;
> -	struct ext4_xattr_entry *entry;
> -	struct inode *ea_inode;
> -	unsigned int ea_ino;
> -	int credits = 3, error = 0;
> +	struct ext4_iloc iloc = { .bh = NULL };
> +	int error;
> +
> +	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> +					  NULL /* bh */,
> +					  false /* dirty */,
> +					  false /* block_csum */);
> +	if (error) {
> +		EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
> +		goto cleanup;
> +	}
> 
> 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> 		goto delete_external_ea;
> @@ -2050,31 +2146,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> 	error = ext4_get_inode_loc(inode, &iloc);
> 	if (error)
> 		goto cleanup;
> +
> +	error = ext4_journal_get_write_access(handle, iloc.bh);
> +	if (error)
> +		goto cleanup;
> +
> 	raw_inode = ext4_raw_inode(&iloc);
> 	header = IHDR(inode, raw_inode);
> -	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> -	     entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_inum)
> -			continue;
> -		ea_ino = le32_to_cpu(entry->e_value_inum);
> -		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -		if (error)
> -			continue;
> -		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
> -		if (error) {
> -			iput(ea_inode);
> -			brelse(iloc.bh);
> -			goto cleanup;
> -		}
> -		entry->e_value_inum = 0;
> -	}
> -	brelse(iloc.bh);
> +	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> +				    false /* block_csum */, ea_inode_array,
> +				    extra_credits);
> 
> delete_external_ea:
> 	if (!EXT4_I(inode)->i_file_acl) {
> -		/* add xattr inode to orphan list */
> -		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> -						    *ea_inode_array);
> +		error = 0;
> 		goto cleanup;
> 	}
> 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> @@ -2092,46 +2177,32 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> 		goto cleanup;
> 	}
> 
> -	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> -	     entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_inum)
> -			continue;
> -		ea_ino = le32_to_cpu(entry->e_value_inum);
> -		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -		if (error)
> -			continue;
> -		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
> -		if (error)
> -			goto cleanup;
> -		entry->e_value_inum = 0;
> -	}
> -
> -	/* add xattr inode to orphan list */
> -	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> -					*ea_inode_array);
> -	if (error)
> -		goto cleanup;
> -
> -	if (!IS_NOQUOTA(inode))
> -		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> -
> -	if (!ext4_handle_has_enough_credits(handle, credits)) {
> -		error = ext4_journal_extend(handle, credits);
> -		if (error > 0)
> -			error = ext4_journal_restart(handle, credits);
> +	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +		error = ext4_journal_get_write_access(handle, bh);
> 		if (error) {
> -			ext4_warning(inode->i_sb,
> -				"couldn't extend journal (err %d)", error);
> +			EXT4_ERROR_INODE(inode, "write access %llu",
> +					 EXT4_I(inode)->i_file_acl);
> 			goto cleanup;
> 		}
> +		ext4_xattr_inode_remove_all(handle, inode, bh,
> +					    BFIRST(bh),
> +					    true /* block_csum */,
> +					    ea_inode_array,
> +					    extra_credits);
> 	}
> 
> 	ext4_xattr_release_block(handle, inode, bh);
> +	/* Update i_file_acl within the same transaction that releases block. */
> 	EXT4_I(inode)->i_file_acl = 0;
> -
> +	error = ext4_mark_inode_dirty(handle, inode);
> +	if (error) {
> +		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +				 error);
> +		goto cleanup;
> +	}
> cleanup:
> +	brelse(iloc.bh);
> 	brelse(bh);
> -
> 	return error;
> }
> 
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index adf761518a73..b2005a2716d9 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> 
> extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
> extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -				   struct ext4_xattr_inode_array **array);
> +				   struct ext4_xattr_inode_array **array,
> +				   int extra_credits);
> extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
> 
> extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
> --
> 2.13.1.508.gb3defc5cc-goog
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH v2 26/28] ext4: cleanup transaction restarts during inode deletion
@ 2017-06-15  0:11       ` Andreas Dilger
  0 siblings, 0 replies; 100+ messages in thread
From: Andreas Dilger @ 2017-06-15  0:11 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Dave Kleikamp, Alexander Viro,
	Mark Fasheh, Joel Becker, Jens Axboe, Deepa Dinamani,
	Mike Christie, Fabian Frederick, linux-ext4, lkml,
	jfs-discussion, linux-fsdevel, ocfs2-devel, reiserfs-devel

On Jun 14, 2017, at 8:17 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> During inode deletion, journal credits that will be needed are hard to
> determine, that is why we have journal extend/restart calls in several
> places. Whenever a transaction is restarted, filesystem must be in a
> consistent state because there is no atomicity guarantee beyond a
> restart call.
> 
> Add ext4_xattr_ensure_credits() helper function which takes care of
> journal extend/restart logic. It also handles getting jbd2 write access
> and dirty metadata calls. This function is called at every iteration of
> handling an ea_inode reference.

Another option that might be less complex is to just add the xattr inodes
to the orphan list in the main transaction (which should be a fixed number
of credits), and then truncate/unlink the xattr inodes after the main
transaction has completed rather than making the transactions arbitrarily
large.  At one point we even had a separate unlink thread to handle this
in the background to reduce the unlink latency for very large files, which
also avoids issues with nested transactions.

Cheers, Andreas

> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2: made ext4_xattr_ensure_credits() static
> 
> fs/ext4/inode.c |  66 ++++-----------
> fs/ext4/xattr.c | 257 ++++++++++++++++++++++++++++++++++++--------------------
> fs/ext4/xattr.h |   3 +-
> 3 files changed, 183 insertions(+), 143 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cf91532765a4..4d6936f0d8a4 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode)
> 	 */
> 	sb_start_intwrite(inode->i_sb);
> 
> -	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
> +	if (!IS_NOQUOTA(inode))
> +		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
> +				 ext4_blocks_for_truncate(inode)+extra_credits);
> 	if (IS_ERR(handle)) {
> 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
> 		/*
> @@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode)
> 		sb_end_intwrite(inode->i_sb);
> 		goto no_delete;
> 	}
> +
> 	if (IS_SYNC(inode))
> 		ext4_handle_sync(handle);
> -
> -	/*
> -	 * Delete xattr inode before deleting the main inode.
> -	 */
> -	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
> -	if (err) {
> -		ext4_warning(inode->i_sb,
> -			     "couldn't delete inode's xattr (err %d)", err);
> -		goto stop_handle;
> -	}
> -
> -	if (!IS_NOQUOTA(inode))
> -		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> -
> -	if (!ext4_handle_has_enough_credits(handle,
> -			ext4_blocks_for_truncate(inode) + extra_credits)) {
> -		err = ext4_journal_extend(handle,
> -			ext4_blocks_for_truncate(inode) + extra_credits);
> -		if (err > 0)
> -			err = ext4_journal_restart(handle,
> -			ext4_blocks_for_truncate(inode) + extra_credits);
> -		if (err != 0) {
> -			ext4_warning(inode->i_sb,
> -				     "couldn't extend journal (err %d)", err);
> -			goto stop_handle;
> -		}
> -	}
> -
> 	inode->i_size = 0;
> 	err = ext4_mark_inode_dirty(handle, inode);
> 	if (err) {
> @@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode)
> 		}
> 	}
> 
> -	/*
> -	 * ext4_ext_truncate() doesn't reserve any slop when it
> -	 * restarts journal transactions; therefore there may not be
> -	 * enough credits left in the handle to remove the inode from
> -	 * the orphan list and set the dtime field.
> -	 */
> -	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
> -		err = ext4_journal_extend(handle, extra_credits);
> -		if (err > 0)
> -			err = ext4_journal_restart(handle, extra_credits);
> -		if (err != 0) {
> -			ext4_warning(inode->i_sb,
> -				     "couldn't extend journal (err %d)", err);
> -		stop_handle:
> -			ext4_journal_stop(handle);
> -			ext4_orphan_del(NULL, inode);
> -			sb_end_intwrite(inode->i_sb);
> -			goto no_delete;
> -		}
> +	/* Remove xattr references. */
> +	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
> +				      extra_credits);
> +	if (err) {
> +		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
> +	stop_handle:
> +		ext4_journal_stop(handle);
> +		ext4_orphan_del(NULL, inode);
> +		sb_end_intwrite(inode->i_sb);
> +		ext4_xattr_inode_array_free(ea_inode_array);
> +		goto no_delete;
> 	}
> 
> 	/*
> @@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode)
> 		ext4_clear_inode(inode);
> 	else
> 		ext4_free_inode(handle, inode);
> -
> 	ext4_journal_stop(handle);
> 	sb_end_intwrite(inode->i_sb);
> 	ext4_xattr_inode_array_free(ea_inode_array);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 3ee7e2f68476..abc7d5f84e5f 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
> #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
> 				inode->i_sb->s_fs_info)->s_mb_cache)
> 
> +static int
> +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
> +			struct inode *inode);
> +
> #ifdef CONFIG_LOCKDEP
> void ext4_xattr_inode_set_class(struct inode *ea_inode)
> {
> @@ -653,6 +657,127 @@ static void ext4_xattr_update_super_block(handle_t *handle,
> 	}
> }
> 
> +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
> +				     int credits, struct buffer_head *bh,
> +				     bool dirty, bool block_csum)
> +{
> +	int error;
> +
> +	if (!ext4_handle_valid(handle))
> +		return 0;
> +
> +	if (handle->h_buffer_credits >= credits)
> +		return 0;
> +
> +	error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
> +	if (!error)
> +		return 0;
> +	if (error < 0) {
> +		ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
> +		return error;
> +	}
> +
> +	if (bh && dirty) {
> +		if (block_csum)
> +			ext4_xattr_block_csum_set(inode, bh);
> +		error = ext4_handle_dirty_metadata(handle, NULL, bh);
> +		if (error) {
> +			ext4_warning(inode->i_sb, "Handle metadata (error %d)",
> +				     error);
> +			return error;
> +		}
> +	}
> +
> +	error = ext4_journal_restart(handle, credits);
> +	if (error) {
> +		ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
> +		return error;
> +	}
> +
> +	if (bh) {
> +		error = ext4_journal_get_write_access(handle, bh);
> +		if (error) {
> +			ext4_warning(inode->i_sb,
> +				     "Get write access failed (error %d)",
> +				     error);
> +			return error;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void
> +ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> +			    struct buffer_head *bh,
> +			    struct ext4_xattr_entry *first, bool block_csum,
> +			    struct ext4_xattr_inode_array **ea_inode_array,
> +			    int extra_credits)
> +{
> +	struct inode *ea_inode;
> +	struct ext4_xattr_entry *entry;
> +	bool dirty = false;
> +	unsigned int ea_ino;
> +	int err;
> +	int credits;
> +
> +	/* One credit for dec ref on ea_inode, one for orphan list addition, */
> +	credits = 2 + extra_credits;
> +
> +	for (entry = first; !IS_LAST_ENTRY(entry);
> +	     entry = EXT4_XATTR_NEXT(entry)) {
> +		if (!entry->e_value_inum)
> +			continue;
> +		ea_ino = le32_to_cpu(entry->e_value_inum);
> +		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +		if (err)
> +			continue;
> +
> +		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
> +		if (err) {
> +			ext4_warning_inode(ea_inode,
> +					   "Expand inode array err=%d", err);
> +			iput(ea_inode);
> +			continue;
> +		}
> +
> +		err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
> +						dirty, block_csum);
> +		if (err) {
> +			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
> +					   err);
> +			continue;
> +		}
> +
> +		inode_lock(ea_inode);
> +		clear_nlink(ea_inode);
> +		ext4_orphan_add(handle, ea_inode);
> +		inode_unlock(ea_inode);
> +
> +		/*
> +		 * Forget about ea_inode within the same transaction that decrements the ref
> +		 * count. This avoids duplicate decrements in case the rest of the work
> +		 * spills over to subsequent transactions.
> +		 */
> +		entry->e_value_inum = 0;
> +		entry->e_value_size = 0;
> +
> +		dirty = true;
> +	}
> +
> +	if (dirty) {
> +		/*
> +		 * Note that we are deliberately skipping csum calculation for
> +		 * the final update because we do not expect any journal
> +		 * restarts until xattr block is freed.
> +		 */
> +
> +		err = ext4_handle_dirty_metadata(handle, NULL, bh);
> +		if (err)
> +			ext4_warning_inode(parent,
> +					   "handle dirty metadata err=%d", err);
> +	}
> +}
> +
> /*
>  * Release the xattr block BH: If the reference count is > 1, decrement it;
>  * otherwise free the block.
> @@ -1985,42 +2110,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
> 	return 0;
> }
> 
> -/**
> - * Add xattr inode to orphan list
> - */
> -static int
> -ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
> -			    struct ext4_xattr_inode_array *ea_inode_array)
> -{
> -	int idx = 0, error = 0;
> -	struct inode *ea_inode;
> -
> -	if (ea_inode_array == NULL)
> -		return 0;
> -
> -	for (; idx < ea_inode_array->count; ++idx) {
> -		if (!ext4_handle_has_enough_credits(handle, credits)) {
> -			error = ext4_journal_extend(handle, credits);
> -			if (error > 0)
> -				error = ext4_journal_restart(handle, credits);
> -
> -			if (error != 0) {
> -				ext4_warning(inode->i_sb,
> -					"couldn't extend journal "
> -					"(err %d)", error);
> -				return error;
> -			}
> -		}
> -		ea_inode = ea_inode_array->inodes[idx];
> -		inode_lock(ea_inode);
> -		ext4_orphan_add(handle, ea_inode);
> -		inode_unlock(ea_inode);
> -		/* the inode's i_count will be released by caller */
> -	}
> -
> -	return 0;
> -}
> -
> /*
>  * ext4_xattr_delete_inode()
>  *
> @@ -2033,16 +2122,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
>  */
> int
> ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -			struct ext4_xattr_inode_array **ea_inode_array)
> +			struct ext4_xattr_inode_array **ea_inode_array,
> +			int extra_credits)
> {
> 	struct buffer_head *bh = NULL;
> 	struct ext4_xattr_ibody_header *header;
> 	struct ext4_inode *raw_inode;
> -	struct ext4_iloc iloc;
> -	struct ext4_xattr_entry *entry;
> -	struct inode *ea_inode;
> -	unsigned int ea_ino;
> -	int credits = 3, error = 0;
> +	struct ext4_iloc iloc = { .bh = NULL };
> +	int error;
> +
> +	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> +					  NULL /* bh */,
> +					  false /* dirty */,
> +					  false /* block_csum */);
> +	if (error) {
> +		EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
> +		goto cleanup;
> +	}
> 
> 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> 		goto delete_external_ea;
> @@ -2050,31 +2146,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> 	error = ext4_get_inode_loc(inode, &iloc);
> 	if (error)
> 		goto cleanup;
> +
> +	error = ext4_journal_get_write_access(handle, iloc.bh);
> +	if (error)
> +		goto cleanup;
> +
> 	raw_inode = ext4_raw_inode(&iloc);
> 	header = IHDR(inode, raw_inode);
> -	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> -	     entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_inum)
> -			continue;
> -		ea_ino = le32_to_cpu(entry->e_value_inum);
> -		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -		if (error)
> -			continue;
> -		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
> -		if (error) {
> -			iput(ea_inode);
> -			brelse(iloc.bh);
> -			goto cleanup;
> -		}
> -		entry->e_value_inum = 0;
> -	}
> -	brelse(iloc.bh);
> +	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> +				    false /* block_csum */, ea_inode_array,
> +				    extra_credits);
> 
> delete_external_ea:
> 	if (!EXT4_I(inode)->i_file_acl) {
> -		/* add xattr inode to orphan list */
> -		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> -						    *ea_inode_array);
> +		error = 0;
> 		goto cleanup;
> 	}
> 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> @@ -2092,46 +2177,32 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> 		goto cleanup;
> 	}
> 
> -	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> -	     entry = EXT4_XATTR_NEXT(entry)) {
> -		if (!entry->e_value_inum)
> -			continue;
> -		ea_ino = le32_to_cpu(entry->e_value_inum);
> -		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -		if (error)
> -			continue;
> -		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
> -		if (error)
> -			goto cleanup;
> -		entry->e_value_inum = 0;
> -	}
> -
> -	/* add xattr inode to orphan list */
> -	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> -					*ea_inode_array);
> -	if (error)
> -		goto cleanup;
> -
> -	if (!IS_NOQUOTA(inode))
> -		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> -
> -	if (!ext4_handle_has_enough_credits(handle, credits)) {
> -		error = ext4_journal_extend(handle, credits);
> -		if (error > 0)
> -			error = ext4_journal_restart(handle, credits);
> +	if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +		error = ext4_journal_get_write_access(handle, bh);
> 		if (error) {
> -			ext4_warning(inode->i_sb,
> -				"couldn't extend journal (err %d)", error);
> +			EXT4_ERROR_INODE(inode, "write access %llu",
> +					 EXT4_I(inode)->i_file_acl);
> 			goto cleanup;
> 		}
> +		ext4_xattr_inode_remove_all(handle, inode, bh,
> +					    BFIRST(bh),
> +					    true /* block_csum */,
> +					    ea_inode_array,
> +					    extra_credits);
> 	}
> 
> 	ext4_xattr_release_block(handle, inode, bh);
> +	/* Update i_file_acl within the same transaction that releases block. */
> 	EXT4_I(inode)->i_file_acl = 0;
> -
> +	error = ext4_mark_inode_dirty(handle, inode);
> +	if (error) {
> +		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +				 error);
> +		goto cleanup;
> +	}
> cleanup:
> +	brelse(iloc.bh);
> 	brelse(bh);
> -
> 	return error;
> }
> 
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index adf761518a73..b2005a2716d9 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> 
> extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
> extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -				   struct ext4_xattr_inode_array **array);
> +				   struct ext4_xattr_inode_array **array,
> +				   int extra_credits);
> extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
> 
> extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
> --
> 2.13.1.508.gb3defc5cc-goog
> 


Cheers, Andreas





-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 195 bytes
Desc: Message signed with OpenPGP
Url : http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20170614/9174eb88/attachment.bin 

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 23/28] mbcache: make mbcache more generic
  2017-05-31  8:15 ` [PATCH 23/28] mbcache: make mbcache more generic Tahsin Erdogan
@ 2017-06-15  7:41     ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-15  7:41 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed 31-05-17 01:15:12, Tahsin Erdogan wrote:
> Large xattr feature would like to use the mbcache for xattr value
> deduplication. Current implementation is geared towards xattr block
> deduplication. Make it more generic so that it can be used by both.

Can you explain a bit more what you mean by "make it more generic"? It
seems you just rename a couple of things here...

								Honza

> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext2/xattr.c         | 18 +++++++++---------
>  fs/ext4/xattr.c         | 10 +++++-----
>  fs/mbcache.c            | 43 +++++++++++++++++++++----------------------
>  include/linux/mbcache.h | 14 ++++++++------
>  4 files changed, 43 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
> index fbdb8f171893..1e5f76070580 100644
> --- a/fs/ext2/xattr.c
> +++ b/fs/ext2/xattr.c
> @@ -493,8 +493,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
>  			 * This must happen under buffer lock for
>  			 * ext2_xattr_set2() to reliably detect modified block
>  			 */
> -			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
> -						    hash, bh->b_blocknr);
> +			mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash,
> +					      bh->b_blocknr);
>  
>  			/* keep the buffer locked while modifying it. */
>  		} else {
> @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
>  			 * This must happen under buffer lock for
>  			 * ext2_xattr_set2() to reliably detect freed block
>  			 */
> -			mb_cache_entry_delete_block(ext2_mb_cache,
> -						    hash, old_bh->b_blocknr);
> +			mb_cache_entry_delete(ext2_mb_cache, hash,
> +					      old_bh->b_blocknr);
>  			/* Free the old block. */
>  			ea_bdebug(old_bh, "freeing");
>  			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
> @@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode)
>  		 * This must happen under buffer lock for ext2_xattr_set2() to
>  		 * reliably detect freed block
>  		 */
> -		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
> -					    hash, bh->b_blocknr);
> +		mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash,
> +				      bh->b_blocknr);
>  		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
>  		get_bh(bh);
>  		bforget(bh);
> @@ -907,11 +907,11 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
>  	while (ce) {
>  		struct buffer_head *bh;
>  
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
>  		if (!bh) {
>  			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
>  				"inode %ld: block %ld read error",
> -				inode->i_ino, (unsigned long) ce->e_block);
> +				inode->i_ino, (unsigned long) ce->e_value);
>  		} else {
>  			lock_buffer(bh);
>  			/*
> @@ -931,7 +931,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
>  			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
>  				   EXT2_XATTR_REFCOUNT_MAX) {
>  				ea_idebug(inode, "block %ld refcount %d>%d",
> -					  (unsigned long) ce->e_block,
> +					  (unsigned long) ce->e_value,
>  					  le32_to_cpu(HDR(bh)->h_refcount),
>  					  EXT2_XATTR_REFCOUNT_MAX);
>  			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 886d06e409b6..772948f168c3 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		 * This must happen under buffer lock for
>  		 * ext4_xattr_block_set() to reliably detect freed block
>  		 */
> -		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
> +		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
> @@ -1115,8 +1115,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			 * ext4_xattr_block_set() to reliably detect modified
>  			 * block
>  			 */
> -			mb_cache_entry_delete_block(ext4_mb_cache, hash,
> -						    bs->bh->b_blocknr);
> +			mb_cache_entry_delete(ext4_mb_cache, hash,
> +					      bs->bh->b_blocknr);
>  			ea_bdebug(bs->bh, "modifying in-place");
>  			error = ext4_xattr_set_entry(i, s, handle, inode);
>  			if (!error) {
> @@ -2238,10 +2238,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
>  	while (ce) {
>  		struct buffer_head *bh;
>  
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
>  		if (!bh) {
>  			EXT4_ERROR_INODE(inode, "block %lu read error",
> -					 (unsigned long) ce->e_block);
> +					 (unsigned long) ce->e_value);
>  		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
>  			*pce = ce;
>  			return bh;
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index b19be429d655..77a5b99d8f92 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -10,7 +10,7 @@
>  /*
>   * Mbcache is a simple key-value store. Keys need not be unique, however
>   * key-value pairs are expected to be unique (we use this fact in
> - * mb_cache_entry_delete_block()).
> + * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
>   * They use hash of a block contents as a key and block number as a value.
> @@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
>   * @cache - cache where the entry should be created
>   * @mask - gfp mask with which the entry should be allocated
>   * @key - key of the entry
> - * @block - block that contains data
> - * @reusable - is the block reusable by other inodes?
> + * @value - value of the entry
> + * @reusable - is the entry reusable by others?
>   *
> - * Creates entry in @cache with key @key and records that data is stored in
> - * block @block. The function returns -EBUSY if entry with the same key
> - * and for the same block already exists in cache. Otherwise 0 is returned.
> + * Creates entry in @cache with key @key and value @value. The function returns
> + * -EBUSY if entry with the same key and value already exists in cache.
> + * Otherwise 0 is returned.
>   */
>  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable)
> +			  cache_value_t value, bool reusable)
>  {
>  	struct mb_cache_entry *entry, *dup;
>  	struct hlist_bl_node *dup_node;
> @@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
>  	/* One ref for hash, one ref returned */
>  	atomic_set(&entry->e_refcnt, 1);
>  	entry->e_key = key;
> -	entry->e_block = block;
> +	entry->e_value = value;
>  	entry->e_reusable = reusable;
>  	head = mb_cache_entry_head(cache, key);
>  	hlist_bl_lock(head);
>  	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
> -		if (dup->e_key == key && dup->e_block == block) {
> +		if (dup->e_key == key && dup->e_value == value) {
>  			hlist_bl_unlock(head);
>  			kmem_cache_free(mb_entry_cache, entry);
>  			return -EBUSY;
> @@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
>  EXPORT_SYMBOL(mb_cache_entry_find_next);
>  
>  /*
> - * mb_cache_entry_get - get a cache entry by block number (and key)
> + * mb_cache_entry_get - get a cache entry by value (and key)
>   * @cache - cache we work with
> - * @key - key of block number @block
> - * @block - block number
> + * @key - key
> + * @value - value
>   */
>  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block)
> +					  cache_value_t value)
>  {
>  	struct hlist_bl_node *node;
>  	struct hlist_bl_head *head;
> @@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  	head = mb_cache_entry_head(cache, key);
>  	hlist_bl_lock(head);
>  	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
>  			atomic_inc(&entry->e_refcnt);
>  			goto out;
>  		}
> @@ -214,15 +214,14 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  }
>  EXPORT_SYMBOL(mb_cache_entry_get);
>  
> -/* mb_cache_entry_delete_block - remove information about block from cache
> +/* mb_cache_entry_delete - remove a cache entry
>   * @cache - cache we work with
> - * @key - key of block @block
> - * @block - block number
> + * @key - key
> + * @value - value
>   *
> - * Remove entry from cache @cache with key @key with data stored in @block.
> + * Remove entry from cache @cache with key @key and value @value.
>   */
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				 sector_t block)
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, cache_value_t value)
>  {
>  	struct hlist_bl_node *node;
>  	struct hlist_bl_head *head;
> @@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
>  	head = mb_cache_entry_head(cache, key);
>  	hlist_bl_lock(head);
>  	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
>  			/* We keep hash list reference to keep entry alive */
>  			hlist_bl_del_init(&entry->e_hash_list);
>  			hlist_bl_unlock(head);
> @@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
>  	}
>  	hlist_bl_unlock(head);
>  }
> -EXPORT_SYMBOL(mb_cache_entry_delete_block);
> +EXPORT_SYMBOL(mb_cache_entry_delete);
>  
>  /* mb_cache_entry_touch - cache entry got used
>   * @cache - cache the entry belongs to
> diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> index 86c9a8b480c5..e2d9f2f926a4 100644
> --- a/include/linux/mbcache.h
> +++ b/include/linux/mbcache.h
> @@ -9,6 +9,8 @@
>  
>  struct mb_cache;
>  
> +typedef sector_t cache_value_t;
> +
>  struct mb_cache_entry {
>  	/* List of entries in cache - protected by cache->c_list_lock */
>  	struct list_head	e_list;
> @@ -19,15 +21,15 @@ struct mb_cache_entry {
>  	u32			e_key;
>  	u32			e_referenced:1;
>  	u32			e_reusable:1;
> -	/* Block number of hashed block - stable during lifetime of the entry */
> -	sector_t		e_block;
> +	/* User provided value - stable during lifetime of the entry */
> +	cache_value_t		e_value;
>  };
>  
>  struct mb_cache *mb_cache_create(int bucket_bits);
>  void mb_cache_destroy(struct mb_cache *cache);
>  
>  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable);
> +			  cache_value_t value, bool reusable);
>  void __mb_cache_entry_free(struct mb_cache_entry *entry);
>  static inline int mb_cache_entry_put(struct mb_cache *cache,
>  				     struct mb_cache_entry *entry)
> @@ -38,10 +40,10 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
>  	return 1;
>  }
>  
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				  sector_t block);
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key,
> +			   cache_value_t value);
>  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block);
> +					  cache_value_t value);
>  struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
>  						 u32 key);
>  struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
> -- 
> 2.13.0.219.gdb65acc882-goog
> 
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 23/28] mbcache: make mbcache more generic
@ 2017-06-15  7:41     ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-15  7:41 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed 31-05-17 01:15:12, Tahsin Erdogan wrote:
> Large xattr feature would like to use the mbcache for xattr value
> deduplication. Current implementation is geared towards xattr block
> deduplication. Make it more generic so that it can be used by both.

Can you explain a bit more what you mean by "make it more generic"? It
seems you just rename a couple of things here...

								Honza

> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext2/xattr.c         | 18 +++++++++---------
>  fs/ext4/xattr.c         | 10 +++++-----
>  fs/mbcache.c            | 43 +++++++++++++++++++++----------------------
>  include/linux/mbcache.h | 14 ++++++++------
>  4 files changed, 43 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
> index fbdb8f171893..1e5f76070580 100644
> --- a/fs/ext2/xattr.c
> +++ b/fs/ext2/xattr.c
> @@ -493,8 +493,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
>  			 * This must happen under buffer lock for
>  			 * ext2_xattr_set2() to reliably detect modified block
>  			 */
> -			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
> -						    hash, bh->b_blocknr);
> +			mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash,
> +					      bh->b_blocknr);
>  
>  			/* keep the buffer locked while modifying it. */
>  		} else {
> @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
>  			 * This must happen under buffer lock for
>  			 * ext2_xattr_set2() to reliably detect freed block
>  			 */
> -			mb_cache_entry_delete_block(ext2_mb_cache,
> -						    hash, old_bh->b_blocknr);
> +			mb_cache_entry_delete(ext2_mb_cache, hash,
> +					      old_bh->b_blocknr);
>  			/* Free the old block. */
>  			ea_bdebug(old_bh, "freeing");
>  			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
> @@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode)
>  		 * This must happen under buffer lock for ext2_xattr_set2() to
>  		 * reliably detect freed block
>  		 */
> -		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
> -					    hash, bh->b_blocknr);
> +		mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash,
> +				      bh->b_blocknr);
>  		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
>  		get_bh(bh);
>  		bforget(bh);
> @@ -907,11 +907,11 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
>  	while (ce) {
>  		struct buffer_head *bh;
>  
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
>  		if (!bh) {
>  			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
>  				"inode %ld: block %ld read error",
> -				inode->i_ino, (unsigned long) ce->e_block);
> +				inode->i_ino, (unsigned long) ce->e_value);
>  		} else {
>  			lock_buffer(bh);
>  			/*
> @@ -931,7 +931,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
>  			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
>  				   EXT2_XATTR_REFCOUNT_MAX) {
>  				ea_idebug(inode, "block %ld refcount %d>%d",
> -					  (unsigned long) ce->e_block,
> +					  (unsigned long) ce->e_value,
>  					  le32_to_cpu(HDR(bh)->h_refcount),
>  					  EXT2_XATTR_REFCOUNT_MAX);
>  			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 886d06e409b6..772948f168c3 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>  		 * This must happen under buffer lock for
>  		 * ext4_xattr_block_set() to reliably detect freed block
>  		 */
> -		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
> +		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>  		get_bh(bh);
>  		unlock_buffer(bh);
>  		ext4_free_blocks(handle, inode, bh, 0, 1,
> @@ -1115,8 +1115,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			 * ext4_xattr_block_set() to reliably detect modified
>  			 * block
>  			 */
> -			mb_cache_entry_delete_block(ext4_mb_cache, hash,
> -						    bs->bh->b_blocknr);
> +			mb_cache_entry_delete(ext4_mb_cache, hash,
> +					      bs->bh->b_blocknr);
>  			ea_bdebug(bs->bh, "modifying in-place");
>  			error = ext4_xattr_set_entry(i, s, handle, inode);
>  			if (!error) {
> @@ -2238,10 +2238,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
>  	while (ce) {
>  		struct buffer_head *bh;
>  
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
>  		if (!bh) {
>  			EXT4_ERROR_INODE(inode, "block %lu read error",
> -					 (unsigned long) ce->e_block);
> +					 (unsigned long) ce->e_value);
>  		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
>  			*pce = ce;
>  			return bh;
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index b19be429d655..77a5b99d8f92 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -10,7 +10,7 @@
>  /*
>   * Mbcache is a simple key-value store. Keys need not be unique, however
>   * key-value pairs are expected to be unique (we use this fact in
> - * mb_cache_entry_delete_block()).
> + * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
>   * They use hash of a block contents as a key and block number as a value.
> @@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
>   * @cache - cache where the entry should be created
>   * @mask - gfp mask with which the entry should be allocated
>   * @key - key of the entry
> - * @block - block that contains data
> - * @reusable - is the block reusable by other inodes?
> + * @value - value of the entry
> + * @reusable - is the entry reusable by others?
>   *
> - * Creates entry in @cache with key @key and records that data is stored in
> - * block @block. The function returns -EBUSY if entry with the same key
> - * and for the same block already exists in cache. Otherwise 0 is returned.
> + * Creates entry in @cache with key @key and value @value. The function returns
> + * -EBUSY if entry with the same key and value already exists in cache.
> + * Otherwise 0 is returned.
>   */
>  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable)
> +			  cache_value_t value, bool reusable)
>  {
>  	struct mb_cache_entry *entry, *dup;
>  	struct hlist_bl_node *dup_node;
> @@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
>  	/* One ref for hash, one ref returned */
>  	atomic_set(&entry->e_refcnt, 1);
>  	entry->e_key = key;
> -	entry->e_block = block;
> +	entry->e_value = value;
>  	entry->e_reusable = reusable;
>  	head = mb_cache_entry_head(cache, key);
>  	hlist_bl_lock(head);
>  	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
> -		if (dup->e_key == key && dup->e_block == block) {
> +		if (dup->e_key == key && dup->e_value == value) {
>  			hlist_bl_unlock(head);
>  			kmem_cache_free(mb_entry_cache, entry);
>  			return -EBUSY;
> @@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
>  EXPORT_SYMBOL(mb_cache_entry_find_next);
>  
>  /*
> - * mb_cache_entry_get - get a cache entry by block number (and key)
> + * mb_cache_entry_get - get a cache entry by value (and key)
>   * @cache - cache we work with
> - * @key - key of block number @block
> - * @block - block number
> + * @key - key
> + * @value - value
>   */
>  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block)
> +					  cache_value_t value)
>  {
>  	struct hlist_bl_node *node;
>  	struct hlist_bl_head *head;
> @@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  	head = mb_cache_entry_head(cache, key);
>  	hlist_bl_lock(head);
>  	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
>  			atomic_inc(&entry->e_refcnt);
>  			goto out;
>  		}
> @@ -214,15 +214,14 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  }
>  EXPORT_SYMBOL(mb_cache_entry_get);
>  
> -/* mb_cache_entry_delete_block - remove information about block from cache
> +/* mb_cache_entry_delete - remove a cache entry
>   * @cache - cache we work with
> - * @key - key of block @block
> - * @block - block number
> + * @key - key
> + * @value - value
>   *
> - * Remove entry from cache @cache with key @key with data stored in @block.
> + * Remove entry from cache @cache with key @key and value @value.
>   */
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				 sector_t block)
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, cache_value_t value)
>  {
>  	struct hlist_bl_node *node;
>  	struct hlist_bl_head *head;
> @@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
>  	head = mb_cache_entry_head(cache, key);
>  	hlist_bl_lock(head);
>  	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
>  			/* We keep hash list reference to keep entry alive */
>  			hlist_bl_del_init(&entry->e_hash_list);
>  			hlist_bl_unlock(head);
> @@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
>  	}
>  	hlist_bl_unlock(head);
>  }
> -EXPORT_SYMBOL(mb_cache_entry_delete_block);
> +EXPORT_SYMBOL(mb_cache_entry_delete);
>  
>  /* mb_cache_entry_touch - cache entry got used
>   * @cache - cache the entry belongs to
> diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> index 86c9a8b480c5..e2d9f2f926a4 100644
> --- a/include/linux/mbcache.h
> +++ b/include/linux/mbcache.h
> @@ -9,6 +9,8 @@
>  
>  struct mb_cache;
>  
> +typedef sector_t cache_value_t;
> +
>  struct mb_cache_entry {
>  	/* List of entries in cache - protected by cache->c_list_lock */
>  	struct list_head	e_list;
> @@ -19,15 +21,15 @@ struct mb_cache_entry {
>  	u32			e_key;
>  	u32			e_referenced:1;
>  	u32			e_reusable:1;
> -	/* Block number of hashed block - stable during lifetime of the entry */
> -	sector_t		e_block;
> +	/* User provided value - stable during lifetime of the entry */
> +	cache_value_t		e_value;
>  };
>  
>  struct mb_cache *mb_cache_create(int bucket_bits);
>  void mb_cache_destroy(struct mb_cache *cache);
>  
>  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable);
> +			  cache_value_t value, bool reusable);
>  void __mb_cache_entry_free(struct mb_cache_entry *entry);
>  static inline int mb_cache_entry_put(struct mb_cache *cache,
>  				     struct mb_cache_entry *entry)
> @@ -38,10 +40,10 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
>  	return 1;
>  }
>  
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				  sector_t block);
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key,
> +			   cache_value_t value);
>  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block);
> +					  cache_value_t value);
>  struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
>  						 u32 key);
>  struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
> -- 
> 2.13.0.219.gdb65acc882-goog
> 
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-05-31  8:15 ` [PATCH 28/28] quota: add extra inode count to dquot transfer functions Tahsin Erdogan
@ 2017-06-15  7:57     ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-15  7:57 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed 31-05-17 01:15:17, Tahsin Erdogan wrote:
> Ext4 ea_inode feature allows storing xattr values in external inodes to
> be able to store values that are bigger than a block in size. Ext4 also
> has deduplication support for these types of inodes. With deduplication,
> the actual storage waste is eliminated but the users of such inodes are
> still charged full quota for the inodes as if there was no sharing
> happening in the background.
> 
> This design requires ext4 to manually charge the users because the
> inodes are shared.
> 
> An implication of this is that, if someone calls chown on a file that
> has such references we need to transfer the quota for the file and xattr
> inodes. Current dquot_transfer() function implicitly transfers one inode
> charge. In our case, we would like to specify additional inodes to be
> transferred.

Hum, I'd rather handle this similarly to how we handle delalloc reserved
space: add a callback to dq_ops to get the "inode usage" of an inode and then
use it in dquot_transfer(), dquot_free_inode(), and dquot_alloc_inode().

								Honza

 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext2/inode.c          |  2 +-
>  fs/ext4/inode.c          |  8 ++++++-
>  fs/ext4/ioctl.c          | 13 +++++++++++-
>  fs/ext4/xattr.c          | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/xattr.h          |  2 ++
>  fs/jfs/file.c            |  2 +-
>  fs/ocfs2/file.c          |  2 +-
>  fs/quota/dquot.c         | 16 +++++++-------
>  fs/reiserfs/inode.c      |  2 +-
>  include/linux/quotaops.h |  8 ++++---
>  10 files changed, 93 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 2dcbd5698884..a13ba5dcb355 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -1656,7 +1656,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
>  	}
>  	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
>  	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
> -		error = dquot_transfer(inode, iattr);
> +		error = dquot_transfer(inode, iattr, 0);
>  		if (error)
>  			return error;
>  	}
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 6f5872197d6c..28abbbdbbb80 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5267,6 +5267,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  	int error, rc = 0;
>  	int orphan = 0;
>  	const unsigned int ia_valid = attr->ia_valid;
> +	int ea_inode_refs;
>  
>  	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>  		return -EIO;
> @@ -5293,7 +5294,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  			error = PTR_ERR(handle);
>  			goto err_out;
>  		}
> -		error = dquot_transfer(inode, attr);
> +
> +		down_read(&EXT4_I(inode)->xattr_sem);
> +		error = ea_inode_refs = ext4_xattr_inode_count(inode);
> +		if (ea_inode_refs >= 0)
> +			error = dquot_transfer(inode, attr, ea_inode_refs);
> +		up_read(&EXT4_I(inode)->xattr_sem);
>  		if (error) {
>  			ext4_journal_stop(handle);
>  			return error;
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index dde8deb11e59..9938dc8e24c8 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -21,6 +21,7 @@
>  #include "ext4.h"
>  #include <linux/fsmap.h>
>  #include "fsmap.h"
> +#include "xattr.h"
>  #include <trace/events/ext4.h>
>  
>  /**
> @@ -319,6 +320,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  	struct ext4_iloc iloc;
>  	struct ext4_inode *raw_inode;
>  	struct dquot *transfer_to[MAXQUOTAS] = { };
> +	int ea_inode_refs;
>  
>  	if (!ext4_has_feature_project(sb)) {
>  		if (projid != EXT4_DEF_PROJID)
> @@ -371,9 +373,17 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  	if (err)
>  		goto out_stop;
>  
> +	down_read(&EXT4_I(inode)->xattr_sem);
> +	ea_inode_refs = ext4_xattr_inode_count(inode);
> +	if (ea_inode_refs < 0) {
> +		up_read(&EXT4_I(inode)->xattr_sem);
> +		err = ea_inode_refs;
> +		goto out_stop;
> +	}
> +
>  	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
>  	if (!IS_ERR(transfer_to[PRJQUOTA])) {
> -		err = __dquot_transfer(inode, transfer_to);
> +		err = __dquot_transfer(inode, transfer_to, ea_inode_refs);
>  		dqput(transfer_to[PRJQUOTA]);
>  		if (err)
>  			goto out_dirty;
> @@ -382,6 +392,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  	EXT4_I(inode)->i_projid = kprojid;
>  	inode->i_ctime = current_time(inode);
>  out_dirty:
> +	up_read(&EXT4_I(inode)->xattr_sem);
>  	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
>  	if (!err)
>  		err = rc;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index caddc176a612..1d6fcbb01517 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -701,6 +701,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +int ext4_xattr_inode_count(struct inode *inode)
> +{
> +	struct ext4_iloc iloc = { .bh = NULL };
> +	struct buffer_head *bh = NULL;
> +	struct ext4_inode *raw_inode;
> +	struct ext4_xattr_ibody_header *header;
> +	struct ext4_xattr_entry *entry;
> +	int inode_count = 0;
> +	void *end;
> +	int ret;
> +
> +	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
> +
> +	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
> +		ret = ext4_get_inode_loc(inode, &iloc);
> +		if (ret)
> +			goto out;
> +		raw_inode = ext4_raw_inode(&iloc);
> +		header = IHDR(inode, raw_inode);
> +		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
> +		ret = xattr_check_inode(inode, header, end);
> +		if (ret)
> +			goto out;
> +
> +		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +		     entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				inode_count++;
> +	}
> +
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			ret = -EIO;
> +			goto out;
> +		}
> +
> +		if (ext4_xattr_check_block(inode, bh)) {
> +			ret = -EFSCORRUPTED;
> +			goto out;
> +		}
> +
> +		for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +		     entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				inode_count++;
> +	}
> +	ret = inode_count;
> +out:
> +	brelse(iloc.bh);
> +	brelse(bh);
> +	return ret;
> +}
> +
>  static inline size_t round_up_cluster(struct inode *inode, size_t length)
>  {
>  	struct super_block *sb = inode->i_sb;
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 67616cb9a059..8ef6fe123255 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
>  #else
>  static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
>  #endif
> +
> +int ext4_xattr_inode_count(struct inode *inode);
> diff --git a/fs/jfs/file.c b/fs/jfs/file.c
> index 739492c7a3fd..b08e0b0449a7 100644
> --- a/fs/jfs/file.c
> +++ b/fs/jfs/file.c
> @@ -114,7 +114,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
>  	}
>  	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
>  	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
> -		rc = dquot_transfer(inode, iattr);
> +		rc = dquot_transfer(inode, iattr, 0);
>  		if (rc)
>  			return rc;
>  	}
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index bfeb647459d9..d3cbf6467af6 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1259,7 +1259,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
>  			mlog_errno(status);
>  			goto bail_unlock;
>  		}
> -		status = __dquot_transfer(inode, transfer_to);
> +		status = __dquot_transfer(inode, transfer_to, 0);
>  		if (status < 0)
>  			goto bail_commit;
>  	} else {
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 48813aeaab80..16e13d554aaa 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -1906,10 +1906,12 @@ EXPORT_SYMBOL(dquot_free_inode);
>   * We are holding reference on transfer_from & transfer_to, no need to
>   * protect them by srcu_read_lock().
>   */
> -int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
> +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to,
> +		     int inodes_extra)
>  {
>  	qsize_t space, cur_space;
>  	qsize_t rsv_space = 0;
> +	qsize_t inode_count = 1 + inodes_extra;
>  	struct dquot *transfer_from[MAXQUOTAS] = {};
>  	int cnt, ret = 0;
>  	char is_valid[MAXQUOTAS] = {};
> @@ -1946,7 +1948,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  			continue;
>  		is_valid[cnt] = 1;
>  		transfer_from[cnt] = i_dquot(inode)[cnt];
> -		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
> +		ret = check_idq(transfer_to[cnt], inode_count, &warn_to[cnt]);
>  		if (ret)
>  			goto over_quota;
>  		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
> @@ -1963,7 +1965,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  		/* Due to IO error we might not have transfer_from[] structure */
>  		if (transfer_from[cnt]) {
>  			int wtype;
> -			wtype = info_idq_free(transfer_from[cnt], 1);
> +			wtype = info_idq_free(transfer_from[cnt], inode_count);
>  			if (wtype != QUOTA_NL_NOWARN)
>  				prepare_warning(&warn_from_inodes[cnt],
>  						transfer_from[cnt], wtype);
> @@ -1971,13 +1973,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  			if (wtype != QUOTA_NL_NOWARN)
>  				prepare_warning(&warn_from_space[cnt],
>  						transfer_from[cnt], wtype);
> -			dquot_decr_inodes(transfer_from[cnt], 1);
> +			dquot_decr_inodes(transfer_from[cnt], inode_count);
>  			dquot_decr_space(transfer_from[cnt], cur_space);
>  			dquot_free_reserved_space(transfer_from[cnt],
>  						  rsv_space);
>  		}
>  
> -		dquot_incr_inodes(transfer_to[cnt], 1);
> +		dquot_incr_inodes(transfer_to[cnt], inode_count);
>  		dquot_incr_space(transfer_to[cnt], cur_space);
>  		dquot_resv_space(transfer_to[cnt], rsv_space);
>  
> @@ -2005,7 +2007,7 @@ EXPORT_SYMBOL(__dquot_transfer);
>  /* Wrapper for transferring ownership of an inode for uid/gid only
>   * Called from FSXXX_setattr()
>   */
> -int dquot_transfer(struct inode *inode, struct iattr *iattr)
> +int dquot_transfer(struct inode *inode, struct iattr *iattr, int inodes_extra)
>  {
>  	struct dquot *transfer_to[MAXQUOTAS] = {};
>  	struct dquot *dquot;
> @@ -2037,7 +2039,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
>  		}
>  		transfer_to[GRPQUOTA] = dquot;
>  	}
> -	ret = __dquot_transfer(inode, transfer_to);
> +	ret = __dquot_transfer(inode, transfer_to, inodes_extra);
>  out_put:
>  	dqput_all(transfer_to);
>  	return ret;
> diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
> index 873fc04e9403..51586051b5dd 100644
> --- a/fs/reiserfs/inode.c
> +++ b/fs/reiserfs/inode.c
> @@ -3370,7 +3370,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
>  		reiserfs_write_unlock(inode->i_sb);
>  		if (error)
>  			goto out;
> -		error = dquot_transfer(inode, attr);
> +		error = dquot_transfer(inode, attr, 0);
>  		reiserfs_write_lock(inode->i_sb);
>  		if (error) {
>  			journal_end(&th);
> diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
> index dda22f45fc1b..b7bcd9c6db6c 100644
> --- a/include/linux/quotaops.h
> +++ b/include/linux/quotaops.h
> @@ -106,8 +106,9 @@ int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
>  int dquot_set_dqblk(struct super_block *sb, struct kqid id,
>  		struct qc_dqblk *di);
>  
> -int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
> -int dquot_transfer(struct inode *inode, struct iattr *iattr);
> +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to,
> +		int inodes_extra);
> +int dquot_transfer(struct inode *inode, struct iattr *iattr, int inodes_extra);
>  
>  static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
>  {
> @@ -226,7 +227,8 @@ static inline void dquot_free_inode(struct inode *inode)
>  {
>  }
>  
> -static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
> +static inline int dquot_transfer(struct inode *inode, struct iattr *iattr,
> +		int inodes_extra)
>  {
>  	return 0;
>  }
> -- 
> 2.13.0.219.gdb65acc882-goog
> 
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 28/28] quota: add extra inode count to dquot transfer functions
@ 2017-06-15  7:57     ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-15  7:57 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Wed 31-05-17 01:15:17, Tahsin Erdogan wrote:
> Ext4 ea_inode feature allows storing xattr values in external inodes to
> be able to store values that are bigger than a block in size. Ext4 also
> has deduplication support for these types of inodes. With deduplication,
> the actual storage waste is eliminated but the users of such inodes are
> still charged full quota for the inodes as if there was no sharing
> happening in the background.
> 
> This design requires ext4 to manually charge the users because the
> inodes are shared.
> 
> An implication of this is that, if someone calls chown on a file that
> has such references we need to transfer the quota for the file and xattr
> inodes. Current dquot_transfer() function implicitly transfers one inode
> charge. In our case, we would like to specify additional inodes to be
> transferred.

Hum, I'd rather handle this similarly to how we handle delalloc reserved
space: add a callback to dq_ops to get the "inode usage" of an inode and then
use it in dquot_transfer(), dquot_free_inode(), and dquot_alloc_inode().

								Honza

 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/ext2/inode.c          |  2 +-
>  fs/ext4/inode.c          |  8 ++++++-
>  fs/ext4/ioctl.c          | 13 +++++++++++-
>  fs/ext4/xattr.c          | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/xattr.h          |  2 ++
>  fs/jfs/file.c            |  2 +-
>  fs/ocfs2/file.c          |  2 +-
>  fs/quota/dquot.c         | 16 +++++++-------
>  fs/reiserfs/inode.c      |  2 +-
>  include/linux/quotaops.h |  8 ++++---
>  10 files changed, 93 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 2dcbd5698884..a13ba5dcb355 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -1656,7 +1656,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
>  	}
>  	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
>  	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
> -		error = dquot_transfer(inode, iattr);
> +		error = dquot_transfer(inode, iattr, 0);
>  		if (error)
>  			return error;
>  	}
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 6f5872197d6c..28abbbdbbb80 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5267,6 +5267,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  	int error, rc = 0;
>  	int orphan = 0;
>  	const unsigned int ia_valid = attr->ia_valid;
> +	int ea_inode_refs;
>  
>  	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>  		return -EIO;
> @@ -5293,7 +5294,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  			error = PTR_ERR(handle);
>  			goto err_out;
>  		}
> -		error = dquot_transfer(inode, attr);
> +
> +		down_read(&EXT4_I(inode)->xattr_sem);
> +		error = ea_inode_refs = ext4_xattr_inode_count(inode);
> +		if (ea_inode_refs >= 0)
> +			error = dquot_transfer(inode, attr, ea_inode_refs);
> +		up_read(&EXT4_I(inode)->xattr_sem);
>  		if (error) {
>  			ext4_journal_stop(handle);
>  			return error;
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index dde8deb11e59..9938dc8e24c8 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -21,6 +21,7 @@
>  #include "ext4.h"
>  #include <linux/fsmap.h>
>  #include "fsmap.h"
> +#include "xattr.h"
>  #include <trace/events/ext4.h>
>  
>  /**
> @@ -319,6 +320,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  	struct ext4_iloc iloc;
>  	struct ext4_inode *raw_inode;
>  	struct dquot *transfer_to[MAXQUOTAS] = { };
> +	int ea_inode_refs;
>  
>  	if (!ext4_has_feature_project(sb)) {
>  		if (projid != EXT4_DEF_PROJID)
> @@ -371,9 +373,17 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  	if (err)
>  		goto out_stop;
>  
> +	down_read(&EXT4_I(inode)->xattr_sem);
> +	ea_inode_refs = ext4_xattr_inode_count(inode);
> +	if (ea_inode_refs < 0) {
> +		up_read(&EXT4_I(inode)->xattr_sem);
> +		err = ea_inode_refs;
> +		goto out_stop;
> +	}
> +
>  	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
>  	if (!IS_ERR(transfer_to[PRJQUOTA])) {
> -		err = __dquot_transfer(inode, transfer_to);
> +		err = __dquot_transfer(inode, transfer_to, ea_inode_refs);
>  		dqput(transfer_to[PRJQUOTA]);
>  		if (err)
>  			goto out_dirty;
> @@ -382,6 +392,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  	EXT4_I(inode)->i_projid = kprojid;
>  	inode->i_ctime = current_time(inode);
>  out_dirty:
> +	up_read(&EXT4_I(inode)->xattr_sem);
>  	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
>  	if (!err)
>  		err = rc;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index caddc176a612..1d6fcbb01517 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -701,6 +701,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +int ext4_xattr_inode_count(struct inode *inode)
> +{
> +	struct ext4_iloc iloc = { .bh = NULL };
> +	struct buffer_head *bh = NULL;
> +	struct ext4_inode *raw_inode;
> +	struct ext4_xattr_ibody_header *header;
> +	struct ext4_xattr_entry *entry;
> +	int inode_count = 0;
> +	void *end;
> +	int ret;
> +
> +	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
> +
> +	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
> +		ret = ext4_get_inode_loc(inode, &iloc);
> +		if (ret)
> +			goto out;
> +		raw_inode = ext4_raw_inode(&iloc);
> +		header = IHDR(inode, raw_inode);
> +		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
> +		ret = xattr_check_inode(inode, header, end);
> +		if (ret)
> +			goto out;
> +
> +		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +		     entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				inode_count++;
> +	}
> +
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			ret = -EIO;
> +			goto out;
> +		}
> +
> +		if (ext4_xattr_check_block(inode, bh)) {
> +			ret = -EFSCORRUPTED;
> +			goto out;
> +		}
> +
> +		for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +		     entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				inode_count++;
> +	}
> +	ret = inode_count;
> +out:
> +	brelse(iloc.bh);
> +	brelse(bh);
> +	return ret;
> +}
> +
>  static inline size_t round_up_cluster(struct inode *inode, size_t length)
>  {
>  	struct super_block *sb = inode->i_sb;
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 67616cb9a059..8ef6fe123255 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
>  #else
>  static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
>  #endif
> +
> +int ext4_xattr_inode_count(struct inode *inode);
> diff --git a/fs/jfs/file.c b/fs/jfs/file.c
> index 739492c7a3fd..b08e0b0449a7 100644
> --- a/fs/jfs/file.c
> +++ b/fs/jfs/file.c
> @@ -114,7 +114,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
>  	}
>  	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
>  	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
> -		rc = dquot_transfer(inode, iattr);
> +		rc = dquot_transfer(inode, iattr, 0);
>  		if (rc)
>  			return rc;
>  	}
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index bfeb647459d9..d3cbf6467af6 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1259,7 +1259,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
>  			mlog_errno(status);
>  			goto bail_unlock;
>  		}
> -		status = __dquot_transfer(inode, transfer_to);
> +		status = __dquot_transfer(inode, transfer_to, 0);
>  		if (status < 0)
>  			goto bail_commit;
>  	} else {
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 48813aeaab80..16e13d554aaa 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -1906,10 +1906,12 @@ EXPORT_SYMBOL(dquot_free_inode);
>   * We are holding reference on transfer_from & transfer_to, no need to
>   * protect them by srcu_read_lock().
>   */
> -int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
> +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to,
> +		     int inodes_extra)
>  {
>  	qsize_t space, cur_space;
>  	qsize_t rsv_space = 0;
> +	qsize_t inode_count = 1 + inodes_extra;
>  	struct dquot *transfer_from[MAXQUOTAS] = {};
>  	int cnt, ret = 0;
>  	char is_valid[MAXQUOTAS] = {};
> @@ -1946,7 +1948,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  			continue;
>  		is_valid[cnt] = 1;
>  		transfer_from[cnt] = i_dquot(inode)[cnt];
> -		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
> +		ret = check_idq(transfer_to[cnt], inode_count, &warn_to[cnt]);
>  		if (ret)
>  			goto over_quota;
>  		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
> @@ -1963,7 +1965,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  		/* Due to IO error we might not have transfer_from[] structure */
>  		if (transfer_from[cnt]) {
>  			int wtype;
> -			wtype = info_idq_free(transfer_from[cnt], 1);
> +			wtype = info_idq_free(transfer_from[cnt], inode_count);
>  			if (wtype != QUOTA_NL_NOWARN)
>  				prepare_warning(&warn_from_inodes[cnt],
>  						transfer_from[cnt], wtype);
> @@ -1971,13 +1973,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  			if (wtype != QUOTA_NL_NOWARN)
>  				prepare_warning(&warn_from_space[cnt],
>  						transfer_from[cnt], wtype);
> -			dquot_decr_inodes(transfer_from[cnt], 1);
> +			dquot_decr_inodes(transfer_from[cnt], inode_count);
>  			dquot_decr_space(transfer_from[cnt], cur_space);
>  			dquot_free_reserved_space(transfer_from[cnt],
>  						  rsv_space);
>  		}
>  
> -		dquot_incr_inodes(transfer_to[cnt], 1);
> +		dquot_incr_inodes(transfer_to[cnt], inode_count);
>  		dquot_incr_space(transfer_to[cnt], cur_space);
>  		dquot_resv_space(transfer_to[cnt], rsv_space);
>  
> @@ -2005,7 +2007,7 @@ EXPORT_SYMBOL(__dquot_transfer);
>  /* Wrapper for transferring ownership of an inode for uid/gid only
>   * Called from FSXXX_setattr()
>   */
> -int dquot_transfer(struct inode *inode, struct iattr *iattr)
> +int dquot_transfer(struct inode *inode, struct iattr *iattr, int inodes_extra)
>  {
>  	struct dquot *transfer_to[MAXQUOTAS] = {};
>  	struct dquot *dquot;
> @@ -2037,7 +2039,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
>  		}
>  		transfer_to[GRPQUOTA] = dquot;
>  	}
> -	ret = __dquot_transfer(inode, transfer_to);
> +	ret = __dquot_transfer(inode, transfer_to, inodes_extra);
>  out_put:
>  	dqput_all(transfer_to);
>  	return ret;
> diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
> index 873fc04e9403..51586051b5dd 100644
> --- a/fs/reiserfs/inode.c
> +++ b/fs/reiserfs/inode.c
> @@ -3370,7 +3370,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
>  		reiserfs_write_unlock(inode->i_sb);
>  		if (error)
>  			goto out;
> -		error = dquot_transfer(inode, attr);
> +		error = dquot_transfer(inode, attr, 0);
>  		reiserfs_write_lock(inode->i_sb);
>  		if (error) {
>  			journal_end(&th);
> diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
> index dda22f45fc1b..b7bcd9c6db6c 100644
> --- a/include/linux/quotaops.h
> +++ b/include/linux/quotaops.h
> @@ -106,8 +106,9 @@ int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
>  int dquot_set_dqblk(struct super_block *sb, struct kqid id,
>  		struct qc_dqblk *di);
>  
> -int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
> -int dquot_transfer(struct inode *inode, struct iattr *iattr);
> +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to,
> +		int inodes_extra);
> +int dquot_transfer(struct inode *inode, struct iattr *iattr, int inodes_extra);
>  
>  static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
>  {
> @@ -226,7 +227,8 @@ static inline void dquot_free_inode(struct inode *inode)
>  {
>  }
>  
> -static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
> +static inline int dquot_transfer(struct inode *inode, struct iattr *iattr,
> +		int inodes_extra)
>  {
>  	return 0;
>  }
> -- 
> 2.13.0.219.gdb65acc882-goog
> 
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 23/28] mbcache: make mbcache more generic
  2017-06-15  7:41     ` [Ocfs2-devel] " Jan Kara
  (?)
@ 2017-06-15 18:25     ` Tahsin Erdogan
  2017-06-19  8:50         ` [Ocfs2-devel] " Jan Kara
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-15 18:25 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Thu, Jun 15, 2017 at 12:41 AM, Jan Kara <jack@suse.cz> wrote:
> Can you explain a bit more what do you mean by "make it more generic" as it
> seems you just rename a couple of things here...

The change is really just that, having names that are more generic
which do not limit use cases to block sharing. In a subsequent patch
in the series ("[PATCH v4 27/28] ext4: xattr inode deduplication"), we
start using the mbcache code to share xattr inodes. With that patch,
old mb_cache_entry.e_block field could be holding either a block
number or an inode number, so I renamed things to make them more
generic.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-06-15  7:57     ` [Ocfs2-devel] " Jan Kara
  (?)
@ 2017-06-17  1:50     ` Tahsin Erdogan
  2017-06-19  9:03         ` [Ocfs2-devel] " Jan Kara
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-17  1:50 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Thu, Jun 15, 2017 at 12:57 AM, Jan Kara <jack@suse.cz> wrote:
> Hum, rather handle this similarly to how we handle delalloc reserved space.
> Add a callback to dq_ops to get "inode usage" of an inode and then use it
> in dquot_transfer(), dquot_free_inode(), dquot_alloc_inode().

I tried that approach by adding a "int get_inode_usage(struct inode
*inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
ext4 code that calculates the number of internal inodes
(ext4_xattr_inode_count()) is subject to failures so the callback has
to be able to report errors. And, that itself is problematic because
we can't afford to have errors in dquot_free_inode(). If you have
thoughts about how to address this please let me know.

Alternatively, I could try to make this patch less intrusive by
keeping the existing dquot_transfer() signature and add a new
dquot_transfer_usage() that accepts inode_usage as a parameter. What
do you think?

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 23/28] mbcache: make mbcache more generic
  2017-06-15 18:25     ` Tahsin Erdogan
@ 2017-06-19  8:50         ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-19  8:50 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel

On Thu 15-06-17 11:25:02, Tahsin Erdogan wrote:
> On Thu, Jun 15, 2017 at 12:41 AM, Jan Kara <jack@suse.cz> wrote:
> > Can you explain a bit more what do you mean by "make it more generic" as it
> > seems you just rename a couple of things here...
> 
> The change is really just that, having names that are more generic
> which do not limit use cases to block sharing. In a subsequent patch
> in the series ("[PATCH v4 27/28] ext4: xattr inode deduplication"), we
> start using the mbcache code to share xattr inodes. With that patch,
> old mb_cache_entry.e_block field could be holding either a block
> number or an inode number, so I renamed things to make them more
> generic.

OK, then I'd suggest to change title to "mbcache: make mbcache naming more
generic" and explain what you wrote here in the changelog. Thanks!

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 23/28] mbcache: make mbcache more generic
@ 2017-06-19  8:50         ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-19  8:50 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel

On Thu 15-06-17 11:25:02, Tahsin Erdogan wrote:
> On Thu, Jun 15, 2017 at 12:41 AM, Jan Kara <jack@suse.cz> wrote:
> > Can you explain a bit more what do you mean by "make it more generic" as it
> > seems you just rename a couple of things here...
> 
> The change is really just that, having names that are more generic
> which do not limit use cases to block sharing. In a subsequent patch
> in the series ("[PATCH v4 27/28] ext4: xattr inode deduplication"), we
> start using the mbcache code to share xattr inodes. With that patch,
> old mb_cache_entry.e_block field could be holding either a block
> number or an inode number, so I renamed things to make them more
> generic.

OK, then I'd suggest to change title to "mbcache: make mbcache naming more
generic" and explain what you wrote here in the changelog. Thanks!

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-06-17  1:50     ` Tahsin Erdogan
@ 2017-06-19  9:03         ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-19  9:03 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel

On Fri 16-06-17 18:50:58, Tahsin Erdogan wrote:
> On Thu, Jun 15, 2017 at 12:57 AM, Jan Kara <jack@suse.cz> wrote:
> > Hum, rather handle this similarly to how we handle delalloc reserved space.
> > Add a callback to dq_ops to get "inode usage" of an inode and then use it
> > in dquot_transfer(), dquot_free_inode(), dquot_alloc_inode().
> 
> I tried that approach by adding a "int get_inode_usage(struct inode
> *inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
> ext4 code that calculates the number of internal inodes
> (ext4_xattr_inode_count()) is subject to failures so the callback has
> to be able to report errors. And, that itself is problematic because
> we can't afford to have errors in dquot_free_inode(). If you have
> thoughts about how to address this please let me know.

Well, you can just make dquot_free_inode() return error. Now most callers
won't be able to do much with an error from dquot_free_inode() but that's
the case also for other things during inode deletion - just handle it as
other fatal failures during inode freeing.

> Alternatively, I could try to make this patch less intrusive by
> keeping the existing dquot_transfer() signature and add a new
> dquot_transfer_usage() that accepts inode_usage as a parameter. What
> do you think?

That would be somewhat better than what you do in this patch but I prefer
to handle this like I suggested above.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 28/28] quota: add extra inode count to dquot transfer functions
@ 2017-06-19  9:03         ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-19  9:03 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel

On Fri 16-06-17 18:50:58, Tahsin Erdogan wrote:
> On Thu, Jun 15, 2017 at 12:57 AM, Jan Kara <jack@suse.cz> wrote:
> > Hum, rather handle this similarly to how we handle delalloc reserved space.
> > Add a callback to dq_ops to get "inode usage" of an inode and then use it
> > in dquot_transfer(), dquot_free_inode(), dquot_alloc_inode().
> 
> I tried that approach by adding a "int get_inode_usage(struct inode
> *inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
> ext4 code that calculates the number of internal inodes
> (ext4_xattr_inode_count()) is subject to failures so the callback has
> to be able to report errors. And, that itself is problematic because
> we can't afford to have errors in dquot_free_inode(). If you have
> thoughts about how to address this please let me know.

Well, you can just make dquot_free_inode() return error. Now most callers
won't be able to do much with an error from dquot_free_inode() but that's
the case also for other things during inode deletion - just handle it as
other fatal failures during inode freeing.

> Alternatively, I could try to make this patch less intrusive by
> keeping the existing dquot_transfer() signature and add a new
> dquot_transfer_usage() that accepts inode_usage as a parameter. What
> do you think?

That would be somewhat better than what you do in this patch but I prefer
to handle this like I suggested above.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-06-19  9:03         ` [Ocfs2-devel] " Jan Kara
  (?)
@ 2017-06-19 11:46         ` Tahsin Erdogan
  2017-06-19 12:36             ` [Ocfs2-devel] " Jan Kara
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-19 11:46 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

>> I tried that approach by adding a "int get_inode_usage(struct inode
>> *inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
>> ext4 code that calculates the number of internal inodes
>> (ext4_xattr_inode_count()) is subject to failures so the callback has
>> to be able to report errors. And, that itself is problematic because
>> we can't afford to have errors in dquot_free_inode(). If you have
>> thoughts about how to address this please let me know.
>
> Well, you can just make dquot_free_inode() return error. Now most callers
> won't be able to do much with an error from dquot_free_inode() but that's
> the case also for other things during inode deletion - just handle it as
> other fatal failures during inode freeing.
>
I just checked dquot_free_inode() to see whether it calls anything
that could fail. It calls mark_all_dquot_dirty() and ignores the
return code from it. I would like to follow the same for the
get_inode_usage() as the only use case for get_inode_usage() (ext4)
should not fail at inode free time.

Basically, I want to avoid changing return type from void to int
because it would create a new responsibility for the filesystem
implementations who do not know how to deal with it.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-06-19 11:46         ` Tahsin Erdogan
@ 2017-06-19 12:36             ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-19 12:36 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel

On Mon 19-06-17 04:46:00, Tahsin Erdogan wrote:
> >> I tried that approach by adding a "int get_inode_usage(struct inode
> >> *inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
> >> ext4 code that calculates the number of internal inodes
> >> (ext4_xattr_inode_count()) is subject to failures so the callback has
> >> to be able to report errors. And, that itself is problematic because
> >> we can't afford to have errors in dquot_free_inode(). If you have
> >> thoughts about how to address this please let me know.
> >
> > Well, you can just make dquot_free_inode() return error. Now most callers
> > won't be able to do much with an error from dquot_free_inode() but that's
> > the case also for other things during inode deletion - just handle it as
> > other fatal failures during inode freeing.
> >
> I just checked dquot_free_inode() to see whether it calls anything
> that could fail. It calls mark_all_dquot_dirty() and ignores the
> return code from it. I would like to follow the same for the
> get_inode_usage() as the only use case for get_inode_usage() (ext4)
> should not fail at inode free time.
> 
> Basically, I want to avoid changing return type from void to int
> because it would create a new responsibility for the filesystem
> implementations who do not know how to deal with it.

Heh, this "pushing of responsibility" looks like a silly game. If an error
can happen in a function, it is better to report it as far as easily
possible (unless we can cleanly handle it which we cannot here). I'm guilty
of making dquot_free_inode() ignore errors from mark_all_dquot_dirty() and
in retrospect it would have been better if these were propagated to the
caller as well. And eventually we can fix this if we decide we care enough.
I'm completely fine with just returning an error from dquot_free_inode()
and ignoring it in all the callers except for ext4. Then filesystems which
care enough can try to handle the error. That way we at least don't
increase the design debt from the past.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [Ocfs2-devel] [PATCH 28/28] quota: add extra inode count to dquot transfer functions
@ 2017-06-19 12:36             ` Jan Kara
  0 siblings, 0 replies; 100+ messages in thread
From: Jan Kara @ 2017-06-19 12:36 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Andreas Dilger,
	Dave Kleikamp, Alexander Viro, Mark Fasheh, Joel Becker,
	Jens Axboe, Deepa Dinamani, Mike Christie, Fabian Frederick,
	linux-ext4, linux-kernel, jfs-discussion, linux-fsdevel,
	ocfs2-devel, reiserfs-devel

On Mon 19-06-17 04:46:00, Tahsin Erdogan wrote:
> >> I tried that approach by adding a "int get_inode_usage(struct inode
> >> *inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
> >> ext4 code that calculates the number of internal inodes
> >> (ext4_xattr_inode_count()) is subject to failures so the callback has
> >> to be able to report errors. And, that itself is problematic because
> >> we can't afford to have errors in dquot_free_inode(). If you have
> >> thoughts about how to address this please let me know.
> >
> > Well, you can just make dquot_free_inode() return error. Now most callers
> > won't be able to do much with an error from dquot_free_inode() but that's
> > the case also for other things during inode deletion - just handle it as
> > other fatal failures during inode freeing.
> >
> I just checked dquot_free_inode() to see whether it calls anything
> that could fail. It calls mark_all_dquot_dirty() and ignores the
> return code from it. I would like to follow the same for the
> get_inode_usage() as the only use case for get_inode_usage() (ext4)
> should not fail at inode free time.
> 
> Basically, I want to avoid changing return type from void to int
> because it would create a new responsibility for the filesystem
> implementations who do not know how to deal with it.

Heh, this "pushing of responsibility" looks like a silly game. If an error
can happen in a function, it is better to report it as far as easily
possible (unless we can cleanly handle it which we cannot here). I'm guilty
of making dquot_free_inode() ignore errors from mark_all_dquot_dirty() and
in retrospect it would have been better if these were propagated to the
caller as well. And eventually we can fix this if we decide we care enough.
I'm completely fine with just returning an error from dquot_free_inode()
and ignoring it in all the callers except for ext4. Then filesystems which
care enough can try to handle the error. That way we at least don't
increase the design debt from the past.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH v2 18/31] ext4: retry storing value in external inode with xattr block too
  2017-05-31  8:15 ` [PATCH 18/28] ext4: retry storing value in external inode with xattr block too Tahsin Erdogan
@ 2017-06-20  8:56   ` Tahsin Erdogan
  0 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  8:56 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

When value size is <= EXT4_XATTR_MIN_LARGE_EA_SIZE(), and it
doesn't fit in either inline or xattr block, a second try is made to
store it in an external inode while storing the entry itself in inline
area. There should also be an attempt to store the entry in xattr block.

This patch adds a retry loop to do that. It also makes the caller the
sole decider on whether to store a value in an external inode.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2: fix checkpatch.pl warning about indented retry_inode: label

 fs/ext4/xattr.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index dcf7ec98f138..0dfae3f8e209 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -911,11 +911,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	int in_inode = i->in_inode;
 	int rc;
 
-	if (ext4_has_feature_ea_inode(inode->i_sb) &&
-	    (EXT4_XATTR_SIZE(i->value_len) >
-	     EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
-		in_inode = 1;
-
 	/* Compute min_offs and last. */
 	last = s->first;
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
@@ -1097,7 +1092,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 {
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
-	struct ext4_xattr_search *s = &bs->s;
+	struct ext4_xattr_search s_copy = bs->s;
+	struct ext4_xattr_search *s = &s_copy;
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
@@ -1519,6 +1515,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
 			goto cleanup;
 
+		if (ext4_has_feature_ea_inode(inode->i_sb) &&
+		    (EXT4_XATTR_SIZE(i.value_len) >
+			EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
+			i.in_inode = 1;
+retry_inode:
 		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
 		if (!error && !bs.s.not_found) {
 			i.value = NULL;
@@ -1530,20 +1531,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 					goto cleanup;
 			}
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
-			if (ext4_has_feature_ea_inode(inode->i_sb) &&
-			    error == -ENOSPC) {
-				/* xattr not fit to block, store at external
-				 * inode */
-				i.in_inode = 1;
-				error = ext4_xattr_ibody_set(handle, inode,
-							     &i, &is);
-			}
-			if (error)
-				goto cleanup;
-			if (!is.s.not_found) {
+			if (!error && !is.s.not_found) {
 				i.value = NULL;
 				error = ext4_xattr_ibody_set(handle, inode, &i,
 							     &is);
+			} else if (error == -ENOSPC) {
+				/*
+				 * Xattr does not fit in the block, store at
+				 * external inode if possible.
+				 */
+				if (ext4_has_feature_ea_inode(inode->i_sb) &&
+				    !i.in_inode) {
+					i.in_inode = 1;
+					goto retry_inode;
+				}
 			}
 		}
 	}
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v2 20/31] ext4: improve journal credit handling in set xattr paths
  2017-05-31  8:15 ` [PATCH 20/28] ext4: improve journal credit handling in set xattr paths Tahsin Erdogan
@ 2017-06-20  8:59   ` Tahsin Erdogan
  0 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  8:59 UTC (permalink / raw)
  To: linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

Both ext4_set_acl() and ext4_set_context() need to be made aware of
ea_inode feature when it comes to credits calculation.

Also add a sufficient credits check in ext4_xattr_set_handle() right
after xattr write lock is grabbed. Original credits calculation is done
outside the lock so there is a possibility that the initially calculated
credits are not sufficient anymore.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2: fixed checkpatch.pl warning about replacing spaces with tab

 fs/ext4/acl.c       |  7 ++++---
 fs/ext4/ext4_jbd2.h | 14 --------------
 fs/ext4/super.c     |  6 +++---
 fs/ext4/xattr.c     | 55 +++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/xattr.h     |  1 +
 5 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 3ec0e46de95f..74f7ac539e00 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -231,14 +231,15 @@ int
 ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	handle_t *handle;
-	int error, retries = 0;
+	int error, credits, retries = 0;
+	size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
 
 	error = dquot_initialize(inode);
 	if (error)
 		return error;
 retry:
-	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
-				    ext4_jbd2_credits_xattr(inode));
+	credits = ext4_xattr_set_credits(inode, acl_size);
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f97611171023..a5bda70feed5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,20 +104,6 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
-static inline int ext4_jbd2_credits_xattr(struct inode *inode)
-{
-	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
-
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-		credits += ext4_writepage_trans_blocks(inode) + 1;
-	return credits;
-}
-
-
 /*
  * Ext4 handle operation types -- for logging purposes
  */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d37c81f327e7..b02a23ec92ca 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1143,7 +1143,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 							void *fs_data)
 {
 	handle_t *handle = fs_data;
-	int res, res2, retries = 0;
+	int res, res2, credits, retries = 0;
 
 	res = ext4_convert_inline_data(inode);
 	if (res)
@@ -1178,8 +1178,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	handle = ext4_journal_start(inode, EXT4_HT_MISC,
-			ext4_jbd2_credits_xattr(inode));
+	credits = ext4_xattr_set_credits(inode, len);
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 97d33ecf0818..fd017faaf221 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1473,6 +1473,17 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	ext4_write_lock_xattr(inode, &no_expand);
 
+	/* Check journal credits under write lock. */
+	if (ext4_handle_valid(handle)) {
+		int credits;
+
+		credits = ext4_xattr_set_credits(inode, value_len);
+		if (!ext4_handle_has_enough_credits(handle, credits)) {
+			error = -ENOSPC;
+			goto cleanup;
+		}
+	}
+
 	error = ext4_reserve_inode_write(handle, inode, &is.iloc);
 	if (error)
 		goto cleanup;
@@ -1570,6 +1581,36 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+{
+	struct super_block *sb = inode->i_sb;
+	int credits;
+
+	if (!EXT4_SB(sb)->s_journal)
+		return 0;
+
+	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+	/*
+	 * In case of inline data, we may push out the data to a block,
+	 * so we need to reserve credits for this eventuality
+	 */
+	if (ext4_has_inline_data(inode))
+		credits += ext4_writepage_trans_blocks(inode) + 1;
+
+	if (ext4_has_feature_ea_inode(sb)) {
+		int nrblocks = (value_len + sb->s_blocksize - 1) >>
+					sb->s_blocksize_bits;
+
+		/* For new inode */
+		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+
+		/* For data blocks of EA inode */
+		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	}
+	return credits;
+}
+
 /*
  * ext4_xattr_set()
  *
@@ -1585,24 +1626,14 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 	handle_t *handle;
 	struct super_block *sb = inode->i_sb;
 	int error, retries = 0;
-	int credits = ext4_jbd2_credits_xattr(inode);
+	int credits;
 
 	error = dquot_initialize(inode);
 	if (error)
 		return error;
 
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
-
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
-
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
-	}
-
 retry:
+	credits = ext4_xattr_set_credits(inode, value_len);
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b6ef99d1a061..e82c5fe36a26 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -160,6 +160,7 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
 
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v2 23/31] mbcache: make mbcache naming more generic
  2017-06-19  8:50         ` [Ocfs2-devel] " Jan Kara
  (?)
@ 2017-06-20  9:01         ` Tahsin Erdogan
  2017-06-21 17:43           ` Andreas Dilger
  2017-06-21 18:33           ` Andreas Dilger
  -1 siblings, 2 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:01 UTC (permalink / raw)
  To: Jan Kara, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

Make names more generic so that mbcache usage is not limited to
block sharing. In a subsequent patch in the series
("ext4: xattr inode deduplication"), we start using the mbcache code
for sharing xattr inodes. With that patch, the old mb_cache_entry.e_block
field could hold either a block number or an inode number.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2: updated commit title and description

 fs/ext2/xattr.c         | 18 +++++++++---------
 fs/ext4/xattr.c         | 10 +++++-----
 fs/mbcache.c            | 43 +++++++++++++++++++++----------------------
 include/linux/mbcache.h | 11 +++++------
 4 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index fbdb8f171893..1e5f76070580 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -493,8 +493,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
 			 * This must happen under buffer lock for
 			 * ext2_xattr_set2() to reliably detect modified block
 			 */
-			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
-						    hash, bh->b_blocknr);
+			mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash,
+					      bh->b_blocknr);
 
 			/* keep the buffer locked while modifying it. */
 		} else {
@@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			 * This must happen under buffer lock for
 			 * ext2_xattr_set2() to reliably detect freed block
 			 */
-			mb_cache_entry_delete_block(ext2_mb_cache,
-						    hash, old_bh->b_blocknr);
+			mb_cache_entry_delete(ext2_mb_cache, hash,
+					      old_bh->b_blocknr);
 			/* Free the old block. */
 			ea_bdebug(old_bh, "freeing");
 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
@@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode)
 		 * This must happen under buffer lock for ext2_xattr_set2() to
 		 * reliably detect freed block
 		 */
-		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
-					    hash, bh->b_blocknr);
+		mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash,
+				      bh->b_blocknr);
 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
 		get_bh(bh);
 		bforget(bh);
@@ -907,11 +907,11 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 	while (ce) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, ce->e_block);
+		bh = sb_bread(inode->i_sb, ce->e_value);
 		if (!bh) {
 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
 				"inode %ld: block %ld read error",
-				inode->i_ino, (unsigned long) ce->e_block);
+				inode->i_ino, (unsigned long) ce->e_value);
 		} else {
 			lock_buffer(bh);
 			/*
@@ -931,7 +931,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
 				   EXT2_XATTR_REFCOUNT_MAX) {
 				ea_idebug(inode, "block %ld refcount %d>%d",
-					  (unsigned long) ce->e_block,
+					  (unsigned long) ce->e_value,
 					  le32_to_cpu(HDR(bh)->h_refcount),
 					  EXT2_XATTR_REFCOUNT_MAX);
 			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c09fcffb0878..0b43e0e52e26 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		 * This must happen under buffer lock for
 		 * ext4_xattr_block_set() to reliably detect freed block
 		 */
-		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
+		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
@@ -1115,8 +1115,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			 * ext4_xattr_block_set() to reliably detect modified
 			 * block
 			 */
-			mb_cache_entry_delete_block(ext4_mb_cache, hash,
-						    bs->bh->b_blocknr);
+			mb_cache_entry_delete(ext4_mb_cache, hash,
+					      bs->bh->b_blocknr);
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s, handle, inode);
 			if (!error) {
@@ -2238,10 +2238,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 	while (ce) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, ce->e_block);
+		bh = sb_bread(inode->i_sb, ce->e_value);
 		if (!bh) {
 			EXT4_ERROR_INODE(inode, "block %lu read error",
-					 (unsigned long) ce->e_block);
+					 (unsigned long) ce->e_value);
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
 			*pce = ce;
 			return bh;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index b19be429d655..45a8d52dc991 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -10,7 +10,7 @@
 /*
  * Mbcache is a simple key-value store. Keys need not be unique, however
  * key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete_block()).
+ * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  * They use hash of a block contents as a key and block number as a value.
@@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
  * @cache - cache where the entry should be created
  * @mask - gfp mask with which the entry should be allocated
  * @key - key of the entry
- * @block - block that contains data
- * @reusable - is the block reusable by other inodes?
+ * @value - value of the entry
+ * @reusable - is the entry reusable by others?
  *
- * Creates entry in @cache with key @key and records that data is stored in
- * block @block. The function returns -EBUSY if entry with the same key
- * and for the same block already exists in cache. Otherwise 0 is returned.
+ * Creates entry in @cache with key @key and value @value. The function returns
+ * -EBUSY if entry with the same key and value already exists in cache.
+ * Otherwise 0 is returned.
  */
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-			  sector_t block, bool reusable)
+			  u64 value, bool reusable)
 {
 	struct mb_cache_entry *entry, *dup;
 	struct hlist_bl_node *dup_node;
@@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 	/* One ref for hash, one ref returned */
 	atomic_set(&entry->e_refcnt, 1);
 	entry->e_key = key;
-	entry->e_block = block;
+	entry->e_value = value;
 	entry->e_reusable = reusable;
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
-		if (dup->e_key == key && dup->e_block == block) {
+		if (dup->e_key == key && dup->e_value == value) {
 			hlist_bl_unlock(head);
 			kmem_cache_free(mb_entry_cache, entry);
 			return -EBUSY;
@@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 
 /*
- * mb_cache_entry_get - get a cache entry by block number (and key)
+ * mb_cache_entry_get - get a cache entry by value (and key)
  * @cache - cache we work with
- * @key - key of block number @block
- * @block - block number
+ * @key - key
+ * @value - value
  */
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
-					  sector_t block)
+					  u64 value)
 {
 	struct hlist_bl_node *node;
 	struct hlist_bl_head *head;
@@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_block == block) {
+		if (entry->e_key == key && entry->e_value == value) {
 			atomic_inc(&entry->e_refcnt);
 			goto out;
 		}
@@ -214,15 +214,14 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete_block - remove information about block from cache
+/* mb_cache_entry_delete - remove a cache entry
  * @cache - cache we work with
- * @key - key of block @block
- * @block - block number
+ * @key - key
+ * @value - value
  *
- * Remove entry from cache @cache with key @key with data stored in @block.
+ * Remove entry from cache @cache with key @key and value @value.
  */
-void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
-				 sector_t block)
+void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
 {
 	struct hlist_bl_node *node;
 	struct hlist_bl_head *head;
@@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_block == block) {
+		if (entry->e_key == key && entry->e_value == value) {
 			/* We keep hash list reference to keep entry alive */
 			hlist_bl_del_init(&entry->e_hash_list);
 			hlist_bl_unlock(head);
@@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
 	}
 	hlist_bl_unlock(head);
 }
-EXPORT_SYMBOL(mb_cache_entry_delete_block);
+EXPORT_SYMBOL(mb_cache_entry_delete);
 
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 86c9a8b480c5..e1bc73414983 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -19,15 +19,15 @@ struct mb_cache_entry {
 	u32			e_key;
 	u32			e_referenced:1;
 	u32			e_reusable:1;
-	/* Block number of hashed block - stable during lifetime of the entry */
-	sector_t		e_block;
+	/* User provided value - stable during lifetime of the entry */
+	u64			e_value;
 };
 
 struct mb_cache *mb_cache_create(int bucket_bits);
 void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-			  sector_t block, bool reusable);
+			  u64 value, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
@@ -38,10 +38,9 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
 	return 1;
 }
 
-void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
-				  sector_t block);
+void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
-					  sector_t block);
+					  u64 value);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 						 u32 key);
 struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v3 26/28] ext4: cleanup transaction restarts during inode deletion
  2017-06-15  0:11       ` [Ocfs2-devel] " Andreas Dilger
  (?)
@ 2017-06-20  9:04       ` Tahsin Erdogan
  2017-06-20  9:29         ` Tahsin Erdogan
  -1 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:04 UTC (permalink / raw)
  To: Andreas Dilger, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

During inode deletion, the journal credits that will be needed are hard
to determine, which is why we have journal extend/restart calls in
several places. Whenever a transaction is restarted, the filesystem must
be in a consistent state because there is no atomicity guarantee beyond
a restart call.

Add ext4_xattr_ensure_credits() helper function which takes care of
journal extend/restart logic. It also handles getting jbd2 write access
and dirty metadata calls. This function is called at every iteration of
handling an ea_inode reference.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v3: fixed checkpatch.pl warnings about long lines and indented label 

v2: made ext4_xattr_ensure_credits() static

 fs/ext4/inode.c |  66 ++++-----------
 fs/ext4/xattr.c | 258 ++++++++++++++++++++++++++++++++++++--------------------
 fs/ext4/xattr.h |   3 +-
 3 files changed, 184 insertions(+), 143 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cf91532765a4..cd007f9757d1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode)
 	 */
 	sb_start_intwrite(inode->i_sb);
 
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
+	if (!IS_NOQUOTA(inode))
+		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				 ext4_blocks_for_truncate(inode)+extra_credits);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode)
 		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
+
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
-
-	/*
-	 * Delete xattr inode before deleting the main inode.
-	 */
-	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
-	if (err) {
-		ext4_warning(inode->i_sb,
-			     "couldn't delete inode's xattr (err %d)", err);
-		goto stop_handle;
-	}
-
-	if (!IS_NOQUOTA(inode))
-		extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
-
-	if (!ext4_handle_has_enough_credits(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits)) {
-		err = ext4_journal_extend(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits);
-		if (err > 0)
-			err = ext4_journal_restart(handle,
-			ext4_blocks_for_truncate(inode) + extra_credits);
-		if (err != 0) {
-			ext4_warning(inode->i_sb,
-				     "couldn't extend journal (err %d)", err);
-			goto stop_handle;
-		}
-	}
-
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode)
 		}
 	}
 
-	/*
-	 * ext4_ext_truncate() doesn't reserve any slop when it
-	 * restarts journal transactions; therefore there may not be
-	 * enough credits left in the handle to remove the inode from
-	 * the orphan list and set the dtime field.
-	 */
-	if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
-		err = ext4_journal_extend(handle, extra_credits);
-		if (err > 0)
-			err = ext4_journal_restart(handle, extra_credits);
-		if (err != 0) {
-			ext4_warning(inode->i_sb,
-				     "couldn't extend journal (err %d)", err);
-		stop_handle:
-			ext4_journal_stop(handle);
-			ext4_orphan_del(NULL, inode);
-			sb_end_intwrite(inode->i_sb);
-			goto no_delete;
-		}
+	/* Remove xattr references. */
+	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
+				      extra_credits);
+	if (err) {
+		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
+stop_handle:
+		ext4_journal_stop(handle);
+		ext4_orphan_del(NULL, inode);
+		sb_end_intwrite(inode->i_sb);
+		ext4_xattr_inode_array_free(ea_inode_array);
+		goto no_delete;
 	}
 
 	/*
@@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
-
 	ext4_journal_stop(handle);
 	sb_end_intwrite(inode->i_sb);
 	ext4_xattr_inode_array_free(ea_inode_array);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e69550e23d64..0484df8dadd1 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+static int
+ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
+			struct inode *inode);
+
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -653,6 +657,128 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
+				     int credits, struct buffer_head *bh,
+				     bool dirty, bool block_csum)
+{
+	int error;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	if (handle->h_buffer_credits >= credits)
+		return 0;
+
+	error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
+	if (!error)
+		return 0;
+	if (error < 0) {
+		ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
+		return error;
+	}
+
+	if (bh && dirty) {
+		if (block_csum)
+			ext4_xattr_block_csum_set(inode, bh);
+		error = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (error) {
+			ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+				     error);
+			return error;
+		}
+	}
+
+	error = ext4_journal_restart(handle, credits);
+	if (error) {
+		ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
+		return error;
+	}
+
+	if (bh) {
+		error = ext4_journal_get_write_access(handle, bh);
+		if (error) {
+			ext4_warning(inode->i_sb,
+				     "Get write access failed (error %d)",
+				     error);
+			return error;
+		}
+	}
+	return 0;
+}
+
+static void
+ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
+			    struct buffer_head *bh,
+			    struct ext4_xattr_entry *first, bool block_csum,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	bool dirty = false;
+	unsigned int ea_ino;
+	int err;
+	int credits;
+
+	/* One credit for dec ref on ea_inode, one for orphan list addition, */
+	credits = 2 + extra_credits;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			continue;
+
+		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode,
+					   "Expand inode array err=%d", err);
+			iput(ea_inode);
+			continue;
+		}
+
+		err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
+						dirty, block_csum);
+		if (err) {
+			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
+					   err);
+			continue;
+		}
+
+		inode_lock(ea_inode);
+		clear_nlink(ea_inode);
+		ext4_orphan_add(handle, ea_inode);
+		inode_unlock(ea_inode);
+
+		/*
+		 * Forget about ea_inode within the same transaction that
+		 * decrements the ref count. This avoids duplicate decrements in
+		 * case the rest of the work spills over to subsequent
+		 * transactions.
+		 */
+		entry->e_value_inum = 0;
+		entry->e_value_size = 0;
+
+		dirty = true;
+	}
+
+	if (dirty) {
+		/*
+		 * Note that we are deliberately skipping csum calculation for
+		 * the final update because we do not expect any journal
+		 * restarts until xattr block is freed.
+		 */
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			ext4_warning_inode(parent,
+					   "handle dirty metadata err=%d", err);
+	}
+}
+
 /*
  * Release the xattr block BH: If the reference count is > 1, decrement it;
  * otherwise free the block.
@@ -1985,42 +2111,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 	return 0;
 }
 
-/**
- * Add xattr inode to orphan list
- */
-static int
-ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
-			    struct ext4_xattr_inode_array *ea_inode_array)
-{
-	int idx = 0, error = 0;
-	struct inode *ea_inode;
-
-	if (ea_inode_array == NULL)
-		return 0;
-
-	for (; idx < ea_inode_array->count; ++idx) {
-		if (!ext4_handle_has_enough_credits(handle, credits)) {
-			error = ext4_journal_extend(handle, credits);
-			if (error > 0)
-				error = ext4_journal_restart(handle, credits);
-
-			if (error != 0) {
-				ext4_warning(inode->i_sb,
-					"couldn't extend journal "
-					"(err %d)", error);
-				return error;
-			}
-		}
-		ea_inode = ea_inode_array->inodes[idx];
-		inode_lock(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
-		/* the inode's i_count will be released by caller */
-	}
-
-	return 0;
-}
-
 /*
  * ext4_xattr_delete_inode()
  *
@@ -2033,16 +2123,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
  */
 int
 ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array)
+			struct ext4_xattr_inode_array **ea_inode_array,
+			int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
-	struct ext4_iloc iloc;
-	struct ext4_xattr_entry *entry;
-	struct inode *ea_inode;
-	unsigned int ea_ino;
-	int credits = 3, error = 0;
+	struct ext4_iloc iloc = { .bh = NULL };
+	int error;
+
+	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
+					  NULL /* bh */,
+					  false /* dirty */,
+					  false /* block_csum */);
+	if (error) {
+		EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
+		goto cleanup;
+	}
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		goto delete_external_ea;
@@ -2050,31 +2147,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
 		goto cleanup;
+
+	error = ext4_journal_get_write_access(handle, iloc.bh);
+	if (error)
+		goto cleanup;
+
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
-	for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
-	     entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_inum)
-			continue;
-		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-		if (error)
-			continue;
-		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (error) {
-			iput(ea_inode);
-			brelse(iloc.bh);
-			goto cleanup;
-		}
-		entry->e_value_inum = 0;
-	}
-	brelse(iloc.bh);
+	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
+				    false /* block_csum */, ea_inode_array,
+				    extra_credits);
 
 delete_external_ea:
 	if (!EXT4_I(inode)->i_file_acl) {
-		/* add xattr inode to orphan list */
-		error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-						    *ea_inode_array);
+		error = 0;
 		goto cleanup;
 	}
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
@@ -2092,46 +2178,32 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
-	     entry = EXT4_XATTR_NEXT(entry)) {
-		if (!entry->e_value_inum)
-			continue;
-		ea_ino = le32_to_cpu(entry->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-		if (error)
-			continue;
-		error = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (error)
-			goto cleanup;
-		entry->e_value_inum = 0;
-	}
-
-	/* add xattr inode to orphan list */
-	error = ext4_xattr_inode_orphan_add(handle, inode, credits,
-					*ea_inode_array);
-	if (error)
-		goto cleanup;
-
-	if (!IS_NOQUOTA(inode))
-		credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
-
-	if (!ext4_handle_has_enough_credits(handle, credits)) {
-		error = ext4_journal_extend(handle, credits);
-		if (error > 0)
-			error = ext4_journal_restart(handle, credits);
+	if (ext4_has_feature_ea_inode(inode->i_sb)) {
+		error = ext4_journal_get_write_access(handle, bh);
 		if (error) {
-			ext4_warning(inode->i_sb,
-				"couldn't extend journal (err %d)", error);
+			EXT4_ERROR_INODE(inode, "write access %llu",
+					 EXT4_I(inode)->i_file_acl);
 			goto cleanup;
 		}
+		ext4_xattr_inode_remove_all(handle, inode, bh,
+					    BFIRST(bh),
+					    true /* block_csum */,
+					    ea_inode_array,
+					    extra_credits);
 	}
 
 	ext4_xattr_release_block(handle, inode, bh);
+	/* Update i_file_acl within the same transaction that releases block. */
 	EXT4_I(inode)->i_file_acl = 0;
-
+	error = ext4_mark_inode_dirty(handle, inode);
+	if (error) {
+		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+				 error);
+		goto cleanup;
+	}
 cleanup:
+	brelse(iloc.bh);
 	brelse(bh);
-
 	return error;
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index adf761518a73..b2005a2716d9 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
 
 extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_inode_array **array);
+				   struct ext4_xattr_inode_array **array,
+				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-14 23:26                 ` Andreas Dilger
@ 2017-06-20  9:07                   ` Tahsin Erdogan
  2017-06-20  9:49                     ` Tahsin Erdogan
                                       ` (2 more replies)
  0 siblings, 3 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:07 UTC (permalink / raw)
  To: Andreas Dilger, Darrick J . Wong, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

Ext4 now supports xattr values that are up to 64k in size (vfs limit).
Large xattr values are stored in external inodes each one holding a
single value. Once written the data blocks of these inodes are immutable.

The real world use cases are expected to have a lot of value duplication
such as inherited acls etc. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a best
effort sharing scheme. When a xattr inode is read from disk (i.e.
getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal the ref count is decremented and if it reaches zero the inode is
deleted.

The quota charging for such inodes is manually managed. Every reference
holder is charged the full size as if there was no sharing happening.
This is consistent with how xattr blocks are also charged.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v5:
 - made ext4_meta_trans_blocks() static again since there are no
   remaining users outside of inode.c
 - initialize sbi->s_csum_seed when ea_inode feature is enabled
 - use l_i_version to hold lower 32 bits of the xattr ref count.
   This avoids clashes with old implementations which use i_mtime.
   Since l_i_version is not available in HURD_COMPAT mode, fail mount
   request when both ea_inode feature and HURD_COMPAT are set.
 - when hash validation fails, fall back to old implementation
   which has a backref to parent.
 - fixed checkpatch.pl warning about using unsigned alone

v4:
 - eliminated xattr entry in the xattr inode to avoid complexity and
   recursion in xattr update path. Now the ref count and hash are stored
   in i_[c/m/a]time.tv_sec fields.
 - some clean up in ext4_xattr_set_entry() to reduce code duplication and
   complexity

v3:
 - use s_csum_seed for hash calculations when available
 - return error on stored vs calculated hash mismatch
 
v2:
 - make dependency on crc32c dynamic
 - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
   they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver

 fs/ext4/acl.c   |    5 +-
 fs/ext4/ext4.h  |   23 +-
 fs/ext4/inode.c |   13 +-
 fs/ext4/super.c |   37 +-
 fs/ext4/xattr.c | 1038 +++++++++++++++++++++++++++++++++++++++++--------------
 fs/ext4/xattr.h |   17 +-
 fs/mbcache.c    |    9 +-
 7 files changed, 848 insertions(+), 294 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 74f7ac539e00..8db03e5c78bc 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (error)
 		return error;
 retry:
-	credits = ext4_xattr_set_credits(inode, acl_size);
+	error = ext4_xattr_set_credits(inode, acl_size, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d79d8d7bee88..59e9488c4876 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1517,6 +1517,7 @@ struct ext4_sb_info {
 	long s_es_nr_inode;
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
+	struct mb_cache *s_ea_inode_cache;
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
@@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
 	return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
 }
 
-#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+	return IS_NOQUOTA(inode) &&
+	       !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
 
 /*
  * This structure is stuffed into the struct file's private_data field
@@ -2482,7 +2487,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
@@ -2709,19 +2713,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
 extern int ext4_register_li_request(struct super_block *sb,
 				    ext4_group_t first_not_zeroed);
 
-static inline int ext4_has_group_desc_csum(struct super_block *sb)
-{
-	return ext4_has_feature_gdt_csum(sb) ||
-	       EXT4_SB(sb)->s_chksum_driver != NULL;
-}
-
 static inline int ext4_has_metadata_csum(struct super_block *sb)
 {
 	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
 		     !EXT4_SB(sb)->s_chksum_driver);
 
-	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+	return ext4_has_feature_metadata_csum(sb) &&
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+	return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+}
+
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cd007f9757d1..ea95bd9eab81 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
 				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-	if (ei->i_flags & EXT4_EA_INODE_FL)
+
+	if (ei->i_flags & EXT4_EA_INODE_FL) {
 		ext4_xattr_inode_set_class(inode);
+
+		inode_lock(inode);
+		inode->i_flags |= S_NOQUOTA;
+		inode_unlock(inode);
+	}
+
 	unlock_new_inode(inode);
 	return inode;
 
@@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
 				  int pextents)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b02a23ec92ca..2bfacd737bb6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
@@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (res)
 		return res;
 retry:
-	credits = ext4_xattr_set_credits(inode, len);
+	res = ext4_xattr_set_credits(inode, len, &credits);
+	if (res)
+		return res;
+
 	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Load the checksum driver */
-	if (ext4_has_feature_metadata_csum(sb)) {
+	if (ext4_has_feature_metadata_csum(sb) ||
+	    ext4_has_feature_ea_inode(sb)) {
 		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(sbi->s_chksum_driver)) {
 			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/* Precompute checksum seed for all metadata */
 	if (ext4_has_feature_csum_seed(sb))
 		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
-	else if (ext4_has_metadata_csum(sb))
+	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
 		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
 					       sizeof(es->s_uuid));
 
@@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 "The Hurd can't support 64-bit file systems");
 			goto failed_mount;
 		}
+
+		/*
+		 * ea_inode feature uses l_i_version field which is not
+		 * available in HURD_COMPAT mode.
+		 */
+		if (ext4_has_feature_ea_inode(sb)) {
+			ext4_msg(sb, KERN_ERR,
+				 "ea_inode feature is not supported for Hurd");
+			goto failed_mount;
+		}
 	}
 
 	if (IS_EXT2_SB(sb)) {
@@ -4067,6 +4085,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_ea_inode(sb)) {
+		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+		if (!sbi->s_ea_inode_cache) {
+			ext4_msg(sb, KERN_ERR,
+				 "Failed to create an s_ea_inode_cache");
+			goto failed_mount_wq;
+		}
+	}
+
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
@@ -4296,6 +4323,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	if (sbi->s_ea_inode_cache) {
+		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+		sbi->s_ea_inode_cache = NULL;
+	}
 	if (sbi->s_mb_cache) {
 		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 		sbi->s_mb_cache = NULL;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 0484df8dadd1..d7e60358ec91 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_mb_cache)
 
+#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_ea_inode_cache)
+
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
@@ -280,15 +283,44 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 	return cmp ? -ENODATA : 0;
 }
 
+static u32
+ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
+{
+	return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+}
+
+static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
+{
+	return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+	       ((u32)ea_inode->i_version);
+}
+
+static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
+{
+	ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+	ea_inode->i_version = (u32)ref_count;
+}
+
+static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
+{
+	return (u32)ea_inode->i_atime.tv_sec;
+}
+
+static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
+{
+	ea_inode->i_atime.tv_sec = hash;
+}
+
 /*
  * Read the EA value from an inode.
  */
 static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 {
 	unsigned long block = 0;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	int blocksize = ea_inode->i_sb->s_blocksize;
 	size_t csize, copied = 0;
+	void *copy_pos = buf;
 
 	while (copied < size) {
 		csize = (size - copied) > blocksize ? blocksize : size - copied;
@@ -298,10 +330,10 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 		if (!bh)
 			return -EFSCORRUPTED;
 
-		memcpy(buf, bh->b_data, csize);
+		memcpy(copy_pos, bh->b_data, csize);
 		brelse(bh);
 
-		buf += csize;
+		copy_pos += csize;
 		block += 1;
 		copied += csize;
 	}
@@ -317,29 +349,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 	inode = ext4_iget(parent->i_sb, ea_ino);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		ext4_error(parent->i_sb, "error while reading EA inode %lu "
-			   "err=%d", ea_ino, err);
+		ext4_error(parent->i_sb,
+			   "error while reading EA inode %lu err=%d", ea_ino,
+			   err);
 		return err;
 	}
 
 	if (is_bad_inode(inode)) {
-		ext4_error(parent->i_sb, "error while reading EA inode %lu "
-			   "is_bad_inode", ea_ino);
+		ext4_error(parent->i_sb,
+			   "error while reading EA inode %lu is_bad_inode",
+			   ea_ino);
 		err = -EIO;
 		goto error;
 	}
 
-	if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
-	    inode->i_generation != parent->i_generation) {
-		ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
-			   "to parent is invalid.", ea_ino);
-		err = -EINVAL;
-		goto error;
-	}
-
 	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
-		ext4_error(parent->i_sb, "EA inode %lu does not have "
-			   "EXT4_EA_INODE_FL flag set.\n", ea_ino);
+		ext4_error(parent->i_sb,
+			   "EA inode %lu does not have EXT4_EA_INODE_FL flag",
+			    ea_ino);
 		err = -EINVAL;
 		goto error;
 	}
@@ -351,6 +378,20 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 	return err;
 }
 
+static int
+ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size)
+{
+	u32 hash;
+
+	/* Verify stored hash matches calculated hash. */
+	hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
+	if (hash != ext4_xattr_inode_get_hash(ea_inode))
+		return -EFSCORRUPTED;
+	return 0;
+}
+
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+
 /*
  * Read the value from the EA inode.
  */
@@ -358,17 +399,53 @@ static int
 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
 		     size_t size)
 {
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
 	struct inode *ea_inode;
-	int ret;
+	int err;
 
-	ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (ret)
-		return ret;
+	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+	if (err) {
+		ea_inode = NULL;
+		goto out;
+	}
 
-	ret = ext4_xattr_inode_read(ea_inode, buffer, size);
-	iput(ea_inode);
+	if (i_size_read(ea_inode) != size) {
+		ext4_warning_inode(ea_inode,
+				   "ea_inode file size=%llu entry size=%zu",
+				   i_size_read(ea_inode), size);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
 
-	return ret;
+	err = ext4_xattr_inode_read(ea_inode, buffer, size);
+	if (err)
+		goto out;
+
+	err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size);
+	/*
+	 * Compatibility check for old Lustre ea_inode implementation. Old
+	 * version does not have hash validation, but it has a backpointer
+	 * from ea_inode to the parent inode.
+	 */
+	if (err == -EFSCORRUPTED) {
+		if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
+		    ea_inode->i_generation != inode->i_generation) {
+			ext4_warning_inode(ea_inode,
+					   "EA inode hash validation failed");
+			goto out;
+		}
+		/* Do not add ea_inode to the cache. */
+		ea_inode_cache = NULL;
+	} else if (err)
+		goto out;
+
+	if (ea_inode_cache)
+		mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
+				      ext4_xattr_inode_get_hash(ea_inode),
+				      ea_inode->i_ino, true /* reusable */);
+out:
+	iput(ea_inode);
+	return err;
 }
 
 static int
@@ -657,6 +734,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+static inline size_t round_up_cluster(struct inode *inode, size_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
+				    inode->i_blkbits);
+	size_t mask = ~(cluster_size - 1);
+
+	return (length + cluster_size - 1) & mask;
+}
+
+static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
+{
+	int err;
+
+	err = dquot_alloc_inode(inode);
+	if (err)
+		return err;
+	err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
+	if (err)
+		dquot_free_inode(inode);
+	return err;
+}
+
+static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+{
+	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
+	dquot_free_inode(inode);
+}
+
+static int __ext4_xattr_set_credits(struct super_block *sb,
+				    struct buffer_head *block_bh,
+				    size_t value_len)
+{
+	int credits;
+	int blocks;
+
+	/*
+	 * 1) Owner inode update
+	 * 2) Ref count update on old xattr block
+	 * 3) new xattr block
+	 * 4) block bitmap update for new xattr block
+	 * 5) group descriptor for new xattr block
+	 */
+	credits = 5;
+
+	/* We are done if ea_inode feature is not enabled. */
+	if (!ext4_has_feature_ea_inode(sb))
+		return credits;
+
+	/* New ea_inode, inode map, block bitmap, group descriptor. */
+	credits += 4;
+
+	/* Data blocks. */
+	blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	/* Indirection block or one level of extent tree. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Blocks themselves. */
+	credits += blocks;
+
+	/* Dereference ea_inode holding old xattr value.
+	 * Old ea_inode, inode map, block bitmap, group descriptor.
+	 */
+	credits += 4;
+
+	/* Data blocks for old ea_inode. */
+	blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
+
+	/* Indirection block or one level of extent tree for old ea_inode. */
+	blocks += 1;
+
+	/* Block bitmap and group descriptor updates for each block. */
+	credits += blocks * 2;
+
+	/* Quota updates. */
+	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
+
+	/* We may need to clone the existing xattr block in which case we need
+	 * to increment ref counts for existing ea_inodes referenced by it.
+	 */
+	if (block_bh) {
+		struct ext4_xattr_entry *entry = BFIRST(block_bh);
+
+		for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				/* Ref count update on ea_inode. */
+				credits += 1;
+	}
+	return credits;
+}
+
 static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 				     int credits, struct buffer_head *bh,
 				     bool dirty, bool block_csum)
@@ -706,12 +878,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+				       int ref_change)
+{
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+	struct ext4_iloc iloc;
+	s64 ref_count;
+	u32 hash;
+	int ret;
+
+	inode_lock(ea_inode);
+
+	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
+	if (ret) {
+		iloc.bh = NULL;
+		goto out;
+	}
+
+	ref_count = ext4_xattr_inode_get_ref(ea_inode);
+	ref_count += ref_change;
+	ext4_xattr_inode_set_ref(ea_inode, ref_count);
+
+	if (ref_change > 0) {
+		WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
+			  ea_inode->i_ino, ref_count);
+
+		if (ref_count == 1) {
+			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			set_nlink(ea_inode, 1);
+			ext4_orphan_del(handle, ea_inode);
+
+			hash = ext4_xattr_inode_get_hash(ea_inode);
+			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
+					      ea_inode->i_ino,
+					      true /* reusable */);
+		}
+	} else {
+		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
+			  ea_inode->i_ino, ref_count);
+
+		if (ref_count == 0) {
+			WARN_ONCE(ea_inode->i_nlink != 1,
+				  "EA inode %lu i_nlink=%u",
+				  ea_inode->i_ino, ea_inode->i_nlink);
+
+			clear_nlink(ea_inode);
+			ext4_orphan_add(handle, ea_inode);
+
+			hash = ext4_xattr_inode_get_hash(ea_inode);
+			mb_cache_entry_delete(ea_inode_cache, hash,
+					      ea_inode->i_ino);
+		}
+	}
+
+	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
+	iloc.bh = NULL;
+	if (ret)
+		ext4_warning_inode(ea_inode,
+				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
+out:
+	brelse(iloc.bh);
+	inode_unlock(ea_inode);
+	return ret;
+}
+
+static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
+}
+
+static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
+{
+	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
+}
+
+static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
+					struct ext4_xattr_entry *first)
+{
+	struct inode *ea_inode;
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_entry *failed_entry;
+	unsigned int ea_ino;
+	int err, saved_err;
+
+	for (entry = first; !IS_LAST_ENTRY(entry);
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err)
+			goto cleanup;
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "inc ref error %d", err);
+			iput(ea_inode);
+			goto cleanup;
+		}
+		iput(ea_inode);
+	}
+	return 0;
+
+cleanup:
+	saved_err = err;
+	failed_entry = entry;
+
+	for (entry = first; entry != failed_entry;
+	     entry = EXT4_XATTR_NEXT(entry)) {
+		if (!entry->e_value_inum)
+			continue;
+		ea_ino = le32_to_cpu(entry->e_value_inum);
+		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		if (err) {
+			ext4_warning(parent->i_sb,
+				     "cleanup ea_ino %u iget error %d", ea_ino,
+				     err);
+			continue;
+		}
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err)
+			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
+					   err);
+		iput(ea_inode);
+	}
+	return saved_err;
+}
+
 static void
-ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
-			    struct buffer_head *bh,
-			    struct ext4_xattr_entry *first, bool block_csum,
-			    struct ext4_xattr_inode_array **ea_inode_array,
-			    int extra_credits)
+ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
+			     struct buffer_head *bh,
+			     struct ext4_xattr_entry *first, bool block_csum,
+			     struct ext4_xattr_inode_array **ea_inode_array,
+			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
@@ -748,10 +1048,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
 			continue;
 		}
 
-		inode_lock(ea_inode);
-		clear_nlink(ea_inode);
-		ext4_orphan_add(handle, ea_inode);
-		inode_unlock(ea_inode);
+		err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (err) {
+			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
+					   err);
+			continue;
+		}
+
+		if (!skip_quota)
+			ext4_xattr_inode_free_quota(parent,
+					      le32_to_cpu(entry->e_value_size));
 
 		/*
 		 * Forget about ea_inode within the same transaction that
@@ -785,7 +1091,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
  */
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
-			 struct buffer_head *bh)
+			 struct buffer_head *bh,
+			 struct ext4_xattr_inode_array **ea_inode_array,
+			 int extra_credits)
 {
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 	u32 hash, ref;
@@ -808,6 +1116,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
+
+		if (ext4_has_feature_ea_inode(inode->i_sb))
+			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
+						     BFIRST(bh),
+						     true /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -879,8 +1195,8 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
 {
 	struct buffer_head *bh = NULL;
 	unsigned long block = 0;
-	unsigned blocksize = ea_inode->i_sb->s_blocksize;
-	unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
+	int blocksize = ea_inode->i_sb->s_blocksize;
+	int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
 	int csize, wsize = 0;
 	int ret = 0;
 	int retries = 0;
@@ -948,7 +1264,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
  * Create an inode to store the value of a large EA.
  */
 static struct inode *ext4_xattr_inode_create(handle_t *handle,
-					     struct inode *inode)
+					     struct inode *inode, u32 hash)
 {
 	struct inode *ea_inode = NULL;
 	uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
@@ -966,67 +1282,115 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 		ea_inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(ea_inode);
 		ext4_xattr_inode_set_class(ea_inode);
-		ea_inode->i_generation = inode->i_generation;
-		EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
-
-		/*
-		 * A back-pointer from EA inode to parent inode will be useful
-		 * for e2fsck.
-		 */
-		EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
 		unlock_new_inode(ea_inode);
-		err = ext4_inode_attach_jinode(ea_inode);
+		ext4_xattr_inode_set_ref(ea_inode, 1);
+		ext4_xattr_inode_set_hash(ea_inode, hash);
+		err = ext4_mark_inode_dirty(handle, ea_inode);
+		if (!err)
+			err = ext4_inode_attach_jinode(ea_inode);
 		if (err) {
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
+
+		/*
+		 * Xattr inodes are shared, so quota charging is performed
+		 * at a higher level.
+		 */
+		dquot_free_inode(ea_inode);
+		dquot_drop(ea_inode);
+		inode_lock(ea_inode);
+		ea_inode->i_flags |= S_NOQUOTA;
+		inode_unlock(ea_inode);
 	}
 
 	return ea_inode;
 }
 
-/*
- * Unlink the inode storing the value of the EA.
- */
-int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
+static struct inode *
+ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
+			    size_t value_len, u32 hash)
 {
-	struct inode *ea_inode = NULL;
-	int err;
+	struct inode *ea_inode;
+	struct mb_cache_entry *ce;
+	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+	void *ea_data;
 
-	err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
-	if (err)
-		return err;
+	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
+	if (!ce)
+		return NULL;
 
-	clear_nlink(ea_inode);
-	iput(ea_inode);
+	ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+	if (!ea_data) {
+		mb_cache_entry_put(ea_inode_cache, ce);
+		return NULL;
+	}
 
-	return 0;
+	while (ce) {
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		if (!IS_ERR(ea_inode) &&
+		    !is_bad_inode(ea_inode) &&
+		    (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
+		    i_size_read(ea_inode) == value_len &&
+		    !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
+		    !ext4_xattr_inode_verify_hash(ea_inode, ea_data,
+						  value_len) &&
+		    !memcmp(value, ea_data, value_len)) {
+			mb_cache_entry_touch(ea_inode_cache, ce);
+			mb_cache_entry_put(ea_inode_cache, ce);
+			kvfree(ea_data);
+			return ea_inode;
+		}
+
+		if (!IS_ERR(ea_inode))
+			iput(ea_inode);
+		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
+	}
+	kvfree(ea_data);
+	return NULL;
 }
 
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
-				unsigned long *ea_ino, const void *value,
-				size_t value_len)
+static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+					  const void *value, size_t value_len,
+					  struct inode **ret_inode)
 {
 	struct inode *ea_inode;
+	u32 hash;
 	int err;
 
+	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
+	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
+	if (ea_inode) {
+		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+		if (err) {
+			iput(ea_inode);
+			return err;
+		}
+
+		*ret_inode = ea_inode;
+		return 0;
+	}
+
 	/* Create an inode for the EA value */
-	ea_inode = ext4_xattr_inode_create(handle, inode);
+	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
 	if (IS_ERR(ea_inode))
 		return PTR_ERR(ea_inode);
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
-	if (err)
-		clear_nlink(ea_inode);
-	else
-		*ea_ino = ea_inode->i_ino;
+	if (err) {
+		ext4_xattr_inode_dec_ref(handle, ea_inode);
+		iput(ea_inode);
+		return err;
+	}
 
-	iput(ea_inode);
+	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+			      ea_inode->i_ino, true /* reusable */);
 
-	return err;
+	*ret_inode = ea_inode;
+	return 0;
 }
 
 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
@@ -1034,9 +1398,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				handle_t *handle, struct inode *inode)
 {
 	struct ext4_xattr_entry *last;
-	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+	struct ext4_xattr_entry *here = s->here;
+	size_t min_offs = s->end - s->base, name_len = strlen(i->name);
 	int in_inode = i->in_inode;
-	int rc;
+	struct inode *old_ea_inode = NULL;
+	struct inode *new_ea_inode = NULL;
+	size_t old_size, new_size;
+	int ret;
+
+	/* Space used by old and new values. */
+	old_size = (!s->not_found && !here->e_value_inum) ?
+			EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
+	new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
+
+	/*
+	 * Optimization for the simple case when old and new values have the
+	 * same padded sizes. Not applicable if external inodes are involved.
+	 */
+	if (new_size && new_size == old_size) {
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+
+		here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value == EXT4_ZERO_XATTR_VALUE) {
+			memset(val, 0, new_size);
+		} else {
+			memcpy(val, i->value, i->value_len);
+			/* Clear padding bytes. */
+			memset(val + i->value_len, 0, new_size - i->value_len);
+		}
+		return 0;
+	}
 
 	/* Compute min_offs and last. */
 	last = s->first;
@@ -1047,122 +1439,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				min_offs = offs;
 		}
 	}
-	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
-	if (!s->not_found) {
-		if (!in_inode &&
-		    !s->here->e_value_inum && s->here->e_value_size) {
-			size_t size = le32_to_cpu(s->here->e_value_size);
-			free += EXT4_XATTR_SIZE(size);
-		}
-		free += EXT4_XATTR_LEN(name_len);
-	}
+
+	/* Check whether we have enough space. */
 	if (i->value) {
-		size_t value_len = EXT4_XATTR_SIZE(i->value_len);
+		size_t free;
 
-		if (in_inode)
-			value_len = 0;
+		free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+		if (!s->not_found)
+			free += EXT4_XATTR_LEN(name_len) + old_size;
 
-		if (free < EXT4_XATTR_LEN(name_len) + value_len)
-			return -ENOSPC;
+		if (free < EXT4_XATTR_LEN(name_len) + new_size) {
+			ret = -ENOSPC;
+			goto out;
+		}
 	}
 
-	if (i->value && s->not_found) {
-		/* Insert the new name. */
-		size_t size = EXT4_XATTR_LEN(name_len);
-		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
-		memmove((void *)s->here + size, s->here, rest);
-		memset(s->here, 0, size);
-		s->here->e_name_index = i->name_index;
-		s->here->e_name_len = name_len;
-		memcpy(s->here->e_name, i->name, name_len);
-	} else {
-		if (!s->here->e_value_inum && s->here->e_value_size &&
-		    s->here->e_value_offs > 0) {
-			void *first_val = s->base + min_offs;
-			size_t offs = le16_to_cpu(s->here->e_value_offs);
-			void *val = s->base + offs;
-			size_t size = EXT4_XATTR_SIZE(
-				le32_to_cpu(s->here->e_value_size));
-
-			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
-				/* The old and the new value have the same
-				   size. Just replace. */
-				s->here->e_value_size =
-					cpu_to_le32(i->value_len);
-				if (i->value == EXT4_ZERO_XATTR_VALUE) {
-					memset(val, 0, size);
-				} else {
-					/* Clear pad bytes first. */
-					memset(val + size - EXT4_XATTR_PAD, 0,
-					       EXT4_XATTR_PAD);
-					memcpy(val, i->value, i->value_len);
-				}
-				return 0;
-			}
+	/*
+	 * Getting access to old and new ea inodes is subject to failures.
+	 * Finish that work before doing any modifications to the xattr data.
+	 */
+	if (!s->not_found && here->e_value_inum) {
+		ret = ext4_xattr_inode_iget(inode,
+					    le32_to_cpu(here->e_value_inum),
+					    &old_ea_inode);
+		if (ret) {
+			old_ea_inode = NULL;
+			goto out;
+		}
+	}
+	if (i->value && in_inode) {
+		WARN_ON_ONCE(!i->value_len);
 
-			/* Remove the old value. */
-			memmove(first_val + size, first_val, val - first_val);
-			memset(first_val, 0, size);
-			s->here->e_value_size = 0;
-			s->here->e_value_offs = 0;
-			min_offs += size;
-
-			/* Adjust all value offsets. */
-			last = s->first;
-			while (!IS_LAST_ENTRY(last)) {
-				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_inum &&
-				    last->e_value_size && o < offs)
-					last->e_value_offs =
-						cpu_to_le16(o + size);
-				last = EXT4_XATTR_NEXT(last);
-			}
+		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
+		if (ret)
+			goto out;
+
+		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
+						     i->value_len,
+						     &new_ea_inode);
+		if (ret) {
+			new_ea_inode = NULL;
+			ext4_xattr_inode_free_quota(inode, i->value_len);
+			goto out;
 		}
-		if (s->here->e_value_inum) {
-			ext4_xattr_inode_unlink(inode,
-					    le32_to_cpu(s->here->e_value_inum));
-			s->here->e_value_inum = 0;
+	}
+
+	if (old_ea_inode) {
+		/* We are ready to release ref count on the old_ea_inode. */
+		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
+		if (ret) {
+			/* Release newly acquired ref count on new_ea_inode. */
+			if (new_ea_inode) {
+				int err;
+
+				err = ext4_xattr_inode_dec_ref(handle,
+							       new_ea_inode);
+				if (err)
+					ext4_warning_inode(new_ea_inode,
+						  "dec ref new_ea_inode err=%d",
+						  err);
+				ext4_xattr_inode_free_quota(inode,
+							    i->value_len);
+			}
+			goto out;
 		}
-		if (!i->value) {
-			/* Remove the old name. */
-			size_t size = EXT4_XATTR_LEN(name_len);
-			last = ENTRY((void *)last - size);
-			memmove(s->here, (void *)s->here + size,
-				(void *)last - (void *)s->here + sizeof(__u32));
-			memset(last, 0, size);
+
+		ext4_xattr_inode_free_quota(inode,
+					    le32_to_cpu(here->e_value_size));
+	}
+
+	/* No failures allowed past this point. */
+
+	if (!s->not_found && here->e_value_offs) {
+		/* Remove the old value. */
+		void *first_val = s->base + min_offs;
+		size_t offs = le16_to_cpu(here->e_value_offs);
+		void *val = s->base + offs;
+
+		memmove(first_val + old_size, first_val, val - first_val);
+		memset(first_val, 0, old_size);
+		min_offs += old_size;
+
+		/* Adjust all value offsets. */
+		last = s->first;
+		while (!IS_LAST_ENTRY(last)) {
+			size_t o = le16_to_cpu(last->e_value_offs);
+
+			if (!last->e_value_inum &&
+			    last->e_value_size && o < offs)
+				last->e_value_offs = cpu_to_le16(o + old_size);
+			last = EXT4_XATTR_NEXT(last);
 		}
 	}
 
+	if (!i->value) {
+		/* Remove old name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+
+		last = ENTRY((void *)last - size);
+		memmove(here, (void *)here + size,
+			(void *)last - (void *)here + sizeof(__u32));
+		memset(last, 0, size);
+	} else if (s->not_found) {
+		/* Insert new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)here + sizeof(__u32);
+
+		memmove((void *)here + size, here, rest);
+		memset(here, 0, size);
+		here->e_name_index = i->name_index;
+		here->e_name_len = name_len;
+		memcpy(here->e_name, i->name, name_len);
+	} else {
+		/* This is an update, reset value info. */
+		here->e_value_inum = 0;
+		here->e_value_offs = 0;
+		here->e_value_size = 0;
+	}
+
 	if (i->value) {
-		/* Insert the new value. */
+		/* Insert new value. */
 		if (in_inode) {
-			unsigned long ea_ino =
-				le32_to_cpu(s->here->e_value_inum);
-			rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
-						  i->value, i->value_len);
-			if (rc)
-				goto out;
-			s->here->e_value_inum = cpu_to_le32(ea_ino);
-			s->here->e_value_offs = 0;
+			here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
 		} else if (i->value_len) {
-			size_t size = EXT4_XATTR_SIZE(i->value_len);
-			void *val = s->base + min_offs - size;
-			s->here->e_value_offs = cpu_to_le16(min_offs - size);
-			s->here->e_value_inum = 0;
+			void *val = s->base + min_offs - new_size;
+
+			here->e_value_offs = cpu_to_le16(min_offs - new_size);
 			if (i->value == EXT4_ZERO_XATTR_VALUE) {
-				memset(val, 0, size);
+				memset(val, 0, new_size);
 			} else {
-				/* Clear the pad bytes first. */
-				memset(val + size - EXT4_XATTR_PAD, 0,
-				       EXT4_XATTR_PAD);
 				memcpy(val, i->value, i->value_len);
+				/* Clear padding bytes. */
+				memset(val + i->value_len, 0,
+				       new_size - i->value_len);
 			}
 		}
-		s->here->e_value_size = cpu_to_le32(i->value_len);
+		here->e_value_size = cpu_to_le32(i->value_len);
 	}
-
+	ret = 0;
 out:
-	return rc;
+	iput(old_ea_inode);
+	iput(new_ea_inode);
+	return ret;
 }
 
 struct ext4_xattr_block_find {
@@ -1224,6 +1642,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+	struct inode *ea_inode = NULL;
+	size_t old_ea_inode_size = 0;
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
@@ -1278,6 +1698,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			header(s->base)->h_refcount = cpu_to_le32(1);
 			s->here = ENTRY(s->base + offset);
 			s->end = s->base + bs->bh->b_size;
+
+			/*
+			 * If existing entry points to an xattr inode, we need
+			 * to prevent ext4_xattr_set_entry() from decrementing
+			 * ref count on it because the reference belongs to the
+			 * original block. In this case, make the entry look
+			 * like it has an empty value.
+			 */
+			if (!s->not_found && s->here->e_value_inum) {
+				/*
+				 * Defer quota free call for previous inode
+				 * until success is guaranteed.
+				 */
+				old_ea_inode_size = le32_to_cpu(
+							s->here->e_value_size);
+				s->here->e_value_inum = 0;
+				s->here->e_value_size = 0;
+			}
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
@@ -1299,6 +1737,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		goto bad_block;
 	if (error)
 		goto cleanup;
+
+	if (i->value && s->here->e_value_inum) {
+		unsigned int ea_ino;
+
+		/*
+		 * A ref count on ea_inode has been taken as part of the call to
+		 * ext4_xattr_set_entry() above. We would like to drop this
+		 * extra ref but we have to wait until the xattr block is
+		 * initialized and has its own ref count on the ea_inode.
+		 */
+		ea_ino = le32_to_cpu(s->here->e_value_inum);
+		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		if (error) {
+			ea_inode = NULL;
+			goto cleanup;
+		}
+	}
+
 	if (!IS_LAST_ENTRY(s->first))
 		ext4_xattr_rehash(header(s->base), s->here);
 
@@ -1409,6 +1865,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						 EXT4_FREE_BLOCKS_METADATA);
 				goto cleanup;
 			}
+			error = ext4_xattr_inode_inc_ref_all(handle, inode,
+						      ENTRY(header(s->base)+1));
+			if (error)
+				goto getblk_failed;
+			if (ea_inode) {
+				/* Drop the extra ref on ea_inode. */
+				error = ext4_xattr_inode_dec_ref(handle,
+								 ea_inode);
+				if (error)
+					ext4_warning_inode(ea_inode,
+							   "dec ref error=%d",
+							   error);
+				iput(ea_inode);
+				ea_inode = NULL;
+			}
+
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, new_bh);
 			if (error) {
@@ -1428,15 +1900,38 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if (old_ea_inode_size)
+		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+
 	/* Update the inode. */
 	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 
 	/* Drop the previous xattr block. */
-	if (bs->bh && bs->bh != new_bh)
-		ext4_xattr_release_block(handle, inode, bs->bh);
+	if (bs->bh && bs->bh != new_bh) {
+		struct ext4_xattr_inode_array *ea_inode_array = NULL;
+
+		ext4_xattr_release_block(handle, inode, bs->bh,
+					 &ea_inode_array,
+					 0 /* extra_credits */);
+		ext4_xattr_inode_array_free(ea_inode_array);
+	}
 	error = 0;
 
 cleanup:
+	if (ea_inode) {
+		int error2;
+
+		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+		if (error2)
+			ext4_warning_inode(ea_inode, "dec ref error=%d",
+					   error2);
+
+		/* If there was an error, revert the quota charge. */
+		if (error)
+			ext4_xattr_inode_free_quota(inode,
+						    i_size_read(ea_inode));
+		iput(ea_inode);
+	}
 	if (ce)
 		mb_cache_entry_put(ext4_mb_cache, ce);
 	brelse(new_bh);
@@ -1561,6 +2056,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
 	return !memcmp(value, i->value, i->value_len);
 }
 
+static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
+{
+	struct buffer_head *bh;
+	int error;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return NULL;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		return ERR_PTR(-EIO);
+	error = ext4_xattr_check_block(inode, bh);
+	if (error)
+		return ERR_PTR(error);
+	return bh;
+}
+
 /*
  * ext4_xattr_set_handle()
  *
@@ -1603,9 +2114,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
 	/* Check journal credits under write lock. */
 	if (ext4_handle_valid(handle)) {
+		struct buffer_head *bh;
 		int credits;
 
-		credits = ext4_xattr_set_credits(inode, value_len);
+		bh = ext4_xattr_get_block(inode);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			goto cleanup;
+		}
+
+		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
 			error = -ENOSPC;
 			goto cleanup;
@@ -1641,6 +2161,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
+
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1709,34 +2230,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	return error;
 }
 
-int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
 {
-	struct super_block *sb = inode->i_sb;
-	int credits;
-
-	if (!EXT4_SB(sb)->s_journal)
-		return 0;
+	struct buffer_head *bh;
+	int err;
 
-	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+	*credits = 0;
 
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * so we need to reserve credits for this eventuality
-	 */
-	if (ext4_has_inline_data(inode))
-		credits += ext4_writepage_trans_blocks(inode) + 1;
-
-	if (ext4_has_feature_ea_inode(sb)) {
-		int nrblocks = (value_len + sb->s_blocksize - 1) >>
-					sb->s_blocksize_bits;
+	if (!EXT4_SB(inode->i_sb)->s_journal)
+		return 0;
 
-		/* For new inode */
-		credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
+	down_read(&EXT4_I(inode)->xattr_sem);
 
-		/* For data blocks of EA inode */
-		credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
+	bh = ext4_xattr_get_block(inode);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+	} else {
+		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		brelse(bh);
+		err = 0;
 	}
-	return credits;
+
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return err;
 }
 
 /*
@@ -1761,7 +2277,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 		return error;
 
 retry:
-	credits = ext4_xattr_set_credits(inode, value_len);
+	error = ext4_xattr_set_credits(inode, value_len, &credits);
+	if (error)
+		return error;
+
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -2067,10 +2586,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-
 #define EIA_INCR 16 /* must be 2^n */
 #define EIA_MASK (EIA_INCR - 1)
-/* Add the large xattr @inode into @ea_inode_array for later deletion.
+
+/* Add the large xattr @inode into @ea_inode_array for deferred iput().
  * If @ea_inode_array is new or full it will be grown and the old
  * contents copied over.
  */
@@ -2115,21 +2634,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * ext4_xattr_delete_inode()
  *
  * Free extended attribute resources associated with this inode. Traverse
- * all entries and unlink any xattr inodes associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode. If an orphan inode is deleted it will also delete any
- * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
- * to ensure they belong to the parent inode and were not deleted already.
+ * all entries and decrement reference on any xattr inodes associated with this
+ * inode. This is called immediately before an inode is freed. We have exclusive
+ * access to the inode. If an orphan inode is deleted it will also release its
+ * references on xattr block and xattr inodes.
  */
-int
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			struct ext4_xattr_inode_array **ea_inode_array,
-			int extra_credits)
+int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+			    struct ext4_xattr_inode_array **ea_inode_array,
+			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc = { .bh = NULL };
+	struct ext4_xattr_entry *entry;
 	int error;
 
 	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2141,66 +2658,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 
-	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
-		goto delete_external_ea;
+	if (ext4_has_feature_ea_inode(inode->i_sb) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 
-	error = ext4_get_inode_loc(inode, &iloc);
-	if (error)
-		goto cleanup;
-
-	error = ext4_journal_get_write_access(handle, iloc.bh);
-	if (error)
-		goto cleanup;
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
+			goto cleanup;
+		}
 
-	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
-	ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
-				    false /* block_csum */, ea_inode_array,
-				    extra_credits);
+		error = ext4_journal_get_write_access(handle, iloc.bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "write access (error %d)",
+					 error);
+			goto cleanup;
+		}
 
-delete_external_ea:
-	if (!EXT4_I(inode)->i_file_acl) {
-		error = 0;
-		goto cleanup;
-	}
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh) {
-		EXT4_ERROR_INODE(inode, "block %llu read error",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		goto cleanup;
-	}
-	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
-	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		EXT4_ERROR_INODE(inode, "bad block %llu",
-				 EXT4_I(inode)->i_file_acl);
-		error = -EFSCORRUPTED;
-		goto cleanup;
+		header = IHDR(inode, ext4_raw_inode(&iloc));
+		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
+						     IFIRST(header),
+						     false /* block_csum */,
+						     ea_inode_array,
+						     extra_credits,
+						     false /* skip_quota */);
 	}
 
-	if (ext4_has_feature_ea_inode(inode->i_sb)) {
-		error = ext4_journal_get_write_access(handle, bh);
-		if (error) {
-			EXT4_ERROR_INODE(inode, "write access %llu",
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %llu read error",
 					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		error = ext4_xattr_check_block(inode, bh);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
+					 EXT4_I(inode)->i_file_acl, error);
 			goto cleanup;
 		}
-		ext4_xattr_inode_remove_all(handle, inode, bh,
-					    BFIRST(bh),
-					    true /* block_csum */,
-					    ea_inode_array,
-					    extra_credits);
-	}
 
-	ext4_xattr_release_block(handle, inode, bh);
-	/* Update i_file_acl within the same transaction that releases block. */
-	EXT4_I(inode)->i_file_acl = 0;
-	error = ext4_mark_inode_dirty(handle, inode);
-	if (error) {
-		EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
-				 error);
-		goto cleanup;
+		if (ext4_has_feature_ea_inode(inode->i_sb)) {
+			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+			     entry = EXT4_XATTR_NEXT(entry))
+				if (entry->e_value_inum)
+					ext4_xattr_inode_free_quota(inode,
+					      le32_to_cpu(entry->e_value_size));
+
+		}
+
+		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+					 extra_credits);
+		/*
+		 * Update i_file_acl value in the same transaction that releases
+		 * block.
+		 */
+		EXT4_I(inode)->i_file_acl = 0;
+		error = ext4_mark_inode_dirty(handle, inode);
+		if (error) {
+			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+					 error);
+			goto cleanup;
+		}
 	}
+	error = 0;
 cleanup:
 	brelse(iloc.bh);
 	brelse(bh);
@@ -2209,17 +2731,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 {
-	struct inode	*ea_inode;
-	int		idx = 0;
+	int idx;
 
 	if (ea_inode_array == NULL)
 		return;
 
-	for (; idx < ea_inode_array->count; ++idx) {
-		ea_inode = ea_inode_array->inodes[idx];
-		clear_nlink(ea_inode);
-		iput(ea_inode);
-	}
+	for (idx = 0; idx < ea_inode_array->count; ++idx)
+		iput(ea_inode_array->inodes[idx]);
 	kfree(ea_inode_array);
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b2005a2716d9..67616cb9a059 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -69,19 +69,6 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
-/*
- * Link EA inode back to parent one using i_mtime field.
- * Extra integer type conversion added to ignore higher
- * bits in i_mtime.tv_sec which might be set by ext4_get()
- */
-#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
-do {                                                  \
-      (inode)->i_mtime.tv_sec = inum;                 \
-} while(0)
-
-#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
-((__u32)(inode)->i_mtime.tv_sec)
-
 /*
  * The minimum size of EA value when you start storing it in an external inode
  * size of block - size of header - size of 1 entry - 4 null bytes
@@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+				  int *credits);
 
-extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 45a8d52dc991..d818fd236787 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -13,10 +13,11 @@
  * mb_cache_entry_delete()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
- * They use hash of a block contents as a key and block number as a value.
- * That's why keys need not be unique (different xattr blocks may end up having
- * the same hash). However block number always uniquely identifies a cache
- * entry.
+ * Ext4 also uses it for deduplication of xattr values stored in inodes.
+ * They use hash of data as a key and provide a value that may represent a
+ * block or inode number. That's why keys need not be unique (hash of different
+ * data may be the same). However user provided value always uniquely
+ * identifies a cache entry.
  *
  * We provide functions for creation and removal of entries, search by key,
  * and a special "delete entry with given key-value pair" operation. Fixed
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH v2 28/31] quota: add get_inode_usage callback to transfer multi-inode charges
  2017-06-19 12:36             ` [Ocfs2-devel] " Jan Kara
  (?)
@ 2017-06-20  9:12             ` Tahsin Erdogan
  2017-06-20 12:01               ` Tahsin Erdogan
  2017-06-20 15:28               ` Jan Kara
  -1 siblings, 2 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:12 UTC (permalink / raw)
  To: Jan Kara, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

The ext4 ea_inode feature allows storing xattr values in external inodes
so that values bigger than a block in size can be stored. Ext4 also
has deduplication support for these types of inodes. With deduplication,
the actual storage waste is eliminated, but the users of such inodes are
still charged full quota for the inodes as if there were no sharing
happening in the background.

This design requires ext4 to manually charge the users because the
inodes are shared.

An implication of this is that, if someone calls chown on a file that
has such references, we need to transfer the quota for the file and its
xattr inodes. The current dquot_transfer() function implicitly transfers
one inode charge. With the ea_inode feature, we would like to transfer
multiple inode charges.

Add a get_inode_usage callback which can query the total number of
inodes that were charged for a given inode.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2:
  - added get_inode_usage() callback to query the total inode charge to
    be transferred

 fs/ext4/inode.c       |  7 +++++++
 fs/ext4/ioctl.c       |  6 ++++++
 fs/ext4/super.c       | 21 ++++++++++----------
 fs/ext4/xattr.c       | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/xattr.h       |  2 ++
 fs/quota/dquot.c      | 16 +++++++++++----
 include/linux/quota.h |  2 ++
 7 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ea95bd9eab81..cd22de0b5d2c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5295,7 +5295,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
+
+		/* dquot_transfer() calls back ext4_get_inode_usage() which
+		 * counts xattr inode references.
+		 */
+		down_read(&EXT4_I(inode)->xattr_sem);
 		error = dquot_transfer(inode, attr);
+		up_read(&EXT4_I(inode)->xattr_sem);
+
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dde8deb11e59..42b3a73143cf 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 
 	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
 	if (!IS_ERR(transfer_to[PRJQUOTA])) {
+
+		/* __dquot_transfer() calls back ext4_get_inode_usage() which
+		 * counts xattr inode references.
+		 */
+		down_read(&EXT4_I(inode)->xattr_sem);
 		err = __dquot_transfer(inode, transfer_to);
+		up_read(&EXT4_I(inode)->xattr_sem);
 		dqput(transfer_to[PRJQUOTA]);
 		if (err)
 			goto out_dirty;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2bfacd737bb6..4b15bf674d45 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1263,16 +1263,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode)
 }
 
 static const struct dquot_operations ext4_quota_operations = {
-	.get_reserved_space = ext4_get_reserved_space,
-	.write_dquot	= ext4_write_dquot,
-	.acquire_dquot	= ext4_acquire_dquot,
-	.release_dquot	= ext4_release_dquot,
-	.mark_dirty	= ext4_mark_dquot_dirty,
-	.write_info	= ext4_write_info,
-	.alloc_dquot	= dquot_alloc,
-	.destroy_dquot	= dquot_destroy,
-	.get_projid	= ext4_get_projid,
-	.get_next_id	= ext4_get_next_id,
+	.get_reserved_space	= ext4_get_reserved_space,
+	.write_dquot		= ext4_write_dquot,
+	.acquire_dquot		= ext4_acquire_dquot,
+	.release_dquot		= ext4_release_dquot,
+	.mark_dirty		= ext4_mark_dquot_dirty,
+	.write_info		= ext4_write_info,
+	.alloc_dquot		= dquot_alloc,
+	.destroy_dquot		= dquot_destroy,
+	.get_projid		= ext4_get_projid,
+	.get_inode_usage	= ext4_get_inode_usage,
+	.get_next_id		= ext4_get_next_id,
 };
 
 static const struct quotactl_ops ext4_qctl_operations = {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d7e60358ec91..5e20f29afe9e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -734,6 +734,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
+{
+	struct ext4_iloc iloc = { .bh = NULL };
+	struct buffer_head *bh = NULL;
+	struct ext4_inode *raw_inode;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	qsize_t ea_inode_refs = 0;
+	void *end;
+	int ret;
+
+	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+		ret = ext4_get_inode_loc(inode, &iloc);
+		if (ret)
+			goto out;
+		raw_inode = ext4_raw_inode(&iloc);
+		header = IHDR(inode, raw_inode);
+		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+		ret = xattr_check_inode(inode, header, end);
+		if (ret)
+			goto out;
+
+		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
+		     entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				ea_inode_refs++;
+	}
+
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			ret = -EIO;
+			goto out;
+		}
+
+		if (ext4_xattr_check_block(inode, bh)) {
+			ret = -EFSCORRUPTED;
+			goto out;
+		}
+
+		for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+		     entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				ea_inode_refs++;
+	}
+	*usage = ea_inode_refs + 1;
+out:
+	brelse(iloc.bh);
+	brelse(bh);
+	return ret;
+}
+
 static inline size_t round_up_cluster(struct inode *inode, size_t length)
 {
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 67616cb9a059..26119a67c8c3 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
 #else
 static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
 #endif
+
+extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 48813aeaab80..53a17496c5c5 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1910,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 {
 	qsize_t space, cur_space;
 	qsize_t rsv_space = 0;
+	qsize_t inode_usage = 1;
 	struct dquot *transfer_from[MAXQUOTAS] = {};
 	int cnt, ret = 0;
 	char is_valid[MAXQUOTAS] = {};
@@ -1919,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 
 	if (IS_NOQUOTA(inode))
 		return 0;
+
+	if (inode->i_sb->dq_op->get_inode_usage) {
+		ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
+		if (ret)
+			return ret;
+	}
+
 	/* Initialize the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		warn_to[cnt].w_type = QUOTA_NL_NOWARN;
@@ -1946,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			continue;
 		is_valid[cnt] = 1;
 		transfer_from[cnt] = i_dquot(inode)[cnt];
-		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
+		ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
 		if (ret)
 			goto over_quota;
 		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
@@ -1963,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
 			int wtype;
-			wtype = info_idq_free(transfer_from[cnt], 1);
+			wtype = info_idq_free(transfer_from[cnt], inode_usage);
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_inodes[cnt],
 						transfer_from[cnt], wtype);
@@ -1971,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_space[cnt],
 						transfer_from[cnt], wtype);
-			dquot_decr_inodes(transfer_from[cnt], 1);
+			dquot_decr_inodes(transfer_from[cnt], inode_usage);
 			dquot_decr_space(transfer_from[cnt], cur_space);
 			dquot_free_reserved_space(transfer_from[cnt],
 						  rsv_space);
 		}
 
-		dquot_incr_inodes(transfer_to[cnt], 1);
+		dquot_incr_inodes(transfer_to[cnt], inode_usage);
 		dquot_incr_space(transfer_to[cnt], cur_space);
 		dquot_resv_space(transfer_to[cnt], rsv_space);
 
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 3434eef2a5aa..bfd077ca6ac3 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -332,6 +332,8 @@ struct dquot_operations {
 	 * quota code only */
 	qsize_t *(*get_reserved_space) (struct inode *);
 	int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */
+	/* Get number of inodes that were charged for a given inode */
+	int (*get_inode_usage) (struct inode *, qsize_t *);
 	/* Get next ID with active quota structure */
 	int (*get_next_id) (struct super_block *sb, struct kqid *qid);
 };
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH v3 26/28] ext4: cleanup transaction restarts during inode deletion
  2017-06-20  9:04       ` [PATCH v3 " Tahsin Erdogan
@ 2017-06-20  9:29         ` Tahsin Erdogan
  0 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:29 UTC (permalink / raw)
  To: Andreas Dilger, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

On Wed, Jun 14, 2017 at 5:11 PM, Andreas Dilger <adilger@dilger.ca> wrote:
> Another option that might be less complex is to just add the xattr inodes
> to the orphan list in the main transaction (which should be a fixed number
> of credits), and then truncate/unlink the xattr inodes after the main
> transaction has completed rather than making the transactions arbitrarily
> large.  At one point we even had a separate unlink thread to handle this
> in the background to reduce the unlink latency for very large files, which
> also avoids issues with nested transactions.

I think that is true for simple xattr value updates or removals, but
when we delete the parent inode we have to drop references to all
xattr inodes, which again becomes arbitrarily large. Also, with the
deduplication patch that follows, dropping an xattr inode reference does
not always cause deletion of the inode, so I am not sure whether that
version can be simplified.



On Tue, Jun 20, 2017 at 2:04 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> During inode deletion, journal credits that will be needed are hard to
> determine, that is why we have journal extend/restart calls in several
> places. Whenever a transaction is restarted, filesystem must be in a
> consistent state because there is no atomicity guarantee beyond a
> restart call.
>
> Add ext4_xattr_ensure_credits() helper function which takes care of
> journal extend/restart logic. It also handles getting jbd2 write access
> and dirty metadata calls. This function is called at every iteration of
> handling an ea_inode reference.
>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v3: fixed checkpatch.pl warnings about long lines and indented label
>
> v2: made ext4_xattr_ensure_credits() static
>
>  fs/ext4/inode.c |  66 ++++-----------
>  fs/ext4/xattr.c | 258 ++++++++++++++++++++++++++++++++++++--------------------
>  fs/ext4/xattr.h |   3 +-
>  3 files changed, 184 insertions(+), 143 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cf91532765a4..cd007f9757d1 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -239,7 +239,11 @@ void ext4_evict_inode(struct inode *inode)
>          */
>         sb_start_intwrite(inode->i_sb);
>
> -       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
> +       if (!IS_NOQUOTA(inode))
> +               extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
> +
> +       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
> +                                ext4_blocks_for_truncate(inode)+extra_credits);
>         if (IS_ERR(handle)) {
>                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
>                 /*
> @@ -251,36 +255,9 @@ void ext4_evict_inode(struct inode *inode)
>                 sb_end_intwrite(inode->i_sb);
>                 goto no_delete;
>         }
> +
>         if (IS_SYNC(inode))
>                 ext4_handle_sync(handle);
> -
> -       /*
> -        * Delete xattr inode before deleting the main inode.
> -        */
> -       err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array);
> -       if (err) {
> -               ext4_warning(inode->i_sb,
> -                            "couldn't delete inode's xattr (err %d)", err);
> -               goto stop_handle;
> -       }
> -
> -       if (!IS_NOQUOTA(inode))
> -               extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> -
> -       if (!ext4_handle_has_enough_credits(handle,
> -                       ext4_blocks_for_truncate(inode) + extra_credits)) {
> -               err = ext4_journal_extend(handle,
> -                       ext4_blocks_for_truncate(inode) + extra_credits);
> -               if (err > 0)
> -                       err = ext4_journal_restart(handle,
> -                       ext4_blocks_for_truncate(inode) + extra_credits);
> -               if (err != 0) {
> -                       ext4_warning(inode->i_sb,
> -                                    "couldn't extend journal (err %d)", err);
> -                       goto stop_handle;
> -               }
> -       }
> -
>         inode->i_size = 0;
>         err = ext4_mark_inode_dirty(handle, inode);
>         if (err) {
> @@ -298,25 +275,17 @@ void ext4_evict_inode(struct inode *inode)
>                 }
>         }
>
> -       /*
> -        * ext4_ext_truncate() doesn't reserve any slop when it
> -        * restarts journal transactions; therefore there may not be
> -        * enough credits left in the handle to remove the inode from
> -        * the orphan list and set the dtime field.
> -        */
> -       if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
> -               err = ext4_journal_extend(handle, extra_credits);
> -               if (err > 0)
> -                       err = ext4_journal_restart(handle, extra_credits);
> -               if (err != 0) {
> -                       ext4_warning(inode->i_sb,
> -                                    "couldn't extend journal (err %d)", err);
> -               stop_handle:
> -                       ext4_journal_stop(handle);
> -                       ext4_orphan_del(NULL, inode);
> -                       sb_end_intwrite(inode->i_sb);
> -                       goto no_delete;
> -               }
> +       /* Remove xattr references. */
> +       err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
> +                                     extra_credits);
> +       if (err) {
> +               ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
> +stop_handle:
> +               ext4_journal_stop(handle);
> +               ext4_orphan_del(NULL, inode);
> +               sb_end_intwrite(inode->i_sb);
> +               ext4_xattr_inode_array_free(ea_inode_array);
> +               goto no_delete;
>         }
>
>         /*
> @@ -342,7 +311,6 @@ void ext4_evict_inode(struct inode *inode)
>                 ext4_clear_inode(inode);
>         else
>                 ext4_free_inode(handle, inode);
> -
>         ext4_journal_stop(handle);
>         sb_end_intwrite(inode->i_sb);
>         ext4_xattr_inode_array_free(ea_inode_array);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index e69550e23d64..0484df8dadd1 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -108,6 +108,10 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  #define EXT4_GET_MB_CACHE(inode)       (((struct ext4_sb_info *) \
>                                 inode->i_sb->s_fs_info)->s_mb_cache)
>
> +static int
> +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
> +                       struct inode *inode);
> +
>  #ifdef CONFIG_LOCKDEP
>  void ext4_xattr_inode_set_class(struct inode *ea_inode)
>  {
> @@ -653,6 +657,128 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>         }
>  }
>
> +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
> +                                    int credits, struct buffer_head *bh,
> +                                    bool dirty, bool block_csum)
> +{
> +       int error;
> +
> +       if (!ext4_handle_valid(handle))
> +               return 0;
> +
> +       if (handle->h_buffer_credits >= credits)
> +               return 0;
> +
> +       error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
> +       if (!error)
> +               return 0;
> +       if (error < 0) {
> +               ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
> +               return error;
> +       }
> +
> +       if (bh && dirty) {
> +               if (block_csum)
> +                       ext4_xattr_block_csum_set(inode, bh);
> +               error = ext4_handle_dirty_metadata(handle, NULL, bh);
> +               if (error) {
> +                       ext4_warning(inode->i_sb, "Handle metadata (error %d)",
> +                                    error);
> +                       return error;
> +               }
> +       }
> +
> +       error = ext4_journal_restart(handle, credits);
> +       if (error) {
> +               ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
> +               return error;
> +       }
> +
> +       if (bh) {
> +               error = ext4_journal_get_write_access(handle, bh);
> +               if (error) {
> +                       ext4_warning(inode->i_sb,
> +                                    "Get write access failed (error %d)",
> +                                    error);
> +                       return error;
> +               }
> +       }
> +       return 0;
> +}
> +
> +static void
> +ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> +                           struct buffer_head *bh,
> +                           struct ext4_xattr_entry *first, bool block_csum,
> +                           struct ext4_xattr_inode_array **ea_inode_array,
> +                           int extra_credits)
> +{
> +       struct inode *ea_inode;
> +       struct ext4_xattr_entry *entry;
> +       bool dirty = false;
> +       unsigned int ea_ino;
> +       int err;
> +       int credits;
> +
> +       /* One credit for dec ref on ea_inode, one for orphan list addition, */
> +       credits = 2 + extra_credits;
> +
> +       for (entry = first; !IS_LAST_ENTRY(entry);
> +            entry = EXT4_XATTR_NEXT(entry)) {
> +               if (!entry->e_value_inum)
> +                       continue;
> +               ea_ino = le32_to_cpu(entry->e_value_inum);
> +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +               if (err)
> +                       continue;
> +
> +               err = ext4_expand_inode_array(ea_inode_array, ea_inode);
> +               if (err) {
> +                       ext4_warning_inode(ea_inode,
> +                                          "Expand inode array err=%d", err);
> +                       iput(ea_inode);
> +                       continue;
> +               }
> +
> +               err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
> +                                               dirty, block_csum);
> +               if (err) {
> +                       ext4_warning_inode(ea_inode, "Ensure credits err=%d",
> +                                          err);
> +                       continue;
> +               }
> +
> +               inode_lock(ea_inode);
> +               clear_nlink(ea_inode);
> +               ext4_orphan_add(handle, ea_inode);
> +               inode_unlock(ea_inode);
> +
> +               /*
> +                * Forget about ea_inode within the same transaction that
> +                * decrements the ref count. This avoids duplicate decrements in
> +                * case the rest of the work spills over to subsequent
> +                * transactions.
> +                */
> +               entry->e_value_inum = 0;
> +               entry->e_value_size = 0;
> +
> +               dirty = true;
> +       }
> +
> +       if (dirty) {
> +               /*
> +                * Note that we are deliberately skipping csum calculation for
> +                * the final update because we do not expect any journal
> +                * restarts until xattr block is freed.
> +                */
> +
> +               err = ext4_handle_dirty_metadata(handle, NULL, bh);
> +               if (err)
> +                       ext4_warning_inode(parent,
> +                                          "handle dirty metadata err=%d", err);
> +       }
> +}
> +
>  /*
>   * Release the xattr block BH: If the reference count is > 1, decrement it;
>   * otherwise free the block.
> @@ -1985,42 +2111,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>         return 0;
>  }
>
> -/**
> - * Add xattr inode to orphan list
> - */
> -static int
> -ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
> -                           struct ext4_xattr_inode_array *ea_inode_array)
> -{
> -       int idx = 0, error = 0;
> -       struct inode *ea_inode;
> -
> -       if (ea_inode_array == NULL)
> -               return 0;
> -
> -       for (; idx < ea_inode_array->count; ++idx) {
> -               if (!ext4_handle_has_enough_credits(handle, credits)) {
> -                       error = ext4_journal_extend(handle, credits);
> -                       if (error > 0)
> -                               error = ext4_journal_restart(handle, credits);
> -
> -                       if (error != 0) {
> -                               ext4_warning(inode->i_sb,
> -                                       "couldn't extend journal "
> -                                       "(err %d)", error);
> -                               return error;
> -                       }
> -               }
> -               ea_inode = ea_inode_array->inodes[idx];
> -               inode_lock(ea_inode);
> -               ext4_orphan_add(handle, ea_inode);
> -               inode_unlock(ea_inode);
> -               /* the inode's i_count will be released by caller */
> -       }
> -
> -       return 0;
> -}
> -
>  /*
>   * ext4_xattr_delete_inode()
>   *
> @@ -2033,16 +2123,23 @@ ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, int credits,
>   */
>  int
>  ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -                       struct ext4_xattr_inode_array **ea_inode_array)
> +                       struct ext4_xattr_inode_array **ea_inode_array,
> +                       int extra_credits)
>  {
>         struct buffer_head *bh = NULL;
>         struct ext4_xattr_ibody_header *header;
>         struct ext4_inode *raw_inode;
> -       struct ext4_iloc iloc;
> -       struct ext4_xattr_entry *entry;
> -       struct inode *ea_inode;
> -       unsigned int ea_ino;
> -       int credits = 3, error = 0;
> +       struct ext4_iloc iloc = { .bh = NULL };
> +       int error;
> +
> +       error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> +                                         NULL /* bh */,
> +                                         false /* dirty */,
> +                                         false /* block_csum */);
> +       if (error) {
> +               EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
> +               goto cleanup;
> +       }
>
>         if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
>                 goto delete_external_ea;
> @@ -2050,31 +2147,20 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>         error = ext4_get_inode_loc(inode, &iloc);
>         if (error)
>                 goto cleanup;
> +
> +       error = ext4_journal_get_write_access(handle, iloc.bh);
> +       if (error)
> +               goto cleanup;
> +
>         raw_inode = ext4_raw_inode(&iloc);
>         header = IHDR(inode, raw_inode);
> -       for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> -            entry = EXT4_XATTR_NEXT(entry)) {
> -               if (!entry->e_value_inum)
> -                       continue;
> -               ea_ino = le32_to_cpu(entry->e_value_inum);
> -               error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -               if (error)
> -                       continue;
> -               error = ext4_expand_inode_array(ea_inode_array, ea_inode);
> -               if (error) {
> -                       iput(ea_inode);
> -                       brelse(iloc.bh);
> -                       goto cleanup;
> -               }
> -               entry->e_value_inum = 0;
> -       }
> -       brelse(iloc.bh);
> +       ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> +                                   false /* block_csum */, ea_inode_array,
> +                                   extra_credits);
>
>  delete_external_ea:
>         if (!EXT4_I(inode)->i_file_acl) {
> -               /* add xattr inode to orphan list */
> -               error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> -                                                   *ea_inode_array);
> +               error = 0;
>                 goto cleanup;
>         }
>         bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> @@ -2092,46 +2178,32 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>                 goto cleanup;
>         }
>
> -       for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> -            entry = EXT4_XATTR_NEXT(entry)) {
> -               if (!entry->e_value_inum)
> -                       continue;
> -               ea_ino = le32_to_cpu(entry->e_value_inum);
> -               error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -               if (error)
> -                       continue;
> -               error = ext4_expand_inode_array(ea_inode_array, ea_inode);
> -               if (error)
> -                       goto cleanup;
> -               entry->e_value_inum = 0;
> -       }
> -
> -       /* add xattr inode to orphan list */
> -       error = ext4_xattr_inode_orphan_add(handle, inode, credits,
> -                                       *ea_inode_array);
> -       if (error)
> -               goto cleanup;
> -
> -       if (!IS_NOQUOTA(inode))
> -               credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
> -
> -       if (!ext4_handle_has_enough_credits(handle, credits)) {
> -               error = ext4_journal_extend(handle, credits);
> -               if (error > 0)
> -                       error = ext4_journal_restart(handle, credits);
> +       if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +               error = ext4_journal_get_write_access(handle, bh);
>                 if (error) {
> -                       ext4_warning(inode->i_sb,
> -                               "couldn't extend journal (err %d)", error);
> +                       EXT4_ERROR_INODE(inode, "write access %llu",
> +                                        EXT4_I(inode)->i_file_acl);
>                         goto cleanup;
>                 }
> +               ext4_xattr_inode_remove_all(handle, inode, bh,
> +                                           BFIRST(bh),
> +                                           true /* block_csum */,
> +                                           ea_inode_array,
> +                                           extra_credits);
>         }
>
>         ext4_xattr_release_block(handle, inode, bh);
> +       /* Update i_file_acl within the same transaction that releases block. */
>         EXT4_I(inode)->i_file_acl = 0;
> -
> +       error = ext4_mark_inode_dirty(handle, inode);
> +       if (error) {
> +               EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +                                error);
> +               goto cleanup;
> +       }
>  cleanup:
> +       brelse(iloc.bh);
>         brelse(bh);
> -
>         return error;
>  }
>
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index adf761518a73..b2005a2716d9 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -169,7 +169,8 @@ extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
>
>  extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -                                  struct ext4_xattr_inode_array **array);
> +                                  struct ext4_xattr_inode_array **array,
> +                                  int extra_credits);
>  extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
>
>  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
> --
> 2.13.1.518.g3df882009-goog
>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-20  9:07                   ` [PATCH v5 " Tahsin Erdogan
@ 2017-06-20  9:49                     ` Tahsin Erdogan
  2017-06-21 17:42                       ` Andreas Dilger
  2017-06-21 21:14                     ` Andreas Dilger
  2017-07-04 18:39                     ` Theodore Ts'o
  2 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:49 UTC (permalink / raw)
  To: Andreas Dilger, Darrick J . Wong, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

Thanks Andreas for the feedback. Please see my responses below:

> It would be preferable to allow a mount option like "no_mbcache" to disable
> the use of shared xattrs.  In the Lustre case at least, there will never be
> shared large xattrs, and we've had a bunch of performance issues with mbcache
> due to lock contention among many server threads doing concurrent lookups and
> inserting many thousands of unique entries into the cache.

I have put nombcache mount option in a separate patch ("[PATCH 32/32]
ext4: add nombcache mount option"). I have named it nombcache instead
of no_mbcache to be consistent with other no* options. Let me know if
you prefer no_mbcache as the option name.

> This should follow the existing convention of always using s_csum_seed to seed
> the checksum, and change ext4_fill_super() to initialize s_csum_seed to ~0 if
> ext4_has_metadata_csum() is false, or always use the same value regardless of
> whether ext4_has_metadata_csum() is set or not.

Done.

> If it really necessary to have more than 2^32 references on a single shared
> inode then it would be better to avoid the re-use of i_mtime, which breaks
> the backref for unshared xattrs, and using i_size isn't enough of a guarantee
> that this is the correct parent inode in case of on-disk corruption.

I have now moved the lower 32 bits of the ref count from i_mtime to l_i_version.

> Should this be contingent on ext4_has_metadata_csum() feature being enabled, or
> alternately check if EXT4_XATTR_INODE_GET_PARENT() and i_generation match before
> returning an error.  This will allow a smooth transition from existing filesystems
> that do not store the hash, but have only a single-use xattr inode with a parent
> backref.

I updated hash validation to fall back to the parent backref check for
backward compatibility.

>> +     /* Indirection block. */
>> +     blocks += 1;
>
> Strictly speaking, this is only needed "if (blocks > EXT4_NDIR_BLOCKS)".

Ack. I didn't think it was worth going through the exact calculation in
this case; let me know if you see value in doing that.
I also updated the comment to mention extents.

>> +     /* We may need to clone the existing xattr block in which case we need
>> +      * to increment ref counts for existing ea_inodes referenced by it.
>> +      */
>
> Just to clarify here, in the case of cloning an existing xattr block, are the
> refcounts being _incremented_ or _decremented_ on the existing ea_inodes?  I'm
> trying to figure out if we really need to have credits for both old and new
> xattr inodes, as well as these additional credits.  Since this is reserving
> about 110 blocks for every setxattr, this can add significant pressure on the
> journal if there are lots of threads creating files and/or setting xattrs.

Cloning increments the reference counts of the existing xattr inodes.

>> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
>> +                         size_t value_len, u32 hash)
>> {
>> +     struct inode *ea_inode;
>> +     struct mb_cache_entry *ce;
>> +     struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>> +     void *ea_data = NULL;
>>       int err;
>
> This function should just return NULL if ea_inode_cache is NULL (e.g. in
> the case of "no_mbcache" mount option).

Done in later patch ("[PATCH 32/32] ext4: add nombcache mount option")

> Should skip mb_cache if EA_INODE_CACHE(inode) is NULL, or have a wrapper
> like ext4_xattr_inode_cache_insert() to match ext4_xattr_inode_cache_find()
> that does the same.

Added skip in patch ("[PATCH 32/32] ext4: add nombcache mount option")



On Tue, Jun 20, 2017 at 2:07 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written the data blocks of these inodes are immutable.
>
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
>
> The deduplication is based on an in-memory hash lookup that is a best
> effort sharing scheme. When a xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
>
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v5:
>  - made ext4_meta_trans_blocks() static again since there are no
>    remaining users outside of inode.c
>  - initialize sbi->s_csum_seed when ea_inode feature is enabled
>  - use l_i_version to hold lower 32 bits of the xattr ref count.
>    This avoids clashes with old implementations which use i_mtime.
>    Since l_i_version is not available in HURD_COMPAT mode, fail mount
>    request when both ea_inode feature and HURD_COMPAT are set.
>  - when hash validation fails, fall back to old implementation
>    which has a backref to parent.
>  - fixed checkpatch.pl warning about using unsigned alone
>
> v4:
>  - eliminated xattr entry in the xattr inode to avoid complexity and
>    recursion in xattr update path. Now the ref count and hash are stored
>    in i_[c/m/a]time.tv_sec fields.
>  - some clean up in ext4_xattr_set_entry() to reduce code duplication and
>    complexity
>
> v3:
>  - use s_csum_seed for hash calculations when available
>  - return error on stored vs calculated hash mismatch
>
> v2:
>  - make dependency on crc32c dynamic
>  - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
>    they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
>
>  fs/ext4/acl.c   |    5 +-
>  fs/ext4/ext4.h  |   23 +-
>  fs/ext4/inode.c |   13 +-
>  fs/ext4/super.c |   37 +-
>  fs/ext4/xattr.c | 1038 +++++++++++++++++++++++++++++++++++++++++--------------
>  fs/ext4/xattr.h |   17 +-
>  fs/mbcache.c    |    9 +-
>  7 files changed, 848 insertions(+), 294 deletions(-)
>
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 74f7ac539e00..8db03e5c78bc 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>         if (error)
>                 return error;
>  retry:
> -       credits = ext4_xattr_set_credits(inode, acl_size);
> +       error = ext4_xattr_set_credits(inode, acl_size, &credits);
> +       if (error)
> +               return error;
> +
>         handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>         if (IS_ERR(handle))
>                 return PTR_ERR(handle);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..59e9488c4876 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>         long s_es_nr_inode;
>         struct ext4_es_stats s_es_stats;
>         struct mb_cache *s_mb_cache;
> +       struct mb_cache *s_ea_inode_cache;
>         spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>
>         /* Ratelimit ext4 messages. */
> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>         return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>  }
>
> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
> +static inline bool ext4_is_quota_file(struct inode *inode)
> +{
> +       return IS_NOQUOTA(inode) &&
> +              !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
> +}
>
>  /*
>   * This structure is stuffed into the struct file's private_data field
> @@ -2482,7 +2487,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
>  extern void ext4_set_inode_flags(struct inode *);
>  extern int ext4_alloc_da_blocks(struct inode *inode);
>  extern void ext4_set_aops(struct inode *inode);
> -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
>  extern int ext4_writepage_trans_blocks(struct inode *);
>  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
>  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
> @@ -2709,19 +2713,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
>  extern int ext4_register_li_request(struct super_block *sb,
>                                     ext4_group_t first_not_zeroed);
>
> -static inline int ext4_has_group_desc_csum(struct super_block *sb)
> -{
> -       return ext4_has_feature_gdt_csum(sb) ||
> -              EXT4_SB(sb)->s_chksum_driver != NULL;
> -}
> -
>  static inline int ext4_has_metadata_csum(struct super_block *sb)
>  {
>         WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
>                      !EXT4_SB(sb)->s_chksum_driver);
>
> -       return (EXT4_SB(sb)->s_chksum_driver != NULL);
> +       return ext4_has_feature_metadata_csum(sb) &&
> +              (EXT4_SB(sb)->s_chksum_driver != NULL);
>  }
> +
> +static inline int ext4_has_group_desc_csum(struct super_block *sb)
> +{
> +       return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
> +}
> +
>  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>  {
>         return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cd007f9757d1..ea95bd9eab81 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
>                                 unsigned int length);
>  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
>  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
> +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> +                                 int pextents);
>
>  /*
>   * Test whether an inode is a fast symlink.
> @@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>         }
>         brelse(iloc.bh);
>         ext4_set_inode_flags(inode);
> -       if (ei->i_flags & EXT4_EA_INODE_FL)
> +
> +       if (ei->i_flags & EXT4_EA_INODE_FL) {
>                 ext4_xattr_inode_set_class(inode);
> +
> +               inode_lock(inode);
> +               inode->i_flags |= S_NOQUOTA;
> +               inode_unlock(inode);
> +       }
> +
>         unlock_new_inode(inode);
>         return inode;
>
> @@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
>   *
>   * Also account for superblock, inode, quota and xattr blocks
>   */
> -int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
> +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>                                   int pextents)
>  {
>         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b02a23ec92ca..2bfacd737bb6 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>                 invalidate_bdev(sbi->journal_bdev);
>                 ext4_blkdev_remove(sbi);
>         }
> +       if (sbi->s_ea_inode_cache) {
> +               ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +               sbi->s_ea_inode_cache = NULL;
> +       }
>         if (sbi->s_mb_cache) {
>                 ext4_xattr_destroy_cache(sbi->s_mb_cache);
>                 sbi->s_mb_cache = NULL;
> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>         if (res)
>                 return res;
>  retry:
> -       credits = ext4_xattr_set_credits(inode, len);
> +       res = ext4_xattr_set_credits(inode, len, &credits);
> +       if (res)
> +               return res;
> +
>         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>         if (IS_ERR(handle))
>                 return PTR_ERR(handle);
> @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>         }
>
>         /* Load the checksum driver */
> -       if (ext4_has_feature_metadata_csum(sb)) {
> +       if (ext4_has_feature_metadata_csum(sb) ||
> +           ext4_has_feature_ea_inode(sb)) {
>                 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
>                 if (IS_ERR(sbi->s_chksum_driver)) {
>                         ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
> @@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>         /* Precompute checksum seed for all metadata */
>         if (ext4_has_feature_csum_seed(sb))
>                 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
> -       else if (ext4_has_metadata_csum(sb))
> +       else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
>                 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
>                                                sizeof(es->s_uuid));
>
> @@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>                                  "The Hurd can't support 64-bit file systems");
>                         goto failed_mount;
>                 }
> +
> +               /*
> +                * ea_inode feature uses l_i_version field which is not
> +                * available in HURD_COMPAT mode.
> +                */
> +               if (ext4_has_feature_ea_inode(sb)) {
> +                       ext4_msg(sb, KERN_ERR,
> +                                "ea_inode feature is not supported for Hurd");
> +                       goto failed_mount;
> +               }
>         }
>
>         if (IS_EXT2_SB(sb)) {
> @@ -4067,6 +4085,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>                 goto failed_mount_wq;
>         }
>
> +       if (ext4_has_feature_ea_inode(sb)) {
> +               sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> +               if (!sbi->s_ea_inode_cache) {
> +                       ext4_msg(sb, KERN_ERR,
> +                                "Failed to create an s_ea_inode_cache");
> +                       goto failed_mount_wq;
> +               }
> +       }
> +
>         if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>             (blocksize != PAGE_SIZE)) {
>                 ext4_msg(sb, KERN_ERR,
> @@ -4296,6 +4323,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>         if (EXT4_SB(sb)->rsv_conversion_wq)
>                 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>  failed_mount_wq:
> +       if (sbi->s_ea_inode_cache) {
> +               ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
> +               sbi->s_ea_inode_cache = NULL;
> +       }
>         if (sbi->s_mb_cache) {
>                 ext4_xattr_destroy_cache(sbi->s_mb_cache);
>                 sbi->s_mb_cache = NULL;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 0484df8dadd1..d7e60358ec91 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>  #define EXT4_GET_MB_CACHE(inode)       (((struct ext4_sb_info *) \
>                                 inode->i_sb->s_fs_info)->s_mb_cache)
>
> +#define EA_INODE_CACHE(inode)  (((struct ext4_sb_info *) \
> +                               inode->i_sb->s_fs_info)->s_ea_inode_cache)
> +
>  static int
>  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>                         struct inode *inode);
> @@ -280,15 +283,44 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
>         return cmp ? -ENODATA : 0;
>  }
>
> +static u32
> +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
> +{
> +       return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
> +}
> +
> +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
> +{
> +       return ((u64)ea_inode->i_ctime.tv_sec << 32) |
> +              ((u32)ea_inode->i_version);
> +}
> +
> +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
> +{
> +       ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
> +       ea_inode->i_version = (u32)ref_count;
> +}
> +
> +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
> +{
> +       return (u32)ea_inode->i_atime.tv_sec;
> +}
> +
> +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
> +{
> +       ea_inode->i_atime.tv_sec = hash;
> +}
> +
>  /*
>   * Read the EA value from an inode.
>   */
>  static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
>  {
>         unsigned long block = 0;
> -       struct buffer_head *bh = NULL;
> +       struct buffer_head *bh;
>         int blocksize = ea_inode->i_sb->s_blocksize;
>         size_t csize, copied = 0;
> +       void *copy_pos = buf;
>
>         while (copied < size) {
>                 csize = (size - copied) > blocksize ? blocksize : size - copied;
> @@ -298,10 +330,10 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
>                 if (!bh)
>                         return -EFSCORRUPTED;
>
> -               memcpy(buf, bh->b_data, csize);
> +               memcpy(copy_pos, bh->b_data, csize);
>                 brelse(bh);
>
> -               buf += csize;
> +               copy_pos += csize;
>                 block += 1;
>                 copied += csize;
>         }
> @@ -317,29 +349,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>         inode = ext4_iget(parent->i_sb, ea_ino);
>         if (IS_ERR(inode)) {
>                 err = PTR_ERR(inode);
> -               ext4_error(parent->i_sb, "error while reading EA inode %lu "
> -                          "err=%d", ea_ino, err);
> +               ext4_error(parent->i_sb,
> +                          "error while reading EA inode %lu err=%d", ea_ino,
> +                          err);
>                 return err;
>         }
>
>         if (is_bad_inode(inode)) {
> -               ext4_error(parent->i_sb, "error while reading EA inode %lu "
> -                          "is_bad_inode", ea_ino);
> +               ext4_error(parent->i_sb,
> +                          "error while reading EA inode %lu is_bad_inode",
> +                          ea_ino);
>                 err = -EIO;
>                 goto error;
>         }
>
> -       if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
> -           inode->i_generation != parent->i_generation) {
> -               ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
> -                          "to parent is invalid.", ea_ino);
> -               err = -EINVAL;
> -               goto error;
> -       }
> -
>         if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
> -               ext4_error(parent->i_sb, "EA inode %lu does not have "
> -                          "EXT4_EA_INODE_FL flag set.\n", ea_ino);
> +               ext4_error(parent->i_sb,
> +                          "EA inode %lu does not have EXT4_EA_INODE_FL flag",
> +                           ea_ino);
>                 err = -EINVAL;
>                 goto error;
>         }
> @@ -351,6 +378,20 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>         return err;
>  }
>
> +static int
> +ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size)
> +{
> +       u32 hash;
> +
> +       /* Verify stored hash matches calculated hash. */
> +       hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
> +       if (hash != ext4_xattr_inode_get_hash(ea_inode))
> +               return -EFSCORRUPTED;
> +       return 0;
> +}
> +
> +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
> +
>  /*
>   * Read the value from the EA inode.
>   */
> @@ -358,17 +399,53 @@ static int
>  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>                      size_t size)
>  {
> +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>         struct inode *ea_inode;
> -       int ret;
> +       int err;
>
> -       ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -       if (ret)
> -               return ret;
> +       err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +       if (err) {
> +               ea_inode = NULL;
> +               goto out;
> +       }
>
> -       ret = ext4_xattr_inode_read(ea_inode, buffer, size);
> -       iput(ea_inode);
> +       if (i_size_read(ea_inode) != size) {
> +               ext4_warning_inode(ea_inode,
> +                                  "ea_inode file size=%llu entry size=%zu",
> +                                  i_size_read(ea_inode), size);
> +               err = -EFSCORRUPTED;
> +               goto out;
> +       }
>
> -       return ret;
> +       err = ext4_xattr_inode_read(ea_inode, buffer, size);
> +       if (err)
> +               goto out;
> +
> +       err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size);
> +       /*
> +        * Compatibility check for old Lustre ea_inode implementation. Old
> +        * version does not have hash validation, but it has a backpointer
> +        * from ea_inode to the parent inode.
> +        */
> +       if (err == -EFSCORRUPTED) {
> +               if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
> +                   ea_inode->i_generation != inode->i_generation) {
> +                       ext4_warning_inode(ea_inode,
> +                                          "EA inode hash validation failed");
> +                       goto out;
> +               }
> +               /* Do not add ea_inode to the cache. */
> +               ea_inode_cache = NULL;
> +       } else if (err)
> +               goto out;
> +
> +       if (ea_inode_cache)
> +               mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
> +                                     ext4_xattr_inode_get_hash(ea_inode),
> +                                     ea_inode->i_ino, true /* reusable */);
> +out:
> +       iput(ea_inode);
> +       return err;
>  }
>
>  static int
> @@ -657,6 +734,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>         }
>  }
>
> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
> +{
> +       struct super_block *sb = inode->i_sb;
> +       size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
> +                                   inode->i_blkbits);
> +       size_t mask = ~(cluster_size - 1);
> +
> +       return (length + cluster_size - 1) & mask;
> +}
> +
> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
> +{
> +       int err;
> +
> +       err = dquot_alloc_inode(inode);
> +       if (err)
> +               return err;
> +       err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
> +       if (err)
> +               dquot_free_inode(inode);
> +       return err;
> +}
> +
> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
> +{
> +       dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
> +       dquot_free_inode(inode);
> +}
> +
> +static int __ext4_xattr_set_credits(struct super_block *sb,
> +                                   struct buffer_head *block_bh,
> +                                   size_t value_len)
> +{
> +       int credits;
> +       int blocks;
> +
> +       /*
> +        * 1) Owner inode update
> +        * 2) Ref count update on old xattr block
> +        * 3) new xattr block
> +        * 4) block bitmap update for new xattr block
> +        * 5) group descriptor for new xattr block
> +        */
> +       credits = 5;
> +
> +       /* We are done if ea_inode feature is not enabled. */
> +       if (!ext4_has_feature_ea_inode(sb))
> +               return credits;
> +
> +       /* New ea_inode, inode map, block bitmap, group descriptor. */
> +       credits += 4;
> +
> +       /* Data blocks. */
> +       blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
> +
> +       /* Indirection block or one level of extent tree. */
> +       blocks += 1;
> +
> +       /* Block bitmap and group descriptor updates for each block. */
> +       credits += blocks * 2;
> +
> +       /* Blocks themselves. */
> +       credits += blocks;
> +
> +       /* Dereference ea_inode holding old xattr value.
> +        * Old ea_inode, inode map, block bitmap, group descriptor.
> +        */
> +       credits += 4;
> +
> +       /* Data blocks for old ea_inode. */
> +       blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
> +
> +       /* Indirection block or one level of extent tree for old ea_inode. */
> +       blocks += 1;
> +
> +       /* Block bitmap and group descriptor updates for each block. */
> +       credits += blocks * 2;
> +
> +       /* Quota updates. */
> +       credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
> +
> +       /* We may need to clone the existing xattr block in which case we need
> +        * to increment ref counts for existing ea_inodes referenced by it.
> +        */
> +       if (block_bh) {
> +               struct ext4_xattr_entry *entry = BFIRST(block_bh);
> +
> +               for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
> +                       if (entry->e_value_inum)
> +                               /* Ref count update on ea_inode. */
> +                               credits += 1;
> +       }
> +       return credits;
> +}
> +
>  static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>                                      int credits, struct buffer_head *bh,
>                                      bool dirty, bool block_csum)
> @@ -706,12 +878,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>         return 0;
>  }
>
> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> +                                      int ref_change)
> +{
> +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> +       struct ext4_iloc iloc;
> +       s64 ref_count;
> +       u32 hash;
> +       int ret;
> +
> +       inode_lock(ea_inode);
> +
> +       ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
> +       if (ret) {
> +               iloc.bh = NULL;
> +               goto out;
> +       }
> +
> +       ref_count = ext4_xattr_inode_get_ref(ea_inode);
> +       ref_count += ref_change;
> +       ext4_xattr_inode_set_ref(ea_inode, ref_count);
> +
> +       if (ref_change > 0) {
> +               WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
> +                         ea_inode->i_ino, ref_count);
> +
> +               if (ref_count == 1) {
> +                       WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
> +                                 ea_inode->i_ino, ea_inode->i_nlink);
> +
> +                       set_nlink(ea_inode, 1);
> +                       ext4_orphan_del(handle, ea_inode);
> +
> +                       hash = ext4_xattr_inode_get_hash(ea_inode);
> +                       mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
> +                                             ea_inode->i_ino,
> +                                             true /* reusable */);
> +               }
> +       } else {
> +               WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
> +                         ea_inode->i_ino, ref_count);
> +
> +               if (ref_count == 0) {
> +                       WARN_ONCE(ea_inode->i_nlink != 1,
> +                                 "EA inode %lu i_nlink=%u",
> +                                 ea_inode->i_ino, ea_inode->i_nlink);
> +
> +                       clear_nlink(ea_inode);
> +                       ext4_orphan_add(handle, ea_inode);
> +
> +                       hash = ext4_xattr_inode_get_hash(ea_inode);
> +                       mb_cache_entry_delete(ea_inode_cache, hash,
> +                                             ea_inode->i_ino);
> +               }
> +       }
> +
> +       ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
> +       iloc.bh = NULL;
> +       if (ret)
> +               ext4_warning_inode(ea_inode,
> +                                  "ext4_mark_iloc_dirty() failed ret=%d", ret);
> +out:
> +       brelse(iloc.bh);
> +       inode_unlock(ea_inode);
> +       return ret;
> +}
> +
> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +       return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
> +}
> +
> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
> +{
> +       return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
> +}
> +
> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
> +                                       struct ext4_xattr_entry *first)
> +{
> +       struct inode *ea_inode;
> +       struct ext4_xattr_entry *entry;
> +       struct ext4_xattr_entry *failed_entry;
> +       unsigned int ea_ino;
> +       int err, saved_err;
> +
> +       for (entry = first; !IS_LAST_ENTRY(entry);
> +            entry = EXT4_XATTR_NEXT(entry)) {
> +               if (!entry->e_value_inum)
> +                       continue;
> +               ea_ino = le32_to_cpu(entry->e_value_inum);
> +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +               if (err)
> +                       goto cleanup;
> +               err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +               if (err) {
> +                       ext4_warning_inode(ea_inode, "inc ref error %d", err);
> +                       iput(ea_inode);
> +                       goto cleanup;
> +               }
> +               iput(ea_inode);
> +       }
> +       return 0;
> +
> +cleanup:
> +       saved_err = err;
> +       failed_entry = entry;
> +
> +       for (entry = first; entry != failed_entry;
> +            entry = EXT4_XATTR_NEXT(entry)) {
> +               if (!entry->e_value_inum)
> +                       continue;
> +               ea_ino = le32_to_cpu(entry->e_value_inum);
> +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
> +               if (err) {
> +                       ext4_warning(parent->i_sb,
> +                                    "cleanup ea_ino %u iget error %d", ea_ino,
> +                                    err);
> +                       continue;
> +               }
> +               err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +               if (err)
> +                       ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
> +                                          err);
> +               iput(ea_inode);
> +       }
> +       return saved_err;
> +}
> +
>  static void
> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
> -                           struct buffer_head *bh,
> -                           struct ext4_xattr_entry *first, bool block_csum,
> -                           struct ext4_xattr_inode_array **ea_inode_array,
> -                           int extra_credits)
> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> +                            struct buffer_head *bh,
> +                            struct ext4_xattr_entry *first, bool block_csum,
> +                            struct ext4_xattr_inode_array **ea_inode_array,
> +                            int extra_credits, bool skip_quota)
>  {
>         struct inode *ea_inode;
>         struct ext4_xattr_entry *entry;
> @@ -748,10 +1048,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>                         continue;
>                 }
>
> -               inode_lock(ea_inode);
> -               clear_nlink(ea_inode);
> -               ext4_orphan_add(handle, ea_inode);
> -               inode_unlock(ea_inode);
> +               err = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +               if (err) {
> +                       ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> +                                          err);
> +                       continue;
> +               }
> +
> +               if (!skip_quota)
> +                       ext4_xattr_inode_free_quota(parent,
> +                                             le32_to_cpu(entry->e_value_size));
>
>                 /*
>                  * Forget about ea_inode within the same transaction that
> @@ -785,7 +1091,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>   */
>  static void
>  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> -                        struct buffer_head *bh)
> +                        struct buffer_head *bh,
> +                        struct ext4_xattr_inode_array **ea_inode_array,
> +                        int extra_credits)
>  {
>         struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>         u32 hash, ref;
> @@ -808,6 +1116,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>                 mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>                 get_bh(bh);
>                 unlock_buffer(bh);
> +
> +               if (ext4_has_feature_ea_inode(inode->i_sb))
> +                       ext4_xattr_inode_dec_ref_all(handle, inode, bh,
> +                                                    BFIRST(bh),
> +                                                    true /* block_csum */,
> +                                                    ea_inode_array,
> +                                                    extra_credits,
> +                                                    true /* skip_quota */);
>                 ext4_free_blocks(handle, inode, bh, 0, 1,
>                                  EXT4_FREE_BLOCKS_METADATA |
>                                  EXT4_FREE_BLOCKS_FORGET);
> @@ -879,8 +1195,8 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>  {
>         struct buffer_head *bh = NULL;
>         unsigned long block = 0;
> -       unsigned blocksize = ea_inode->i_sb->s_blocksize;
> -       unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
> +       int blocksize = ea_inode->i_sb->s_blocksize;
> +       int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
>         int csize, wsize = 0;
>         int ret = 0;
>         int retries = 0;
> @@ -948,7 +1264,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>   * Create an inode to store the value of a large EA.
>   */
>  static struct inode *ext4_xattr_inode_create(handle_t *handle,
> -                                            struct inode *inode)
> +                                            struct inode *inode, u32 hash)
>  {
>         struct inode *ea_inode = NULL;
>         uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
> @@ -966,67 +1282,115 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>                 ea_inode->i_fop = &ext4_file_operations;
>                 ext4_set_aops(ea_inode);
>                 ext4_xattr_inode_set_class(ea_inode);
> -               ea_inode->i_generation = inode->i_generation;
> -               EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
> -
> -               /*
> -                * A back-pointer from EA inode to parent inode will be useful
> -                * for e2fsck.
> -                */
> -               EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>                 unlock_new_inode(ea_inode);
> -               err = ext4_inode_attach_jinode(ea_inode);
> +               ext4_xattr_inode_set_ref(ea_inode, 1);
> +               ext4_xattr_inode_set_hash(ea_inode, hash);
> +               err = ext4_mark_inode_dirty(handle, ea_inode);
> +               if (!err)
> +                       err = ext4_inode_attach_jinode(ea_inode);
>                 if (err) {
>                         iput(ea_inode);
>                         return ERR_PTR(err);
>                 }
> +
> +               /*
> +                * Xattr inodes are shared therefore quota charging is performed
> +                * at a higher level.
> +                */
> +               dquot_free_inode(ea_inode);
> +               dquot_drop(ea_inode);
> +               inode_lock(ea_inode);
> +               ea_inode->i_flags |= S_NOQUOTA;
> +               inode_unlock(ea_inode);
>         }
>
>         return ea_inode;
>  }
>
> -/*
> - * Unlink the inode storing the value of the EA.
> - */
> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
> +static struct inode *
> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> +                           size_t value_len, u32 hash)
>  {
> -       struct inode *ea_inode = NULL;
> -       int err;
> +       struct inode *ea_inode;
> +       struct mb_cache_entry *ce;
> +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
> +       void *ea_data;
>
> -       err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> -       if (err)
> -               return err;
> +       ce = mb_cache_entry_find_first(ea_inode_cache, hash);
> +       if (!ce)
> +               return NULL;
>
> -       clear_nlink(ea_inode);
> -       iput(ea_inode);
> +       ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
> +       if (!ea_data) {
> +               mb_cache_entry_put(ea_inode_cache, ce);
> +               return NULL;
> +       }
>
> -       return 0;
> +       while (ce) {
> +               ea_inode = ext4_iget(inode->i_sb, ce->e_value);
> +               if (!IS_ERR(ea_inode) &&
> +                   !is_bad_inode(ea_inode) &&
> +                   (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
> +                   i_size_read(ea_inode) == value_len &&
> +                   !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
> +                   !ext4_xattr_inode_verify_hash(ea_inode, ea_data,
> +                                                 value_len) &&
> +                   !memcmp(value, ea_data, value_len)) {
> +                       mb_cache_entry_touch(ea_inode_cache, ce);
> +                       mb_cache_entry_put(ea_inode_cache, ce);
> +                       kvfree(ea_data);
> +                       return ea_inode;
> +               }
> +
> +               if (!IS_ERR(ea_inode))
> +                       iput(ea_inode);
> +               ce = mb_cache_entry_find_next(ea_inode_cache, ce);
> +       }
> +       kvfree(ea_data);
> +       return NULL;
>  }
>
>  /*
>   * Add value of the EA in an inode.
>   */
> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
> -                               unsigned long *ea_ino, const void *value,
> -                               size_t value_len)
> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
> +                                         const void *value, size_t value_len,
> +                                         struct inode **ret_inode)
>  {
>         struct inode *ea_inode;
> +       u32 hash;
>         int err;
>
> +       hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
> +       ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
> +       if (ea_inode) {
> +               err = ext4_xattr_inode_inc_ref(handle, ea_inode);
> +               if (err) {
> +                       iput(ea_inode);
> +                       return err;
> +               }
> +
> +               *ret_inode = ea_inode;
> +               return 0;
> +       }
> +
>         /* Create an inode for the EA value */
> -       ea_inode = ext4_xattr_inode_create(handle, inode);
> +       ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>         if (IS_ERR(ea_inode))
>                 return PTR_ERR(ea_inode);
>
>         err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
> -       if (err)
> -               clear_nlink(ea_inode);
> -       else
> -               *ea_ino = ea_inode->i_ino;
> +       if (err) {
> +               ext4_xattr_inode_dec_ref(handle, ea_inode);
> +               iput(ea_inode);
> +               return err;
> +       }
>
> -       iput(ea_inode);
> +       mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
> +                             ea_inode->i_ino, true /* reusable */);
>
> -       return err;
> +       *ret_inode = ea_inode;
> +       return 0;
>  }
>
>  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> @@ -1034,9 +1398,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>                                 handle_t *handle, struct inode *inode)
>  {
>         struct ext4_xattr_entry *last;
> -       size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
> +       struct ext4_xattr_entry *here = s->here;
> +       size_t min_offs = s->end - s->base, name_len = strlen(i->name);
>         int in_inode = i->in_inode;
> -       int rc;
> +       struct inode *old_ea_inode = NULL;
> +       struct inode *new_ea_inode = NULL;
> +       size_t old_size, new_size;
> +       int ret;
> +
> +       /* Space used by old and new values. */
> +       old_size = (!s->not_found && !here->e_value_inum) ?
> +                       EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
> +       new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
> +
> +       /*
> +        * Optimization for the simple case when old and new values have the
> +        * same padded sizes. Not applicable if external inodes are involved.
> +        */
> +       if (new_size && new_size == old_size) {
> +               size_t offs = le16_to_cpu(here->e_value_offs);
> +               void *val = s->base + offs;
> +
> +               here->e_value_size = cpu_to_le32(i->value_len);
> +               if (i->value == EXT4_ZERO_XATTR_VALUE) {
> +                       memset(val, 0, new_size);
> +               } else {
> +                       memcpy(val, i->value, i->value_len);
> +                       /* Clear padding bytes. */
> +                       memset(val + i->value_len, 0, new_size - i->value_len);
> +               }
> +               return 0;
> +       }
>
>         /* Compute min_offs and last. */
>         last = s->first;
> @@ -1047,122 +1439,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>                                 min_offs = offs;
>                 }
>         }
> -       free = min_offs - ((void *)last - s->base) - sizeof(__u32);
> -       if (!s->not_found) {
> -               if (!in_inode &&
> -                   !s->here->e_value_inum && s->here->e_value_size) {
> -                       size_t size = le32_to_cpu(s->here->e_value_size);
> -                       free += EXT4_XATTR_SIZE(size);
> -               }
> -               free += EXT4_XATTR_LEN(name_len);
> -       }
> +
> +       /* Check whether we have enough space. */
>         if (i->value) {
> -               size_t value_len = EXT4_XATTR_SIZE(i->value_len);
> +               size_t free;
>
> -               if (in_inode)
> -                       value_len = 0;
> +               free = min_offs - ((void *)last - s->base) - sizeof(__u32);
> +               if (!s->not_found)
> +                       free += EXT4_XATTR_LEN(name_len) + old_size;
>
> -               if (free < EXT4_XATTR_LEN(name_len) + value_len)
> -                       return -ENOSPC;
> +               if (free < EXT4_XATTR_LEN(name_len) + new_size) {
> +                       ret = -ENOSPC;
> +                       goto out;
> +               }
>         }
>
> -       if (i->value && s->not_found) {
> -               /* Insert the new name. */
> -               size_t size = EXT4_XATTR_LEN(name_len);
> -               size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
> -               memmove((void *)s->here + size, s->here, rest);
> -               memset(s->here, 0, size);
> -               s->here->e_name_index = i->name_index;
> -               s->here->e_name_len = name_len;
> -               memcpy(s->here->e_name, i->name, name_len);
> -       } else {
> -               if (!s->here->e_value_inum && s->here->e_value_size &&
> -                   s->here->e_value_offs > 0) {
> -                       void *first_val = s->base + min_offs;
> -                       size_t offs = le16_to_cpu(s->here->e_value_offs);
> -                       void *val = s->base + offs;
> -                       size_t size = EXT4_XATTR_SIZE(
> -                               le32_to_cpu(s->here->e_value_size));
> -
> -                       if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
> -                               /* The old and the new value have the same
> -                                  size. Just replace. */
> -                               s->here->e_value_size =
> -                                       cpu_to_le32(i->value_len);
> -                               if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -                                       memset(val, 0, size);
> -                               } else {
> -                                       /* Clear pad bytes first. */
> -                                       memset(val + size - EXT4_XATTR_PAD, 0,
> -                                              EXT4_XATTR_PAD);
> -                                       memcpy(val, i->value, i->value_len);
> -                               }
> -                               return 0;
> -                       }
> +       /*
> +        * Getting access to old and new ea inodes is subject to failures.
> +        * Finish that work before doing any modifications to the xattr data.
> +        */
> +       if (!s->not_found && here->e_value_inum) {
> +               ret = ext4_xattr_inode_iget(inode,
> +                                           le32_to_cpu(here->e_value_inum),
> +                                           &old_ea_inode);
> +               if (ret) {
> +                       old_ea_inode = NULL;
> +                       goto out;
> +               }
> +       }
> +       if (i->value && in_inode) {
> +               WARN_ON_ONCE(!i->value_len);
>
> -                       /* Remove the old value. */
> -                       memmove(first_val + size, first_val, val - first_val);
> -                       memset(first_val, 0, size);
> -                       s->here->e_value_size = 0;
> -                       s->here->e_value_offs = 0;
> -                       min_offs += size;
> -
> -                       /* Adjust all value offsets. */
> -                       last = s->first;
> -                       while (!IS_LAST_ENTRY(last)) {
> -                               size_t o = le16_to_cpu(last->e_value_offs);
> -                               if (!last->e_value_inum &&
> -                                   last->e_value_size && o < offs)
> -                                       last->e_value_offs =
> -                                               cpu_to_le16(o + size);
> -                               last = EXT4_XATTR_NEXT(last);
> -                       }
> +               ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
> +               if (ret)
> +                       goto out;
> +
> +               ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
> +                                                    i->value_len,
> +                                                    &new_ea_inode);
> +               if (ret) {
> +                       new_ea_inode = NULL;
> +                       ext4_xattr_inode_free_quota(inode, i->value_len);
> +                       goto out;
>                 }
> -               if (s->here->e_value_inum) {
> -                       ext4_xattr_inode_unlink(inode,
> -                                           le32_to_cpu(s->here->e_value_inum));
> -                       s->here->e_value_inum = 0;
> +       }
> +
> +       if (old_ea_inode) {
> +               /* We are ready to release ref count on the old_ea_inode. */
> +               ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
> +               if (ret) {
> +                       /* Release newly acquired ref count on new_ea_inode. */
> +                       if (new_ea_inode) {
> +                               int err;
> +
> +                               err = ext4_xattr_inode_dec_ref(handle,
> +                                                              new_ea_inode);
> +                               if (err)
> +                                       ext4_warning_inode(new_ea_inode,
> +                                                 "dec ref new_ea_inode err=%d",
> +                                                 err);
> +                               ext4_xattr_inode_free_quota(inode,
> +                                                           i->value_len);
> +                       }
> +                       goto out;
>                 }
> -               if (!i->value) {
> -                       /* Remove the old name. */
> -                       size_t size = EXT4_XATTR_LEN(name_len);
> -                       last = ENTRY((void *)last - size);
> -                       memmove(s->here, (void *)s->here + size,
> -                               (void *)last - (void *)s->here + sizeof(__u32));
> -                       memset(last, 0, size);
> +
> +               ext4_xattr_inode_free_quota(inode,
> +                                           le32_to_cpu(here->e_value_size));
> +       }
> +
> +       /* No failures allowed past this point. */
> +
> +       if (!s->not_found && here->e_value_offs) {
> +               /* Remove the old value. */
> +               void *first_val = s->base + min_offs;
> +               size_t offs = le16_to_cpu(here->e_value_offs);
> +               void *val = s->base + offs;
> +
> +               memmove(first_val + old_size, first_val, val - first_val);
> +               memset(first_val, 0, old_size);
> +               min_offs += old_size;
> +
> +               /* Adjust all value offsets. */
> +               last = s->first;
> +               while (!IS_LAST_ENTRY(last)) {
> +                       size_t o = le16_to_cpu(last->e_value_offs);
> +
> +                       if (!last->e_value_inum &&
> +                           last->e_value_size && o < offs)
> +                               last->e_value_offs = cpu_to_le16(o + old_size);
> +                       last = EXT4_XATTR_NEXT(last);
>                 }
>         }
>
> +       if (!i->value) {
> +               /* Remove old name. */
> +               size_t size = EXT4_XATTR_LEN(name_len);
> +
> +               last = ENTRY((void *)last - size);
> +               memmove(here, (void *)here + size,
> +                       (void *)last - (void *)here + sizeof(__u32));
> +               memset(last, 0, size);
> +       } else if (s->not_found) {
> +               /* Insert new name. */
> +               size_t size = EXT4_XATTR_LEN(name_len);
> +               size_t rest = (void *)last - (void *)here + sizeof(__u32);
> +
> +               memmove((void *)here + size, here, rest);
> +               memset(here, 0, size);
> +               here->e_name_index = i->name_index;
> +               here->e_name_len = name_len;
> +               memcpy(here->e_name, i->name, name_len);
> +       } else {
> +               /* This is an update, reset value info. */
> +               here->e_value_inum = 0;
> +               here->e_value_offs = 0;
> +               here->e_value_size = 0;
> +       }
> +
>         if (i->value) {
> -               /* Insert the new value. */
> +               /* Insert new value. */
>                 if (in_inode) {
> -                       unsigned long ea_ino =
> -                               le32_to_cpu(s->here->e_value_inum);
> -                       rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
> -                                                 i->value, i->value_len);
> -                       if (rc)
> -                               goto out;
> -                       s->here->e_value_inum = cpu_to_le32(ea_ino);
> -                       s->here->e_value_offs = 0;
> +                       here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>                 } else if (i->value_len) {
> -                       size_t size = EXT4_XATTR_SIZE(i->value_len);
> -                       void *val = s->base + min_offs - size;
> -                       s->here->e_value_offs = cpu_to_le16(min_offs - size);
> -                       s->here->e_value_inum = 0;
> +                       void *val = s->base + min_offs - new_size;
> +
> +                       here->e_value_offs = cpu_to_le16(min_offs - new_size);
>                         if (i->value == EXT4_ZERO_XATTR_VALUE) {
> -                               memset(val, 0, size);
> +                               memset(val, 0, new_size);
>                         } else {
> -                               /* Clear the pad bytes first. */
> -                               memset(val + size - EXT4_XATTR_PAD, 0,
> -                                      EXT4_XATTR_PAD);
>                                 memcpy(val, i->value, i->value_len);
> +                               /* Clear padding bytes. */
> +                               memset(val + i->value_len, 0,
> +                                      new_size - i->value_len);
>                         }
>                 }
> -               s->here->e_value_size = cpu_to_le32(i->value_len);
> +               here->e_value_size = cpu_to_le32(i->value_len);
>         }
> -
> +       ret = 0;
>  out:
> -       return rc;
> +       iput(old_ea_inode);
> +       iput(new_ea_inode);
> +       return ret;
>  }
>
>  struct ext4_xattr_block_find {
> @@ -1224,6 +1642,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>         struct mb_cache_entry *ce = NULL;
>         int error = 0;
>         struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
> +       struct inode *ea_inode = NULL;
> +       size_t old_ea_inode_size = 0;
>
>  #define header(x) ((struct ext4_xattr_header *)(x))
>
> @@ -1278,6 +1698,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>                         header(s->base)->h_refcount = cpu_to_le32(1);
>                         s->here = ENTRY(s->base + offset);
>                         s->end = s->base + bs->bh->b_size;
> +
> +                       /*
> +                        * If existing entry points to an xattr inode, we need
> +                        * to prevent ext4_xattr_set_entry() from decrementing
> +                        * ref count on it because the reference belongs to the
> +                        * original block. In this case, make the entry look
> +                        * like it has an empty value.
> +                        */
> +                       if (!s->not_found && s->here->e_value_inum) {
> +                               /*
> +                                * Defer quota free call for previous inode
> +                                * until success is guaranteed.
> +                                */
> +                               old_ea_inode_size = le32_to_cpu(
> +                                                       s->here->e_value_size);
> +                               s->here->e_value_inum = 0;
> +                               s->here->e_value_size = 0;
> +                       }
>                 }
>         } else {
>                 /* Allocate a buffer where we construct the new block. */
> @@ -1299,6 +1737,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>                 goto bad_block;
>         if (error)
>                 goto cleanup;
> +
> +       if (i->value && s->here->e_value_inum) {
> +               unsigned int ea_ino;
> +
> +               /*
> +                * A ref count on ea_inode has been taken as part of the call to
> +                * ext4_xattr_set_entry() above. We would like to drop this
> +                * extra ref but we have to wait until the xattr block is
> +                * initialized and has its own ref count on the ea_inode.
> +                */
> +               ea_ino = le32_to_cpu(s->here->e_value_inum);
> +               error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
> +               if (error) {
> +                       ea_inode = NULL;
> +                       goto cleanup;
> +               }
> +       }
> +
>         if (!IS_LAST_ENTRY(s->first))
>                 ext4_xattr_rehash(header(s->base), s->here);
>
> @@ -1409,6 +1865,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>                                                  EXT4_FREE_BLOCKS_METADATA);
>                                 goto cleanup;
>                         }
> +                       error = ext4_xattr_inode_inc_ref_all(handle, inode,
> +                                                     ENTRY(header(s->base)+1));
> +                       if (error)
> +                               goto getblk_failed;
> +                       if (ea_inode) {
> +                               /* Drop the extra ref on ea_inode. */
> +                               error = ext4_xattr_inode_dec_ref(handle,
> +                                                                ea_inode);
> +                               if (error)
> +                                       ext4_warning_inode(ea_inode,
> +                                                          "dec ref error=%d",
> +                                                          error);
> +                               iput(ea_inode);
> +                               ea_inode = NULL;
> +                       }
> +
>                         lock_buffer(new_bh);
>                         error = ext4_journal_get_create_access(handle, new_bh);
>                         if (error) {
> @@ -1428,15 +1900,38 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>                 }
>         }
>
> +       if (old_ea_inode_size)
> +               ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
> +
>         /* Update the inode. */
>         EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>
>         /* Drop the previous xattr block. */
> -       if (bs->bh && bs->bh != new_bh)
> -               ext4_xattr_release_block(handle, inode, bs->bh);
> +       if (bs->bh && bs->bh != new_bh) {
> +               struct ext4_xattr_inode_array *ea_inode_array = NULL;
> +
> +               ext4_xattr_release_block(handle, inode, bs->bh,
> +                                        &ea_inode_array,
> +                                        0 /* extra_credits */);
> +               ext4_xattr_inode_array_free(ea_inode_array);
> +       }
>         error = 0;
>
>  cleanup:
> +       if (ea_inode) {
> +               int error2;
> +
> +               error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
> +               if (error2)
> +                       ext4_warning_inode(ea_inode, "dec ref error=%d",
> +                                          error2);
> +
> +               /* If there was an error, revert the quota charge. */
> +               if (error)
> +                       ext4_xattr_inode_free_quota(inode,
> +                                                   i_size_read(ea_inode));
> +               iput(ea_inode);
> +       }
>         if (ce)
>                 mb_cache_entry_put(ext4_mb_cache, ce);
>         brelse(new_bh);
> @@ -1561,6 +2056,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>         return !memcmp(value, i->value, i->value_len);
>  }
>
> +static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
> +{
> +       struct buffer_head *bh;
> +       int error;
> +
> +       if (!EXT4_I(inode)->i_file_acl)
> +               return NULL;
> +       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +       if (!bh)
> +               return ERR_PTR(-EIO);
> +       error = ext4_xattr_check_block(inode, bh);
> +       if (error)
> +               return ERR_PTR(error);
> +       return bh;
> +}
> +
>  /*
>   * ext4_xattr_set_handle()
>   *
> @@ -1603,9 +2114,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>
>         /* Check journal credits under write lock. */
>         if (ext4_handle_valid(handle)) {
> +               struct buffer_head *bh;
>                 int credits;
>
> -               credits = ext4_xattr_set_credits(inode, value_len);
> +               bh = ext4_xattr_get_block(inode);
> +               if (IS_ERR(bh)) {
> +                       error = PTR_ERR(bh);
> +                       goto cleanup;
> +               }
> +
> +               credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +               brelse(bh);
> +
>                 if (!ext4_handle_has_enough_credits(handle, credits)) {
>                         error = -ENOSPC;
>                         goto cleanup;
> @@ -1641,6 +2161,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>                 if (flags & XATTR_CREATE)
>                         goto cleanup;
>         }
> +
>         if (!value) {
>                 if (!is.s.not_found)
>                         error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> @@ -1709,34 +2230,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>         return error;
>  }
>
> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>  {
> -       struct super_block *sb = inode->i_sb;
> -       int credits;
> -
> -       if (!EXT4_SB(sb)->s_journal)
> -               return 0;
> +       struct buffer_head *bh;
> +       int err;
>
> -       credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +       *credits = 0;
>
> -       /*
> -        * In case of inline data, we may push out the data to a block,
> -        * so we need to reserve credits for this eventuality
> -        */
> -       if (ext4_has_inline_data(inode))
> -               credits += ext4_writepage_trans_blocks(inode) + 1;
> -
> -       if (ext4_has_feature_ea_inode(sb)) {
> -               int nrblocks = (value_len + sb->s_blocksize - 1) >>
> -                                       sb->s_blocksize_bits;
> +       if (!EXT4_SB(inode->i_sb)->s_journal)
> +               return 0;
>
> -               /* For new inode */
> -               credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
> +       down_read(&EXT4_I(inode)->xattr_sem);
>
> -               /* For data blocks of EA inode */
> -               credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
> +       bh = ext4_xattr_get_block(inode);
> +       if (IS_ERR(bh)) {
> +               err = PTR_ERR(bh);
> +       } else {
> +               *credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
> +               brelse(bh);
> +               err = 0;
>         }
> -       return credits;
> +
> +       up_read(&EXT4_I(inode)->xattr_sem);
> +       return err;
>  }
>
>  /*
> @@ -1761,7 +2277,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>                 return error;
>
>  retry:
> -       credits = ext4_xattr_set_credits(inode, value_len);
> +       error = ext4_xattr_set_credits(inode, value_len, &credits);
> +       if (error)
> +               return error;
> +
>         handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>         if (IS_ERR(handle)) {
>                 error = PTR_ERR(handle);
> @@ -2067,10 +2586,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>         return error;
>  }
>
> -
>  #define EIA_INCR 16 /* must be 2^n */
>  #define EIA_MASK (EIA_INCR - 1)
> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
> +
> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>   * If @ea_inode_array is new or full it will be grown and the old
>   * contents copied over.
>   */
> @@ -2115,21 +2634,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>   * ext4_xattr_delete_inode()
>   *
>   * Free extended attribute resources associated with this inode. Traverse
> - * all entries and unlink any xattr inodes associated with this inode. This
> - * is called immediately before an inode is freed. We have exclusive
> - * access to the inode. If an orphan inode is deleted it will also delete any
> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
> - * to ensure they belong to the parent inode and were not deleted already.
> + * all entries and decrement reference on any xattr inodes associated with this
> + * inode. This is called immediately before an inode is freed. We have exclusive
> + * access to the inode. If an orphan inode is deleted it will also release its
> + * references on xattr block and xattr inodes.
>   */
> -int
> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> -                       struct ext4_xattr_inode_array **ea_inode_array,
> -                       int extra_credits)
> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> +                           struct ext4_xattr_inode_array **ea_inode_array,
> +                           int extra_credits)
>  {
>         struct buffer_head *bh = NULL;
>         struct ext4_xattr_ibody_header *header;
> -       struct ext4_inode *raw_inode;
>         struct ext4_iloc iloc = { .bh = NULL };
> +       struct ext4_xattr_entry *entry;
>         int error;
>
>         error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
> @@ -2141,66 +2658,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>                 goto cleanup;
>         }
>
> -       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
> -               goto delete_external_ea;
> +       if (ext4_has_feature_ea_inode(inode->i_sb) &&
> +           ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>
> -       error = ext4_get_inode_loc(inode, &iloc);
> -       if (error)
> -               goto cleanup;
> -
> -       error = ext4_journal_get_write_access(handle, iloc.bh);
> -       if (error)
> -               goto cleanup;
> +               error = ext4_get_inode_loc(inode, &iloc);
> +               if (error) {
> +                       EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
> +                       goto cleanup;
> +               }
>
> -       raw_inode = ext4_raw_inode(&iloc);
> -       header = IHDR(inode, raw_inode);
> -       ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
> -                                   false /* block_csum */, ea_inode_array,
> -                                   extra_credits);
> +               error = ext4_journal_get_write_access(handle, iloc.bh);
> +               if (error) {
> +                       EXT4_ERROR_INODE(inode, "write access (error %d)",
> +                                        error);
> +                       goto cleanup;
> +               }
>
> -delete_external_ea:
> -       if (!EXT4_I(inode)->i_file_acl) {
> -               error = 0;
> -               goto cleanup;
> -       }
> -       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> -       if (!bh) {
> -               EXT4_ERROR_INODE(inode, "block %llu read error",
> -                                EXT4_I(inode)->i_file_acl);
> -               error = -EIO;
> -               goto cleanup;
> -       }
> -       if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
> -           BHDR(bh)->h_blocks != cpu_to_le32(1)) {
> -               EXT4_ERROR_INODE(inode, "bad block %llu",
> -                                EXT4_I(inode)->i_file_acl);
> -               error = -EFSCORRUPTED;
> -               goto cleanup;
> +               header = IHDR(inode, ext4_raw_inode(&iloc));
> +               if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
> +                       ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
> +                                                    IFIRST(header),
> +                                                    false /* block_csum */,
> +                                                    ea_inode_array,
> +                                                    extra_credits,
> +                                                    false /* skip_quota */);
>         }
>
> -       if (ext4_has_feature_ea_inode(inode->i_sb)) {
> -               error = ext4_journal_get_write_access(handle, bh);
> -               if (error) {
> -                       EXT4_ERROR_INODE(inode, "write access %llu",
> +       if (EXT4_I(inode)->i_file_acl) {
> +               bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +               if (!bh) {
> +                       EXT4_ERROR_INODE(inode, "block %llu read error",
>                                          EXT4_I(inode)->i_file_acl);
> +                       error = -EIO;
> +                       goto cleanup;
> +               }
> +               error = ext4_xattr_check_block(inode, bh);
> +               if (error) {
> +                       EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
> +                                        EXT4_I(inode)->i_file_acl, error);
>                         goto cleanup;
>                 }
> -               ext4_xattr_inode_remove_all(handle, inode, bh,
> -                                           BFIRST(bh),
> -                                           true /* block_csum */,
> -                                           ea_inode_array,
> -                                           extra_credits);
> -       }
>
> -       ext4_xattr_release_block(handle, inode, bh);
> -       /* Update i_file_acl within the same transaction that releases block. */
> -       EXT4_I(inode)->i_file_acl = 0;
> -       error = ext4_mark_inode_dirty(handle, inode);
> -       if (error) {
> -               EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> -                                error);
> -               goto cleanup;
> +               if (ext4_has_feature_ea_inode(inode->i_sb)) {
> +                       for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +                            entry = EXT4_XATTR_NEXT(entry))
> +                               if (entry->e_value_inum)
> +                                       ext4_xattr_inode_free_quota(inode,
> +                                             le32_to_cpu(entry->e_value_size));
> +
> +               }
> +
> +               ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
> +                                        extra_credits);
> +               /*
> +                * Update i_file_acl value in the same transaction that releases
> +                * block.
> +                */
> +               EXT4_I(inode)->i_file_acl = 0;
> +               error = ext4_mark_inode_dirty(handle, inode);
> +               if (error) {
> +                       EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
> +                                        error);
> +                       goto cleanup;
> +               }
>         }
> +       error = 0;
>  cleanup:
>         brelse(iloc.bh);
>         brelse(bh);
> @@ -2209,17 +2731,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>
>  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>  {
> -       struct inode    *ea_inode;
> -       int             idx = 0;
> +       int idx;
>
>         if (ea_inode_array == NULL)
>                 return;
>
> -       for (; idx < ea_inode_array->count; ++idx) {
> -               ea_inode = ea_inode_array->inodes[idx];
> -               clear_nlink(ea_inode);
> -               iput(ea_inode);
> -       }
> +       for (idx = 0; idx < ea_inode_array->count; ++idx)
> +               iput(ea_inode_array->inodes[idx]);
>         kfree(ea_inode_array);
>  }
>
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index b2005a2716d9..67616cb9a059 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -69,19 +69,6 @@ struct ext4_xattr_entry {
>                 EXT4_I(inode)->i_extra_isize))
>  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>
> -/*
> - * Link EA inode back to parent one using i_mtime field.
> - * Extra integer type conversion added to ignore higher
> - * bits in i_mtime.tv_sec which might be set by ext4_get()
> - */
> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
> -do {                                                  \
> -      (inode)->i_mtime.tv_sec = inum;                 \
> -} while(0)
> -
> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
> -((__u32)(inode)->i_mtime.tv_sec)
> -
>  /*
>   * The minimum size of EA value when you start storing it in an external inode
>   * size of block - size of header - size of 1 entry - 4 null bytes
> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
> +                                 int *credits);
>
> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>                                    struct ext4_xattr_inode_array **array,
>                                    int extra_credits);
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 45a8d52dc991..d818fd236787 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -13,10 +13,11 @@
>   * mb_cache_entry_delete()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> - * They use hash of a block contents as a key and block number as a value.
> - * That's why keys need not be unique (different xattr blocks may end up having
> - * the same hash). However block number always uniquely identifies a cache
> - * entry.
> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
> + * They use hash of data as a key and provide a value that may represent a
> + * block or inode number. That's why keys need not be unique (hash of different
> + * data may be the same). However user provided value always uniquely
> + * identifies a cache entry.
>   *
>   * We provide functions for creation and removal of entries, search by key,
>   * and a special "delete entry with given key-value pair" operation. Fixed
> --
> 2.13.1.518.g3df882009-goog
>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions
  2017-06-19 12:36             ` [Ocfs2-devel] " Jan Kara
  (?)
  (?)
@ 2017-06-20  9:53             ` Tahsin Erdogan
  -1 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20  9:53 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Andreas Dilger, Dave Kleikamp,
	Alexander Viro, Mark Fasheh, Joel Becker, Jens Axboe,
	Deepa Dinamani, Mike Christie, Fabian Frederick, linux-ext4,
	linux-kernel, jfs-discussion, linux-fsdevel, ocfs2-devel,
	reiserfs-devel

On Mon, Jun 19, 2017 at 5:36 AM, Jan Kara <jack@suse.cz> wrote:
> Heh, this "pushing of responsibility" looks like a silly game. If an error
> can happen in a function, it is better to report it as far as easily
> possible (unless we can cleanly handle it which we cannot here). I'm guilty
> of making dquot_free_inode() ignore errors from mark_all_dquot_dirty() and
> in retrospect it would have been better if these were propagated to the
> caller as well. And eventually we can fix this if we decide we care enough.
> I'm completely fine with just returning an error from dquot_free_inode()
> and ignore it in all the callers except for ext4. Then filesystems which
> care enough can try to handle the error. That way we at least don't
> increase the design debt from the past.

I sent an update, but since the patch title changed I think it landed
in a new email thread ("[PATCH v2 28/31] quota: add get_inode_usage
callback to transfer multi-inode charges"). I will respond to your
comment in that thread.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 28/31] quota: add get_inode_usage callback to transfer multi-inode charges
  2017-06-20  9:12             ` [PATCH v2 28/31] quota: add get_inode_usage callback to transfer multi-inode charges Tahsin Erdogan
@ 2017-06-20 12:01               ` Tahsin Erdogan
  2017-06-20 15:28               ` Jan Kara
  1 sibling, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20 12:01 UTC (permalink / raw)
  To: Jan Kara, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

On Mon, Jun 19, 2017 at 5:36 AM, Jan Kara <jack@suse.cz> wrote:
> On Mon 19-06-17 04:46:00, Tahsin Erdogan wrote:
>> >> I tried that approach by adding a "int get_inode_usage(struct inode
>> >> *inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
>> >> ext4 code that calculates the number of internal inodes
>> >> (ext4_xattr_inode_count()) is subject to failures so the callback has
>> >> to be able to report errors. And, that itself is problematic because
>> >> we can't afford to have errors in dquot_free_inode(). If you have
>> >> thoughts about how to address this please let me know.
>> >
>> > Well, you can just make dquot_free_inode() return error. Now most callers
>> > won't be able to do much with an error from dquot_free_inode() but that's
>> > the case also for other things during inode deletion - just handle it as
>> > other fatal failures during inode freeing.
>> >
>> I just checked dquot_free_inode() to see whether it calls anything
>> that could fail. It calls mark_all_dquot_dirty() and ignores the
>> return code from it. I would like to follow the same for the
>> get_inode_usage() as the only use case for get_inode_usage() (ext4)
>> should not fail at inode free time.
>>
>> Basically, I want to avoid changing return type from void to int
>> because it would create a new responsibility for the filesystem
>> implementations who do not know how to deal with it.
>
> Heh, this "pushing of responsibility" looks like a silly game. If an error
> can happen in a function, it is better to report it as far as easily
> possible (unless we can cleanly handle it which we cannot here). I'm guilty
> of making dquot_free_inode() ignore errors from mark_all_dquot_dirty() and
> in retrospect it would have been better if these were propagated to the
> caller as well. And eventually we can fix this if we decide we care enough.
> I'm completely fine with just returning an error from dquot_free_inode()
> and ignore it in all the callers except for ext4. Then filesystems which
> care enough can try to handle the error. That way we at least don't
> increase the design debt from the past.

In the latest version, I have added a get_inode_usage() callback to be
used in __dquot_transfer(). However, I didn't use it in
dquot_alloc_inode() and dquot_free_inode(), so I owe you an
explanation.

Before ea_inode feature, ext4 used the following simple inode quota operations:

- call dquot_alloc_inode() when a new inode is created
- call dquot_free_inode() when inode is deleted
- call dquot_transfer() when ownership of an inode changed

With ea_inode feature, setting a large extended attribute on an inode
may store EA value in an external inode. These extended attribute
inodes (called xattr inode from now on) are shareable. So, there can
be more than one inode that references them. From quota tracking
perspective, the xattr inodes are charged to all referencing users as
if no sharing occurs. Xattr inodes themselves have quota flag disabled
on them. With this design, the quota operations that we need are:

1- call dquot_alloc_inode() when a non-xattr inode is created
2- increment inode charge on parent inode when a reference on an xattr
inode is taken
3- decrement inode charge on parent when reference is dropped
4- call dquot_free_inode() when non-xattr inode is deleted
5- call dquot_transfer() when "parent" ownership changes. Transfer has
to carry additional references due to xattr inodes

Latest version of patch collapses "increment inode" operation into
dquot_alloc_inode() so it calls dquot_alloc_inode() for both 1) and 2)
above. Similarly it calls dquot_free_inode() for both 3) and 4). We
could invent new operations to make the function names more explicit
but as is it achieves the intended behavior.

With the design above, calling get_inode_usage() from
dquot_alloc_inode() is not very useful because it will always get back
1. And more importantly, it will prohibit dquot_alloc_inode() from
being used for the inode increment operation (use case 2 above).

Sorry for the long description, I hope this makes sense.


On Tue, Jun 20, 2017 at 2:12 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> Ext4 ea_inode feature allows storing xattr values in external inodes to
> be able to store values that are bigger than a block in size. Ext4 also
> has deduplication support for these type of inodes. With deduplication,
> the actual storage waste is eliminated but the users of such inodes are
> still charged full quota for the inodes as if there was no sharing
> happening in the background.
>
> This design requires ext4 to manually charge the users because the
> inodes are shared.
>
> An implication of this is that, if someone calls chown on a file that
> has such references we need to transfer the quota for the file and xattr
> inodes. Current dquot_transfer() function implicitly transfers one inode
> charge. With ea_inode feature, we would like to transfer multiple inode
> charges.
>
> Add get_inode_usage callback which can interrogate the total number of
> inodes that were charged for a given inode.
>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2:
>   - added get_inode_usage() callback to query total inodes charge to
>     be transferred
>
>  fs/ext4/inode.c       |  7 +++++++
>  fs/ext4/ioctl.c       |  6 ++++++
>  fs/ext4/super.c       | 21 ++++++++++----------
>  fs/ext4/xattr.c       | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/xattr.h       |  2 ++
>  fs/quota/dquot.c      | 16 +++++++++++----
>  include/linux/quota.h |  2 ++
>  7 files changed, 94 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ea95bd9eab81..cd22de0b5d2c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5295,7 +5295,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>                         error = PTR_ERR(handle);
>                         goto err_out;
>                 }
> +
> +               /* dquot_transfer() calls back ext4_get_inode_usage() which
> +                * counts xattr inode references.
> +                */
> +               down_read(&EXT4_I(inode)->xattr_sem);
>                 error = dquot_transfer(inode, attr);
> +               up_read(&EXT4_I(inode)->xattr_sem);
> +
>                 if (error) {
>                         ext4_journal_stop(handle);
>                         return error;
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index dde8deb11e59..42b3a73143cf 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>
>         transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
>         if (!IS_ERR(transfer_to[PRJQUOTA])) {
> +
> +               /* __dquot_transfer() calls back ext4_get_inode_usage() which
> +                * counts xattr inode references.
> +                */
> +               down_read(&EXT4_I(inode)->xattr_sem);
>                 err = __dquot_transfer(inode, transfer_to);
> +               up_read(&EXT4_I(inode)->xattr_sem);
>                 dqput(transfer_to[PRJQUOTA]);
>                 if (err)
>                         goto out_dirty;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 2bfacd737bb6..4b15bf674d45 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1263,16 +1263,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode)
>  }
>
>  static const struct dquot_operations ext4_quota_operations = {
> -       .get_reserved_space = ext4_get_reserved_space,
> -       .write_dquot    = ext4_write_dquot,
> -       .acquire_dquot  = ext4_acquire_dquot,
> -       .release_dquot  = ext4_release_dquot,
> -       .mark_dirty     = ext4_mark_dquot_dirty,
> -       .write_info     = ext4_write_info,
> -       .alloc_dquot    = dquot_alloc,
> -       .destroy_dquot  = dquot_destroy,
> -       .get_projid     = ext4_get_projid,
> -       .get_next_id    = ext4_get_next_id,
> +       .get_reserved_space     = ext4_get_reserved_space,
> +       .write_dquot            = ext4_write_dquot,
> +       .acquire_dquot          = ext4_acquire_dquot,
> +       .release_dquot          = ext4_release_dquot,
> +       .mark_dirty             = ext4_mark_dquot_dirty,
> +       .write_info             = ext4_write_info,
> +       .alloc_dquot            = dquot_alloc,
> +       .destroy_dquot          = dquot_destroy,
> +       .get_projid             = ext4_get_projid,
> +       .get_inode_usage        = ext4_get_inode_usage,
> +       .get_next_id            = ext4_get_next_id,
>  };
>
>  static const struct quotactl_ops ext4_qctl_operations = {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index d7e60358ec91..5e20f29afe9e 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -734,6 +734,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>         }
>  }
>
> +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
> +{
> +       struct ext4_iloc iloc = { .bh = NULL };
> +       struct buffer_head *bh = NULL;
> +       struct ext4_inode *raw_inode;
> +       struct ext4_xattr_ibody_header *header;
> +       struct ext4_xattr_entry *entry;
> +       qsize_t ea_inode_refs = 0;
> +       void *end;
> +       int ret;
> +
> +       lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
> +
> +       if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
> +               ret = ext4_get_inode_loc(inode, &iloc);
> +               if (ret)
> +                       goto out;
> +               raw_inode = ext4_raw_inode(&iloc);
> +               header = IHDR(inode, raw_inode);
> +               end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
> +               ret = xattr_check_inode(inode, header, end);
> +               if (ret)
> +                       goto out;
> +
> +               for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +                    entry = EXT4_XATTR_NEXT(entry))
> +                       if (entry->e_value_inum)
> +                               ea_inode_refs++;
> +       }
> +
> +       if (EXT4_I(inode)->i_file_acl) {
> +               bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +               if (!bh) {
> +                       ret = -EIO;
> +                       goto out;
> +               }
> +
> +               if (ext4_xattr_check_block(inode, bh)) {
> +                       ret = -EFSCORRUPTED;
> +                       goto out;
> +               }
> +
> +               for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +                    entry = EXT4_XATTR_NEXT(entry))
> +                       if (entry->e_value_inum)
> +                               ea_inode_refs++;
> +       }
> +       *usage = ea_inode_refs + 1;
> +out:
> +       brelse(iloc.bh);
> +       brelse(bh);
> +       return ret;
> +}
> +
>  static inline size_t round_up_cluster(struct inode *inode, size_t length)
>  {
>         struct super_block *sb = inode->i_sb;
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 67616cb9a059..26119a67c8c3 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
>  #else
>  static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
>  #endif
> +
> +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 48813aeaab80..53a17496c5c5 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -1910,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  {
>         qsize_t space, cur_space;
>         qsize_t rsv_space = 0;
> +       qsize_t inode_usage = 1;
>         struct dquot *transfer_from[MAXQUOTAS] = {};
>         int cnt, ret = 0;
>         char is_valid[MAXQUOTAS] = {};
> @@ -1919,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>
>         if (IS_NOQUOTA(inode))
>                 return 0;
> +
> +       if (inode->i_sb->dq_op->get_inode_usage) {
> +               ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
> +               if (ret)
> +                       return ret;
> +       }
> +
>         /* Initialize the arrays */
>         for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
>                 warn_to[cnt].w_type = QUOTA_NL_NOWARN;
> @@ -1946,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>                         continue;
>                 is_valid[cnt] = 1;
>                 transfer_from[cnt] = i_dquot(inode)[cnt];
> -               ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
> +               ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
>                 if (ret)
>                         goto over_quota;
>                 ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
> @@ -1963,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>                 /* Due to IO error we might not have transfer_from[] structure */
>                 if (transfer_from[cnt]) {
>                         int wtype;
> -                       wtype = info_idq_free(transfer_from[cnt], 1);
> +                       wtype = info_idq_free(transfer_from[cnt], inode_usage);
>                         if (wtype != QUOTA_NL_NOWARN)
>                                 prepare_warning(&warn_from_inodes[cnt],
>                                                 transfer_from[cnt], wtype);
> @@ -1971,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>                         if (wtype != QUOTA_NL_NOWARN)
>                                 prepare_warning(&warn_from_space[cnt],
>                                                 transfer_from[cnt], wtype);
> -                       dquot_decr_inodes(transfer_from[cnt], 1);
> +                       dquot_decr_inodes(transfer_from[cnt], inode_usage);
>                         dquot_decr_space(transfer_from[cnt], cur_space);
>                         dquot_free_reserved_space(transfer_from[cnt],
>                                                   rsv_space);
>                 }
>
> -               dquot_incr_inodes(transfer_to[cnt], 1);
> +               dquot_incr_inodes(transfer_to[cnt], inode_usage);
>                 dquot_incr_space(transfer_to[cnt], cur_space);
>                 dquot_resv_space(transfer_to[cnt], rsv_space);
>
> diff --git a/include/linux/quota.h b/include/linux/quota.h
> index 3434eef2a5aa..bfd077ca6ac3 100644
> --- a/include/linux/quota.h
> +++ b/include/linux/quota.h
> @@ -332,6 +332,8 @@ struct dquot_operations {
>          * quota code only */
>         qsize_t *(*get_reserved_space) (struct inode *);
>         int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */
> +       /* Get number of inodes that were charged for a given inode */
> +       int (*get_inode_usage) (struct inode *, qsize_t *);
>         /* Get next ID with active quota structure */
>         int (*get_next_id) (struct super_block *sb, struct kqid *qid);
>  };
> --
> 2.13.1.518.g3df882009-goog
>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 28/31] quota: add get_inode_usage callback to transfer multi-inode charges
  2017-06-20  9:12             ` [PATCH v2 28/31] quota: add get_inode_usage callback to transfer multi-inode charges Tahsin Erdogan
  2017-06-20 12:01               ` Tahsin Erdogan
@ 2017-06-20 15:28               ` Jan Kara
  2017-06-20 18:08                 ` [PATCH v3 " Tahsin Erdogan
  1 sibling, 1 reply; 100+ messages in thread
From: Jan Kara @ 2017-06-20 15:28 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jan Kara, linux-ext4, linux-kernel

On Tue 20-06-17 02:12:10, Tahsin Erdogan wrote:
> Ext4 ea_inode feature allows storing xattr values in external inodes to
> be able to store values that are bigger than a block in size. Ext4 also
> has deduplication support for these type of inodes. With deduplication,
> the actual storage waste is eliminated but the users of such inodes are
> still charged full quota for the inodes as if there was no sharing
> happening in the background.
> 
> This design requires ext4 to manually charge the users because the
> inodes are shared.
> 
> An implication of this is that, if someone calls chown on a file that
> has such references we need to transfer the quota for the file and xattr
> inodes. Current dquot_transfer() function implicitly transfers one inode
> charge. With ea_inode feature, we would like to transfer multiple inode
> charges.
> 
> Add get_inode_usage callback which can interrogate the total number of
> inodes that were charged for a given inode.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>

The patch looks good to me. Feel free to add:

Acked-by: Jan Kara <jack@suse.cz>

								Honza

> ---
> v2:
>   - added get_inode_usage() callback to query total inodes charge to
>     be transferred
> 
>  fs/ext4/inode.c       |  7 +++++++
>  fs/ext4/ioctl.c       |  6 ++++++
>  fs/ext4/super.c       | 21 ++++++++++----------
>  fs/ext4/xattr.c       | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/xattr.h       |  2 ++
>  fs/quota/dquot.c      | 16 +++++++++++----
>  include/linux/quota.h |  2 ++
>  7 files changed, 94 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ea95bd9eab81..cd22de0b5d2c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5295,7 +5295,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  			error = PTR_ERR(handle);
>  			goto err_out;
>  		}
> +
> +		/* dquot_transfer() calls back ext4_get_inode_usage() which
> +		 * counts xattr inode references.
> +		 */
> +		down_read(&EXT4_I(inode)->xattr_sem);
>  		error = dquot_transfer(inode, attr);
> +		up_read(&EXT4_I(inode)->xattr_sem);
> +
>  		if (error) {
>  			ext4_journal_stop(handle);
>  			return error;
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index dde8deb11e59..42b3a73143cf 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
>  
>  	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
>  	if (!IS_ERR(transfer_to[PRJQUOTA])) {
> +
> +		/* __dquot_transfer() calls back ext4_get_inode_usage() which
> +		 * counts xattr inode references.
> +		 */
> +		down_read(&EXT4_I(inode)->xattr_sem);
>  		err = __dquot_transfer(inode, transfer_to);
> +		up_read(&EXT4_I(inode)->xattr_sem);
>  		dqput(transfer_to[PRJQUOTA]);
>  		if (err)
>  			goto out_dirty;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 2bfacd737bb6..4b15bf674d45 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1263,16 +1263,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode)
>  }
>  
>  static const struct dquot_operations ext4_quota_operations = {
> -	.get_reserved_space = ext4_get_reserved_space,
> -	.write_dquot	= ext4_write_dquot,
> -	.acquire_dquot	= ext4_acquire_dquot,
> -	.release_dquot	= ext4_release_dquot,
> -	.mark_dirty	= ext4_mark_dquot_dirty,
> -	.write_info	= ext4_write_info,
> -	.alloc_dquot	= dquot_alloc,
> -	.destroy_dquot	= dquot_destroy,
> -	.get_projid	= ext4_get_projid,
> -	.get_next_id	= ext4_get_next_id,
> +	.get_reserved_space	= ext4_get_reserved_space,
> +	.write_dquot		= ext4_write_dquot,
> +	.acquire_dquot		= ext4_acquire_dquot,
> +	.release_dquot		= ext4_release_dquot,
> +	.mark_dirty		= ext4_mark_dquot_dirty,
> +	.write_info		= ext4_write_info,
> +	.alloc_dquot		= dquot_alloc,
> +	.destroy_dquot		= dquot_destroy,
> +	.get_projid		= ext4_get_projid,
> +	.get_inode_usage	= ext4_get_inode_usage,
> +	.get_next_id		= ext4_get_next_id,
>  };
>  
>  static const struct quotactl_ops ext4_qctl_operations = {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index d7e60358ec91..5e20f29afe9e 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -734,6 +734,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>  	}
>  }
>  
> +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
> +{
> +	struct ext4_iloc iloc = { .bh = NULL };
> +	struct buffer_head *bh = NULL;
> +	struct ext4_inode *raw_inode;
> +	struct ext4_xattr_ibody_header *header;
> +	struct ext4_xattr_entry *entry;
> +	qsize_t ea_inode_refs = 0;
> +	void *end;
> +	int ret;
> +
> +	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
> +
> +	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
> +		ret = ext4_get_inode_loc(inode, &iloc);
> +		if (ret)
> +			goto out;
> +		raw_inode = ext4_raw_inode(&iloc);
> +		header = IHDR(inode, raw_inode);
> +		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
> +		ret = xattr_check_inode(inode, header, end);
> +		if (ret)
> +			goto out;
> +
> +		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
> +		     entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				ea_inode_refs++;
> +	}
> +
> +	if (EXT4_I(inode)->i_file_acl) {
> +		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> +		if (!bh) {
> +			ret = -EIO;
> +			goto out;
> +		}
> +
> +		if (ext4_xattr_check_block(inode, bh)) {
> +			ret = -EFSCORRUPTED;
> +			goto out;
> +		}
> +
> +		for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
> +		     entry = EXT4_XATTR_NEXT(entry))
> +			if (entry->e_value_inum)
> +				ea_inode_refs++;
> +	}
> +	*usage = ea_inode_refs + 1;
> +out:
> +	brelse(iloc.bh);
> +	brelse(bh);
> +	return ret;
> +}
> +
>  static inline size_t round_up_cluster(struct inode *inode, size_t length)
>  {
>  	struct super_block *sb = inode->i_sb;
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 67616cb9a059..26119a67c8c3 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
>  #else
>  static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
>  #endif
> +
> +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 48813aeaab80..53a17496c5c5 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -1910,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  {
>  	qsize_t space, cur_space;
>  	qsize_t rsv_space = 0;
> +	qsize_t inode_usage = 1;
>  	struct dquot *transfer_from[MAXQUOTAS] = {};
>  	int cnt, ret = 0;
>  	char is_valid[MAXQUOTAS] = {};
> @@ -1919,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  
>  	if (IS_NOQUOTA(inode))
>  		return 0;
> +
> +	if (inode->i_sb->dq_op->get_inode_usage) {
> +		ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	/* Initialize the arrays */
>  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
>  		warn_to[cnt].w_type = QUOTA_NL_NOWARN;
> @@ -1946,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  			continue;
>  		is_valid[cnt] = 1;
>  		transfer_from[cnt] = i_dquot(inode)[cnt];
> -		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
> +		ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
>  		if (ret)
>  			goto over_quota;
>  		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
> @@ -1963,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  		/* Due to IO error we might not have transfer_from[] structure */
>  		if (transfer_from[cnt]) {
>  			int wtype;
> -			wtype = info_idq_free(transfer_from[cnt], 1);
> +			wtype = info_idq_free(transfer_from[cnt], inode_usage);
>  			if (wtype != QUOTA_NL_NOWARN)
>  				prepare_warning(&warn_from_inodes[cnt],
>  						transfer_from[cnt], wtype);
> @@ -1971,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
>  			if (wtype != QUOTA_NL_NOWARN)
>  				prepare_warning(&warn_from_space[cnt],
>  						transfer_from[cnt], wtype);
> -			dquot_decr_inodes(transfer_from[cnt], 1);
> +			dquot_decr_inodes(transfer_from[cnt], inode_usage);
>  			dquot_decr_space(transfer_from[cnt], cur_space);
>  			dquot_free_reserved_space(transfer_from[cnt],
>  						  rsv_space);
>  		}
>  
> -		dquot_incr_inodes(transfer_to[cnt], 1);
> +		dquot_incr_inodes(transfer_to[cnt], inode_usage);
>  		dquot_incr_space(transfer_to[cnt], cur_space);
>  		dquot_resv_space(transfer_to[cnt], rsv_space);
>  
> diff --git a/include/linux/quota.h b/include/linux/quota.h
> index 3434eef2a5aa..bfd077ca6ac3 100644
> --- a/include/linux/quota.h
> +++ b/include/linux/quota.h
> @@ -332,6 +332,8 @@ struct dquot_operations {
>  	 * quota code only */
>  	qsize_t *(*get_reserved_space) (struct inode *);
>  	int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */
> +	/* Get number of inodes that were charged for a given inode */
> +	int (*get_inode_usage) (struct inode *, qsize_t *);
>  	/* Get next ID with active quota structure */
>  	int (*get_next_id) (struct super_block *sb, struct kqid *qid);
>  };
> -- 
> 2.13.1.518.g3df882009-goog
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH v3 28/31] quota: add get_inode_usage callback to transfer multi-inode charges
  2017-06-20 15:28               ` Jan Kara
@ 2017-06-20 18:08                 ` Tahsin Erdogan
  2017-06-21  4:48                   ` Theodore Ts'o
  0 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-20 18:08 UTC (permalink / raw)
  To: Jan Kara, linux-ext4; +Cc: linux-kernel, Tahsin Erdogan

Ext4 ea_inode feature allows storing xattr values in external inodes to
be able to store values that are bigger than a block in size. Ext4 also
has deduplication support for these types of inodes. With deduplication,
the actual storage waste is eliminated but the users of such inodes are
still charged full quota for the inodes as if there was no sharing
happening in the background.

This design requires ext4 to manually charge the users because the
inodes are shared.

An implication of this is that, if someone calls chown on a file that
has such references we need to transfer the quota for the file and xattr
inodes. Current dquot_transfer() function implicitly transfers one inode
charge. With ea_inode feature, we would like to transfer multiple inode
charges.

Add get_inode_usage callback which can interrogate the total number of
inodes that were charged for a given inode.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Acked-by: Jan Kara <jack@suse.cz>
---
v3: added Acked-by

v2:
  - added get_inode_usage() callback to query total inodes charge to
    be transferred

 fs/ext4/inode.c       |  7 +++++++
 fs/ext4/ioctl.c       |  6 ++++++
 fs/ext4/super.c       | 21 ++++++++++----------
 fs/ext4/xattr.c       | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/xattr.h       |  2 ++
 fs/quota/dquot.c      | 16 +++++++++++----
 include/linux/quota.h |  2 ++
 7 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ea95bd9eab81..cd22de0b5d2c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5295,7 +5295,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
+
+		/* dquot_transfer() calls back ext4_get_inode_usage() which
+		 * counts xattr inode references.
+		 */
+		down_read(&EXT4_I(inode)->xattr_sem);
 		error = dquot_transfer(inode, attr);
+		up_read(&EXT4_I(inode)->xattr_sem);
+
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dde8deb11e59..42b3a73143cf 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 
 	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
 	if (!IS_ERR(transfer_to[PRJQUOTA])) {
+
+		/* __dquot_transfer() calls back ext4_get_inode_usage() which
+		 * counts xattr inode references.
+		 */
+		down_read(&EXT4_I(inode)->xattr_sem);
 		err = __dquot_transfer(inode, transfer_to);
+		up_read(&EXT4_I(inode)->xattr_sem);
 		dqput(transfer_to[PRJQUOTA]);
 		if (err)
 			goto out_dirty;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2bfacd737bb6..4b15bf674d45 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1263,16 +1263,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode)
 }
 
 static const struct dquot_operations ext4_quota_operations = {
-	.get_reserved_space = ext4_get_reserved_space,
-	.write_dquot	= ext4_write_dquot,
-	.acquire_dquot	= ext4_acquire_dquot,
-	.release_dquot	= ext4_release_dquot,
-	.mark_dirty	= ext4_mark_dquot_dirty,
-	.write_info	= ext4_write_info,
-	.alloc_dquot	= dquot_alloc,
-	.destroy_dquot	= dquot_destroy,
-	.get_projid	= ext4_get_projid,
-	.get_next_id	= ext4_get_next_id,
+	.get_reserved_space	= ext4_get_reserved_space,
+	.write_dquot		= ext4_write_dquot,
+	.acquire_dquot		= ext4_acquire_dquot,
+	.release_dquot		= ext4_release_dquot,
+	.mark_dirty		= ext4_mark_dquot_dirty,
+	.write_info		= ext4_write_info,
+	.alloc_dquot		= dquot_alloc,
+	.destroy_dquot		= dquot_destroy,
+	.get_projid		= ext4_get_projid,
+	.get_inode_usage	= ext4_get_inode_usage,
+	.get_next_id		= ext4_get_next_id,
 };
 
 static const struct quotactl_ops ext4_qctl_operations = {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d7e60358ec91..5e20f29afe9e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -734,6 +734,60 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	}
 }
 
+int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
+{
+	struct ext4_iloc iloc = { .bh = NULL };
+	struct buffer_head *bh = NULL;
+	struct ext4_inode *raw_inode;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	qsize_t ea_inode_refs = 0;
+	void *end;
+	int ret;
+
+	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+		ret = ext4_get_inode_loc(inode, &iloc);
+		if (ret)
+			goto out;
+		raw_inode = ext4_raw_inode(&iloc);
+		header = IHDR(inode, raw_inode);
+		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+		ret = xattr_check_inode(inode, header, end);
+		if (ret)
+			goto out;
+
+		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
+		     entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				ea_inode_refs++;
+	}
+
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		if (!bh) {
+			ret = -EIO;
+			goto out;
+		}
+
+		if (ext4_xattr_check_block(inode, bh)) {
+			ret = -EFSCORRUPTED;
+			goto out;
+		}
+
+		for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+		     entry = EXT4_XATTR_NEXT(entry))
+			if (entry->e_value_inum)
+				ea_inode_refs++;
+	}
+	*usage = ea_inode_refs + 1;
+out:
+	brelse(iloc.bh);
+	brelse(bh);
+	return ret;
+}
+
 static inline size_t round_up_cluster(struct inode *inode, size_t length)
 {
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 67616cb9a059..26119a67c8c3 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,3 +193,5 @@ extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
 #else
 static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
 #endif
+
+extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 48813aeaab80..53a17496c5c5 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1910,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 {
 	qsize_t space, cur_space;
 	qsize_t rsv_space = 0;
+	qsize_t inode_usage = 1;
 	struct dquot *transfer_from[MAXQUOTAS] = {};
 	int cnt, ret = 0;
 	char is_valid[MAXQUOTAS] = {};
@@ -1919,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 
 	if (IS_NOQUOTA(inode))
 		return 0;
+
+	if (inode->i_sb->dq_op->get_inode_usage) {
+		ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
+		if (ret)
+			return ret;
+	}
+
 	/* Initialize the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		warn_to[cnt].w_type = QUOTA_NL_NOWARN;
@@ -1946,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			continue;
 		is_valid[cnt] = 1;
 		transfer_from[cnt] = i_dquot(inode)[cnt];
-		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
+		ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
 		if (ret)
 			goto over_quota;
 		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
@@ -1963,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
 			int wtype;
-			wtype = info_idq_free(transfer_from[cnt], 1);
+			wtype = info_idq_free(transfer_from[cnt], inode_usage);
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_inodes[cnt],
 						transfer_from[cnt], wtype);
@@ -1971,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_space[cnt],
 						transfer_from[cnt], wtype);
-			dquot_decr_inodes(transfer_from[cnt], 1);
+			dquot_decr_inodes(transfer_from[cnt], inode_usage);
 			dquot_decr_space(transfer_from[cnt], cur_space);
 			dquot_free_reserved_space(transfer_from[cnt],
 						  rsv_space);
 		}
 
-		dquot_incr_inodes(transfer_to[cnt], 1);
+		dquot_incr_inodes(transfer_to[cnt], inode_usage);
 		dquot_incr_space(transfer_to[cnt], cur_space);
 		dquot_resv_space(transfer_to[cnt], rsv_space);
 
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 3434eef2a5aa..bfd077ca6ac3 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -332,6 +332,8 @@ struct dquot_operations {
 	 * quota code only */
 	qsize_t *(*get_reserved_space) (struct inode *);
 	int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */
+	/* Get number of inodes that were charged for a given inode */
+	int (*get_inode_usage) (struct inode *, qsize_t *);
 	/* Get next ID with active quota structure */
 	int (*get_next_id) (struct super_block *sb, struct kqid *qid);
 };
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH v3 28/31] quota: add get_inode_usage callback to transfer multi-inode charges
  2017-06-20 18:08                 ` [PATCH v3 " Tahsin Erdogan
@ 2017-06-21  4:48                   ` Theodore Ts'o
  2017-06-21 11:22                     ` Tahsin Erdogan
  0 siblings, 1 reply; 100+ messages in thread
From: Theodore Ts'o @ 2017-06-21  4:48 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jan Kara, linux-ext4, linux-kernel

Tahsin, when you think we've closed on the reviews, could you send out
a complete set of all of the patches on a new mail thread, using git
send-email so I can make sure I'm grabbing the final version of all of
the patches in this patch series?

It's great that you are using separate versions for each patch, since
it makes it easy to track changes in each patch, but when it comes
time for me to get a complete set of patches to apply, it does make
it harder to figure out that I need v5 of this patch, and v3 of that
patch....

Many thanks!

					- Ted

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v3 28/31] quota: add get_inode_usage callback to transfer multi-inode charges
  2017-06-21  4:48                   ` Theodore Ts'o
@ 2017-06-21 11:22                     ` Tahsin Erdogan
  0 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-21 11:22 UTC (permalink / raw)
  To: Theodore Ts'o, Tahsin Erdogan, Jan Kara, linux-ext4, linux-kernel

On Tue, Jun 20, 2017 at 9:48 PM, Theodore Ts'o <tytso@mit.edu> wrote:
> Tahsin, when you think we've closed on the reviews, could you send out
> a complete set of all of the patches on a new mail thread, using git
> send-email so I can make sure I'm grabbing the final version of all of
> the patches in this patch series?

Sure, I will.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-20  9:49                     ` Tahsin Erdogan
@ 2017-06-21 17:42                       ` Andreas Dilger
  0 siblings, 0 replies; 100+ messages in thread
From: Andreas Dilger @ 2017-06-21 17:42 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Darrick J . Wong, linux-ext4, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 79286 bytes --]

On Jun 20, 2017, at 3:49 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> Thanks Andreas for the feedback. Please see my responses below:
> 
>> It would be preferable to allow a mount option like "no_mbcache" to disable
>> the use of shared xattrs.  In the Lustre case at least, there will never be
>> shared large xattrs, and we've had a bunch of performance issues with mbcache
>> due to lock contention among many server threads doing concurrent lookups and
>> inserting many thousands of unique entries into the cache.
> 
> I have put nombcache mount option in a separate patch ("[PATCH 32/32]
> ext4: add nombcache mount option"). I have named it nombcache instead
> of no_mbcache to be consistent with other no* options. Let me know if
> you prefer no_mbcache as the option name.

Tahsin, we are already using the "no_mbcache" option name, so we would prefer
to keep that working.  It would be OK to accept both option names to mean
the same thing, and only document the "nombcache" option.

>> This should follow the existing convention of always using s_csum_seed to seed
>> the checksum, and change ext4_fill_super() to initialize s_csum_seed to ~0 if
>> ext4_has_metadata_csum() is false, or always use the same value regardless of
>> whether ext4_has_metadata_csum() is set or not.
> 
> Done.
> 
>> If it is really necessary to have more than 2^32 references on a single shared
>> inode then it would be better to avoid the re-use of i_mtime, which breaks
>> the backref for unshared xattrs, and using i_size isn't enough of a guarantee
>> that this is the correct parent inode in case of on-disk corruption.
> 
> I have now moved the lower 32bits of ref count from i_mtime to l_i_version.
> 
>> Should this be contingent on ext4_has_metadata_csum() feature being enabled, or
>> alternately check if EXT4_XATTR_INODE_GET_PARENT() and i_generation match before
>> returning an error.  This will allow a smooth transition from existing filesystems
>> that do not store the hash, but have only a single-use xattr inode with a parent
>> backref.
> 
> I updated hash validation to fallback to parent backref check for
> backward compatibility.

Great, thanks.

>>> +     /* Indirection block. */
>>> +     blocks += 1;
>> 
>> Strictly speaking, this is only needed "if (blocks > EXT4_NDIR_BLOCKS)".
> 
> Ack. I didn't think it was worth going through exact calculation in
> this case, let me know if you see value in doing that.
> I also updated the comment to mention extents.

This one credit isn't a huge deal.  In general, the number of credits reserved
can affect performance if they get too large, especially if there are a large
number of threads active, since this will cause premature journal flushing and
reduce the number of shared blocks written in the same transaction.

>>> +     /* We may need to clone the existing xattr block in which case we need
>>> +      * to increment ref counts for existing ea_inodes referenced by it.
>>> +      */
>> 
>> Just to clarify here, in the case of cloning an existing xattr block, are the
>> refcounts being _incremented_ or _decremented_ on the existing ea_inodes?  I'm
>> trying to figure out if we really need to have credits for both old and new
>> xattr inodes, as well as these additional credits.  Since this is reserving
>> about 110 blocks for every setxattr, this can add significant pressure on the
>> journal if there are lots of threads creating files and/or setting xattrs.
> 
> Cloning causes incrementing xattr inode references.
> 
>>> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
>>> +                         size_t value_len, u32 hash)
>>> {
>>> +     struct inode *ea_inode;
>>> +     struct mb_cache_entry *ce;
>>> +     struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>>> +     void *ea_data = NULL;
>>>      int err;
>> 
>> This function should just return NULL if ea_inode_cache is NULL (e.g. in
>> the case of "no_mbcache" mount option).
> 
> Done in later patch ("[PATCH 32/32] ext4: add nombcache mount option")
> 
>> Should skip mb_cache if EA_INODE_CACHE(inode) is NULL, or have a wrapper
>> like ext4_xattr_inode_cache_insert() to match ext4_xattr_inode_cache_find()
>> that does the same.
> 
> Added skip in patch ("[PATCH 32/32] ext4: add nombcache mount option")

Thanks, I'm just going to go through those patches.

Cheers, Andreas

> On Tue, Jun 20, 2017 at 2:07 AM, Tahsin Erdogan <tahsin@google.com> wrote:
>> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
>> Large xattr values are stored in external inodes each one holding a
>> single value. Once written the data blocks of these inodes are immutable.
>> 
>> The real world use cases are expected to have a lot of value duplication
>> such as inherited acls etc. To reduce data duplication on disk, this patch
>> implements a deduplicator that allows sharing of xattr inodes.
>> 
>> The deduplication is based on an in-memory hash lookup that is a best
>> effort sharing scheme. When a xattr inode is read from disk (i.e.
>> getxattr() call), its crc32c hash is added to a hash table. Before
>> creating a new xattr inode for a value being set, the hash table is
>> checked to see if an existing inode holds an identical value. If such an
>> inode is found, the ref count on that inode is incremented. On value
>> removal the ref count is decremented and if it reaches zero the inode is
>> deleted.
>> 
>> The quota charging for such inodes is manually managed. Every reference
>> holder is charged the full size as if there was no sharing happening.
>> This is consistent with how xattr blocks are also charged.
>> 
>> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
>> ---
>> v5:
>> - made ext4_meta_trans_blocks() static again since there are no
>>   remaining users outside of inode.c
>> - initialize sbi->s_csum_seed when ea_inode feature is enabled
>> - use l_i_version to hold lower 32 bits of the xattr ref count.
>>   This avoids clashes with old implementations which use i_mtime.
>>   Since l_i_version is not available in HURD_COMPAT mode, fail mount
>>   request when both ea_inode feature and HURD_COMPAT are set.
>> - when hash validation fails, fall back to old implementation
>>   which has a backref to parent.
>> - fixed checkpatch.pl warning about using unsigned alone
>> 
>> v4:
>> - eliminated xattr entry in the xattr inode to avoid complexity and
>>   recursion in xattr update path. Now the ref count and hash are stored
>>   in i_[c/m/a]time.tv_sec fields.
>> - some clean up in ext4_xattr_set_entry() to reduce code duplication and
>>   complexity
>> 
>> v3:
>> - use s_csum_seed for hash calculations when available
>> - return error on stored vs calculated hash mismatch
>> 
>> v2:
>> - make dependency on crc32c dynamic
>> - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
>>   they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
>> 
>> fs/ext4/acl.c   |    5 +-
>> fs/ext4/ext4.h  |   23 +-
>> fs/ext4/inode.c |   13 +-
>> fs/ext4/super.c |   37 +-
>> fs/ext4/xattr.c | 1038 +++++++++++++++++++++++++++++++++++++++++--------------
>> fs/ext4/xattr.h |   17 +-
>> fs/mbcache.c    |    9 +-
>> 7 files changed, 848 insertions(+), 294 deletions(-)
>> 
>> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
>> index 74f7ac539e00..8db03e5c78bc 100644
>> --- a/fs/ext4/acl.c
>> +++ b/fs/ext4/acl.c
>> @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>        if (error)
>>                return error;
>> retry:
>> -       credits = ext4_xattr_set_credits(inode, acl_size);
>> +       error = ext4_xattr_set_credits(inode, acl_size, &credits);
>> +       if (error)
>> +               return error;
>> +
>>        handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>>        if (IS_ERR(handle))
>>                return PTR_ERR(handle);
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index d79d8d7bee88..59e9488c4876 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
>>        long s_es_nr_inode;
>>        struct ext4_es_stats s_es_stats;
>>        struct mb_cache *s_mb_cache;
>> +       struct mb_cache *s_ea_inode_cache;
>>        spinlock_t s_es_lock ____cacheline_aligned_in_smp;
>> 
>>        /* Ratelimit ext4 messages. */
>> @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
>>        return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
>> }
>> 
>> -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
>> +static inline bool ext4_is_quota_file(struct inode *inode)
>> +{
>> +       return IS_NOQUOTA(inode) &&
>> +              !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
>> +}
>> 
>> /*
>>  * This structure is stuffed into the struct file's private_data field
>> @@ -2482,7 +2487,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
>> extern void ext4_set_inode_flags(struct inode *);
>> extern int ext4_alloc_da_blocks(struct inode *inode);
>> extern void ext4_set_aops(struct inode *inode);
>> -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
>> extern int ext4_writepage_trans_blocks(struct inode *);
>> extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
>> extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
>> @@ -2709,19 +2713,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
>> extern int ext4_register_li_request(struct super_block *sb,
>>                                    ext4_group_t first_not_zeroed);
>> 
>> -static inline int ext4_has_group_desc_csum(struct super_block *sb)
>> -{
>> -       return ext4_has_feature_gdt_csum(sb) ||
>> -              EXT4_SB(sb)->s_chksum_driver != NULL;
>> -}
>> -
>> static inline int ext4_has_metadata_csum(struct super_block *sb)
>> {
>>        WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
>>                     !EXT4_SB(sb)->s_chksum_driver);
>> 
>> -       return (EXT4_SB(sb)->s_chksum_driver != NULL);
>> +       return ext4_has_feature_metadata_csum(sb) &&
>> +              (EXT4_SB(sb)->s_chksum_driver != NULL);
>> }
>> +
>> +static inline int ext4_has_group_desc_csum(struct super_block *sb)
>> +{
>> +       return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
>> +}
>> +
>> static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>> {
>>        return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index cd007f9757d1..ea95bd9eab81 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
>>                                unsigned int length);
>> static int __ext4_journalled_writepage(struct page *page, unsigned int len);
>> static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
>> +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>> +                                 int pextents);
>> 
>> /*
>>  * Test whether an inode is a fast symlink.
>> @@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>        }
>>        brelse(iloc.bh);
>>        ext4_set_inode_flags(inode);
>> -       if (ei->i_flags & EXT4_EA_INODE_FL)
>> +
>> +       if (ei->i_flags & EXT4_EA_INODE_FL) {
>>                ext4_xattr_inode_set_class(inode);
>> +
>> +               inode_lock(inode);
>> +               inode->i_flags |= S_NOQUOTA;
>> +               inode_unlock(inode);
>> +       }
>> +
>>        unlock_new_inode(inode);
>>        return inode;
>> 
>> @@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
>>  *
>>  * Also account for superblock, inode, quota and xattr blocks
>>  */
>> -int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>> +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>>                                  int pextents)
>> {
>>        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index b02a23ec92ca..2bfacd737bb6 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
>>                invalidate_bdev(sbi->journal_bdev);
>>                ext4_blkdev_remove(sbi);
>>        }
>> +       if (sbi->s_ea_inode_cache) {
>> +               ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
>> +               sbi->s_ea_inode_cache = NULL;
>> +       }
>>        if (sbi->s_mb_cache) {
>>                ext4_xattr_destroy_cache(sbi->s_mb_cache);
>>                sbi->s_mb_cache = NULL;
>> @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
>>        if (res)
>>                return res;
>> retry:
>> -       credits = ext4_xattr_set_credits(inode, len);
>> +       res = ext4_xattr_set_credits(inode, len, &credits);
>> +       if (res)
>> +               return res;
>> +
>>        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
>>        if (IS_ERR(handle))
>>                return PTR_ERR(handle);
>> @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>        }
>> 
>>        /* Load the checksum driver */
>> -       if (ext4_has_feature_metadata_csum(sb)) {
>> +       if (ext4_has_feature_metadata_csum(sb) ||
>> +           ext4_has_feature_ea_inode(sb)) {
>>                sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
>>                if (IS_ERR(sbi->s_chksum_driver)) {
>>                        ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
>> @@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>        /* Precompute checksum seed for all metadata */
>>        if (ext4_has_feature_csum_seed(sb))
>>                sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
>> -       else if (ext4_has_metadata_csum(sb))
>> +       else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
>>                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
>>                                               sizeof(es->s_uuid));
>> 
>> @@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>                                 "The Hurd can't support 64-bit file systems");
>>                        goto failed_mount;
>>                }
>> +
>> +               /*
>> +                * ea_inode feature uses l_i_version field which is not
>> +                * available in HURD_COMPAT mode.
>> +                */
>> +               if (ext4_has_feature_ea_inode(sb)) {
>> +                       ext4_msg(sb, KERN_ERR,
>> +                                "ea_inode feature is not supported for Hurd");
>> +                       goto failed_mount;
>> +               }
>>        }
>> 
>>        if (IS_EXT2_SB(sb)) {
>> @@ -4067,6 +4085,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>                goto failed_mount_wq;
>>        }
>> 
>> +       if (ext4_has_feature_ea_inode(sb)) {
>> +               sbi->s_ea_inode_cache = ext4_xattr_create_cache();
>> +               if (!sbi->s_ea_inode_cache) {
>> +                       ext4_msg(sb, KERN_ERR,
>> +                                "Failed to create an s_ea_inode_cache");
>> +                       goto failed_mount_wq;
>> +               }
>> +       }
>> +
>>        if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
>>            (blocksize != PAGE_SIZE)) {
>>                ext4_msg(sb, KERN_ERR,
>> @@ -4296,6 +4323,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>        if (EXT4_SB(sb)->rsv_conversion_wq)
>>                destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
>> failed_mount_wq:
>> +       if (sbi->s_ea_inode_cache) {
>> +               ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
>> +               sbi->s_ea_inode_cache = NULL;
>> +       }
>>        if (sbi->s_mb_cache) {
>>                ext4_xattr_destroy_cache(sbi->s_mb_cache);
>>                sbi->s_mb_cache = NULL;
>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>> index 0484df8dadd1..d7e60358ec91 100644
>> --- a/fs/ext4/xattr.c
>> +++ b/fs/ext4/xattr.c
>> @@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
>> #define EXT4_GET_MB_CACHE(inode)       (((struct ext4_sb_info *) \
>>                                inode->i_sb->s_fs_info)->s_mb_cache)
>> 
>> +#define EA_INODE_CACHE(inode)  (((struct ext4_sb_info *) \
>> +                               inode->i_sb->s_fs_info)->s_ea_inode_cache)
>> +
>> static int
>> ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>>                        struct inode *inode);
>> @@ -280,15 +283,44 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
>>        return cmp ? -ENODATA : 0;
>> }
>> 
>> +static u32
>> +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
>> +{
>> +       return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
>> +}
>> +
>> +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
>> +{
>> +       return ((u64)ea_inode->i_ctime.tv_sec << 32) |
>> +              ((u32)ea_inode->i_version);
>> +}
>> +
>> +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
>> +{
>> +       ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
>> +       ea_inode->i_version = (u32)ref_count;
>> +}
>> +
>> +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
>> +{
>> +       return (u32)ea_inode->i_atime.tv_sec;
>> +}
>> +
>> +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
>> +{
>> +       ea_inode->i_atime.tv_sec = hash;
>> +}
>> +
>> /*
>>  * Read the EA value from an inode.
>>  */
>> static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
>> {
>>        unsigned long block = 0;
>> -       struct buffer_head *bh = NULL;
>> +       struct buffer_head *bh;
>>        int blocksize = ea_inode->i_sb->s_blocksize;
>>        size_t csize, copied = 0;
>> +       void *copy_pos = buf;
>> 
>>        while (copied < size) {
>>                csize = (size - copied) > blocksize ? blocksize : size - copied;
>> @@ -298,10 +330,10 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
>>                if (!bh)
>>                        return -EFSCORRUPTED;
>> 
>> -               memcpy(buf, bh->b_data, csize);
>> +               memcpy(copy_pos, bh->b_data, csize);
>>                brelse(bh);
>> 
>> -               buf += csize;
>> +               copy_pos += csize;
>>                block += 1;
>>                copied += csize;
>>        }
>> @@ -317,29 +349,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>>        inode = ext4_iget(parent->i_sb, ea_ino);
>>        if (IS_ERR(inode)) {
>>                err = PTR_ERR(inode);
>> -               ext4_error(parent->i_sb, "error while reading EA inode %lu "
>> -                          "err=%d", ea_ino, err);
>> +               ext4_error(parent->i_sb,
>> +                          "error while reading EA inode %lu err=%d", ea_ino,
>> +                          err);
>>                return err;
>>        }
>> 
>>        if (is_bad_inode(inode)) {
>> -               ext4_error(parent->i_sb, "error while reading EA inode %lu "
>> -                          "is_bad_inode", ea_ino);
>> +               ext4_error(parent->i_sb,
>> +                          "error while reading EA inode %lu is_bad_inode",
>> +                          ea_ino);
>>                err = -EIO;
>>                goto error;
>>        }
>> 
>> -       if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
>> -           inode->i_generation != parent->i_generation) {
>> -               ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
>> -                          "to parent is invalid.", ea_ino);
>> -               err = -EINVAL;
>> -               goto error;
>> -       }
>> -
>>        if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
>> -               ext4_error(parent->i_sb, "EA inode %lu does not have "
>> -                          "EXT4_EA_INODE_FL flag set.\n", ea_ino);
>> +               ext4_error(parent->i_sb,
>> +                          "EA inode %lu does not have EXT4_EA_INODE_FL flag",
>> +                           ea_ino);
>>                err = -EINVAL;
>>                goto error;
>>        }
>> @@ -351,6 +378,20 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>>        return err;
>> }
>> 
>> +static int
>> +ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size)
>> +{
>> +       u32 hash;
>> +
>> +       /* Verify stored hash matches calculated hash. */
>> +       hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
>> +       if (hash != ext4_xattr_inode_get_hash(ea_inode))
>> +               return -EFSCORRUPTED;
>> +       return 0;
>> +}
>> +
>> +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
>> +
>> /*
>>  * Read the value from the EA inode.
>>  */
>> @@ -358,17 +399,53 @@ static int
>> ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
>>                     size_t size)
>> {
>> +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>>        struct inode *ea_inode;
>> -       int ret;
>> +       int err;
>> 
>> -       ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
>> -       if (ret)
>> -               return ret;
>> +       err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
>> +       if (err) {
>> +               ea_inode = NULL;
>> +               goto out;
>> +       }
>> 
>> -       ret = ext4_xattr_inode_read(ea_inode, buffer, size);
>> -       iput(ea_inode);
>> +       if (i_size_read(ea_inode) != size) {
>> +               ext4_warning_inode(ea_inode,
>> +                                  "ea_inode file size=%llu entry size=%zu",
>> +                                  i_size_read(ea_inode), size);
>> +               err = -EFSCORRUPTED;
>> +               goto out;
>> +       }
>> 
>> -       return ret;
>> +       err = ext4_xattr_inode_read(ea_inode, buffer, size);
>> +       if (err)
>> +               goto out;
>> +
>> +       err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size);
>> +       /*
>> +        * Compatibility check for old Lustre ea_inode implementation. Old
>> +        * version does not have hash validation, but it has a backpointer
>> +        * from ea_inode to the parent inode.
>> +        */
>> +       if (err == -EFSCORRUPTED) {
>> +               if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
>> +                   ea_inode->i_generation != inode->i_generation) {
>> +                       ext4_warning_inode(ea_inode,
>> +                                          "EA inode hash validation failed");
>> +                       goto out;
>> +               }
>> +               /* Do not add ea_inode to the cache. */
>> +               ea_inode_cache = NULL;
>> +       } else if (err)
>> +               goto out;
>> +
>> +       if (ea_inode_cache)
>> +               mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
>> +                                     ext4_xattr_inode_get_hash(ea_inode),
>> +                                     ea_inode->i_ino, true /* reusable */);
>> +out:
>> +       iput(ea_inode);
>> +       return err;
>> }
>> 
>> static int
>> @@ -657,6 +734,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
>>        }
>> }
>> 
>> +static inline size_t round_up_cluster(struct inode *inode, size_t length)
>> +{
>> +       struct super_block *sb = inode->i_sb;
>> +       size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
>> +                                   inode->i_blkbits);
>> +       size_t mask = ~(cluster_size - 1);
>> +
>> +       return (length + cluster_size - 1) & mask;
>> +}
>> +
>> +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
>> +{
>> +       int err;
>> +
>> +       err = dquot_alloc_inode(inode);
>> +       if (err)
>> +               return err;
>> +       err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
>> +       if (err)
>> +               dquot_free_inode(inode);
>> +       return err;
>> +}
>> +
>> +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
>> +{
>> +       dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
>> +       dquot_free_inode(inode);
>> +}
>> +
>> +static int __ext4_xattr_set_credits(struct super_block *sb,
>> +                                   struct buffer_head *block_bh,
>> +                                   size_t value_len)
>> +{
>> +       int credits;
>> +       int blocks;
>> +
>> +       /*
>> +        * 1) Owner inode update
>> +        * 2) Ref count update on old xattr block
>> +        * 3) new xattr block
>> +        * 4) block bitmap update for new xattr block
>> +        * 5) group descriptor for new xattr block
>> +        */
>> +       credits = 5;
>> +
>> +       /* We are done if ea_inode feature is not enabled. */
>> +       if (!ext4_has_feature_ea_inode(sb))
>> +               return credits;
>> +
>> +       /* New ea_inode, inode map, block bitmap, group descriptor. */
>> +       credits += 4;
>> +
>> +       /* Data blocks. */
>> +       blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
>> +
>> +       /* Indirection block or one level of extent tree. */
>> +       blocks += 1;
>> +
>> +       /* Block bitmap and group descriptor updates for each block. */
>> +       credits += blocks * 2;
>> +
>> +       /* Blocks themselves. */
>> +       credits += blocks;
>> +
>> +       /* Dereference ea_inode holding old xattr value.
>> +        * Old ea_inode, inode map, block bitmap, group descriptor.
>> +        */
>> +       credits += 4;
>> +
>> +       /* Data blocks for old ea_inode. */
>> +       blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
>> +
>> +       /* Indirection block or one level of extent tree for old ea_inode. */
>> +       blocks += 1;
>> +
>> +       /* Block bitmap and group descriptor updates for each block. */
>> +       credits += blocks * 2;
>> +
>> +       /* Quota updates. */
>> +       credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
>> +
>> +       /* We may need to clone the existing xattr block in which case we need
>> +        * to increment ref counts for existing ea_inodes referenced by it.
>> +        */
>> +       if (block_bh) {
>> +               struct ext4_xattr_entry *entry = BFIRST(block_bh);
>> +
>> +               for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
>> +                       if (entry->e_value_inum)
>> +                               /* Ref count update on ea_inode. */
>> +                               credits += 1;
>> +       }
>> +       return credits;
>> +}
>> +
>> static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>>                                     int credits, struct buffer_head *bh,
>>                                     bool dirty, bool block_csum)
>> @@ -706,12 +878,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
>>        return 0;
>> }
>> 
>> +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
>> +                                      int ref_change)
>> +{
>> +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
>> +       struct ext4_iloc iloc;
>> +       s64 ref_count;
>> +       u32 hash;
>> +       int ret;
>> +
>> +       inode_lock(ea_inode);
>> +
>> +       ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
>> +       if (ret) {
>> +               iloc.bh = NULL;
>> +               goto out;
>> +       }
>> +
>> +       ref_count = ext4_xattr_inode_get_ref(ea_inode);
>> +       ref_count += ref_change;
>> +       ext4_xattr_inode_set_ref(ea_inode, ref_count);
>> +
>> +       if (ref_change > 0) {
>> +               WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
>> +                         ea_inode->i_ino, ref_count);
>> +
>> +               if (ref_count == 1) {
>> +                       WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
>> +                                 ea_inode->i_ino, ea_inode->i_nlink);
>> +
>> +                       set_nlink(ea_inode, 1);
>> +                       ext4_orphan_del(handle, ea_inode);
>> +
>> +                       hash = ext4_xattr_inode_get_hash(ea_inode);
>> +                       mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
>> +                                             ea_inode->i_ino,
>> +                                             true /* reusable */);
>> +               }
>> +       } else {
>> +               WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
>> +                         ea_inode->i_ino, ref_count);
>> +
>> +               if (ref_count == 0) {
>> +                       WARN_ONCE(ea_inode->i_nlink != 1,
>> +                                 "EA inode %lu i_nlink=%u",
>> +                                 ea_inode->i_ino, ea_inode->i_nlink);
>> +
>> +                       clear_nlink(ea_inode);
>> +                       ext4_orphan_add(handle, ea_inode);
>> +
>> +                       hash = ext4_xattr_inode_get_hash(ea_inode);
>> +                       mb_cache_entry_delete(ea_inode_cache, hash,
>> +                                             ea_inode->i_ino);
>> +               }
>> +       }
>> +
>> +       ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
>> +       iloc.bh = NULL;
>> +       if (ret)
>> +               ext4_warning_inode(ea_inode,
>> +                                  "ext4_mark_iloc_dirty() failed ret=%d", ret);
>> +out:
>> +       brelse(iloc.bh);
>> +       inode_unlock(ea_inode);
>> +       return ret;
>> +}
>> +
>> +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
>> +{
>> +       return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
>> +}
>> +
>> +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
>> +{
>> +       return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
>> +}
>> +
>> +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
>> +                                       struct ext4_xattr_entry *first)
>> +{
>> +       struct inode *ea_inode;
>> +       struct ext4_xattr_entry *entry;
>> +       struct ext4_xattr_entry *failed_entry;
>> +       unsigned int ea_ino;
>> +       int err, saved_err;
>> +
>> +       for (entry = first; !IS_LAST_ENTRY(entry);
>> +            entry = EXT4_XATTR_NEXT(entry)) {
>> +               if (!entry->e_value_inum)
>> +                       continue;
>> +               ea_ino = le32_to_cpu(entry->e_value_inum);
>> +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
>> +               if (err)
>> +                       goto cleanup;
>> +               err = ext4_xattr_inode_inc_ref(handle, ea_inode);
>> +               if (err) {
>> +                       ext4_warning_inode(ea_inode, "inc ref error %d", err);
>> +                       iput(ea_inode);
>> +                       goto cleanup;
>> +               }
>> +               iput(ea_inode);
>> +       }
>> +       return 0;
>> +
>> +cleanup:
>> +       saved_err = err;
>> +       failed_entry = entry;
>> +
>> +       for (entry = first; entry != failed_entry;
>> +            entry = EXT4_XATTR_NEXT(entry)) {
>> +               if (!entry->e_value_inum)
>> +                       continue;
>> +               ea_ino = le32_to_cpu(entry->e_value_inum);
>> +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
>> +               if (err) {
>> +                       ext4_warning(parent->i_sb,
>> +                                    "cleanup ea_ino %u iget error %d", ea_ino,
>> +                                    err);
>> +                       continue;
>> +               }
>> +               err = ext4_xattr_inode_dec_ref(handle, ea_inode);
>> +               if (err)
>> +                       ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
>> +                                          err);
>> +               iput(ea_inode);
>> +       }
>> +       return saved_err;
>> +}
>> +
>> static void
>> -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>> -                           struct buffer_head *bh,
>> -                           struct ext4_xattr_entry *first, bool block_csum,
>> -                           struct ext4_xattr_inode_array **ea_inode_array,
>> -                           int extra_credits)
>> +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
>> +                            struct buffer_head *bh,
>> +                            struct ext4_xattr_entry *first, bool block_csum,
>> +                            struct ext4_xattr_inode_array **ea_inode_array,
>> +                            int extra_credits, bool skip_quota)
>> {
>>        struct inode *ea_inode;
>>        struct ext4_xattr_entry *entry;
>> @@ -748,10 +1048,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>>                        continue;
>>                }
>> 
>> -               inode_lock(ea_inode);
>> -               clear_nlink(ea_inode);
>> -               ext4_orphan_add(handle, ea_inode);
>> -               inode_unlock(ea_inode);
>> +               err = ext4_xattr_inode_dec_ref(handle, ea_inode);
>> +               if (err) {
>> +                       ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
>> +                                          err);
>> +                       continue;
>> +               }
>> +
>> +               if (!skip_quota)
>> +                       ext4_xattr_inode_free_quota(parent,
>> +                                             le32_to_cpu(entry->e_value_size));
>> 
>>                /*
>>                 * Forget about ea_inode within the same transaction that
>> @@ -785,7 +1091,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
>>  */
>> static void
>> ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>> -                        struct buffer_head *bh)
>> +                        struct buffer_head *bh,
>> +                        struct ext4_xattr_inode_array **ea_inode_array,
>> +                        int extra_credits)
>> {
>>        struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>>        u32 hash, ref;
>> @@ -808,6 +1116,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
>>                mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
>>                get_bh(bh);
>>                unlock_buffer(bh);
>> +
>> +               if (ext4_has_feature_ea_inode(inode->i_sb))
>> +                       ext4_xattr_inode_dec_ref_all(handle, inode, bh,
>> +                                                    BFIRST(bh),
>> +                                                    true /* block_csum */,
>> +                                                    ea_inode_array,
>> +                                                    extra_credits,
>> +                                                    true /* skip_quota */);
>>                ext4_free_blocks(handle, inode, bh, 0, 1,
>>                                 EXT4_FREE_BLOCKS_METADATA |
>>                                 EXT4_FREE_BLOCKS_FORGET);
>> @@ -879,8 +1195,8 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>> {
>>        struct buffer_head *bh = NULL;
>>        unsigned long block = 0;
>> -       unsigned blocksize = ea_inode->i_sb->s_blocksize;
>> -       unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
>> +       int blocksize = ea_inode->i_sb->s_blocksize;
>> +       int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
>>        int csize, wsize = 0;
>>        int ret = 0;
>>        int retries = 0;
>> @@ -948,7 +1264,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
>>  * Create an inode to store the value of a large EA.
>>  */
>> static struct inode *ext4_xattr_inode_create(handle_t *handle,
>> -                                            struct inode *inode)
>> +                                            struct inode *inode, u32 hash)
>> {
>>        struct inode *ea_inode = NULL;
>>        uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
>> @@ -966,67 +1282,115 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
>>                ea_inode->i_fop = &ext4_file_operations;
>>                ext4_set_aops(ea_inode);
>>                ext4_xattr_inode_set_class(ea_inode);
>> -               ea_inode->i_generation = inode->i_generation;
>> -               EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
>> -
>> -               /*
>> -                * A back-pointer from EA inode to parent inode will be useful
>> -                * for e2fsck.
>> -                */
>> -               EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
>>                unlock_new_inode(ea_inode);
>> -               err = ext4_inode_attach_jinode(ea_inode);
>> +               ext4_xattr_inode_set_ref(ea_inode, 1);
>> +               ext4_xattr_inode_set_hash(ea_inode, hash);
>> +               err = ext4_mark_inode_dirty(handle, ea_inode);
>> +               if (!err)
>> +                       err = ext4_inode_attach_jinode(ea_inode);
>>                if (err) {
>>                        iput(ea_inode);
>>                        return ERR_PTR(err);
>>                }
>> +
>> +               /*
>> +                * Xattr inodes are shared, so quota charging is performed at a
>> +                * higher level.
>> +                */
>> +               dquot_free_inode(ea_inode);
>> +               dquot_drop(ea_inode);
>> +               inode_lock(ea_inode);
>> +               ea_inode->i_flags |= S_NOQUOTA;
>> +               inode_unlock(ea_inode);
>>        }
>> 
>>        return ea_inode;
>> }
>> 
>> -/*
>> - * Unlink the inode storing the value of the EA.
>> - */
>> -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
>> +static struct inode *
>> +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
>> +                           size_t value_len, u32 hash)
>> {
>> -       struct inode *ea_inode = NULL;
>> -       int err;
>> +       struct inode *ea_inode;
>> +       struct mb_cache_entry *ce;
>> +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
>> +       void *ea_data;
>> 
>> -       err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
>> -       if (err)
>> -               return err;
>> +       ce = mb_cache_entry_find_first(ea_inode_cache, hash);
>> +       if (!ce)
>> +               return NULL;
>> 
>> -       clear_nlink(ea_inode);
>> -       iput(ea_inode);
>> +       ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
>> +       if (!ea_data) {
>> +               mb_cache_entry_put(ea_inode_cache, ce);
>> +               return NULL;
>> +       }
>> 
>> -       return 0;
>> +       while (ce) {
>> +               ea_inode = ext4_iget(inode->i_sb, ce->e_value);
>> +               if (!IS_ERR(ea_inode) &&
>> +                   !is_bad_inode(ea_inode) &&
>> +                   (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
>> +                   i_size_read(ea_inode) == value_len &&
>> +                   !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
>> +                   !ext4_xattr_inode_verify_hash(ea_inode, ea_data,
>> +                                                 value_len) &&
>> +                   !memcmp(value, ea_data, value_len)) {
>> +                       mb_cache_entry_touch(ea_inode_cache, ce);
>> +                       mb_cache_entry_put(ea_inode_cache, ce);
>> +                       kvfree(ea_data);
>> +                       return ea_inode;
>> +               }
>> +
>> +               if (!IS_ERR(ea_inode))
>> +                       iput(ea_inode);
>> +               ce = mb_cache_entry_find_next(ea_inode_cache, ce);
>> +       }
>> +       kvfree(ea_data);
>> +       return NULL;
>> }
>> 
>> /*
>>  * Add value of the EA in an inode.
>>  */
>> -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
>> -                               unsigned long *ea_ino, const void *value,
>> -                               size_t value_len)
>> +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
>> +                                         const void *value, size_t value_len,
>> +                                         struct inode **ret_inode)
>> {
>>        struct inode *ea_inode;
>> +       u32 hash;
>>        int err;
>> 
>> +       hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
>> +       ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
>> +       if (ea_inode) {
>> +               err = ext4_xattr_inode_inc_ref(handle, ea_inode);
>> +               if (err) {
>> +                       iput(ea_inode);
>> +                       return err;
>> +               }
>> +
>> +               *ret_inode = ea_inode;
>> +               return 0;
>> +       }
>> +
>>        /* Create an inode for the EA value */
>> -       ea_inode = ext4_xattr_inode_create(handle, inode);
>> +       ea_inode = ext4_xattr_inode_create(handle, inode, hash);
>>        if (IS_ERR(ea_inode))
>>                return PTR_ERR(ea_inode);
>> 
>>        err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
>> -       if (err)
>> -               clear_nlink(ea_inode);
>> -       else
>> -               *ea_ino = ea_inode->i_ino;
>> +       if (err) {
>> +               ext4_xattr_inode_dec_ref(handle, ea_inode);
>> +               iput(ea_inode);
>> +               return err;
>> +       }
>> 
>> -       iput(ea_inode);
>> +       mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
>> +                             ea_inode->i_ino, true /* reusable */);
>> 
>> -       return err;
>> +       *ret_inode = ea_inode;
>> +       return 0;
>> }
>> 
>> static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>> @@ -1034,9 +1398,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>>                                handle_t *handle, struct inode *inode)
>> {
>>        struct ext4_xattr_entry *last;
>> -       size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
>> +       struct ext4_xattr_entry *here = s->here;
>> +       size_t min_offs = s->end - s->base, name_len = strlen(i->name);
>>        int in_inode = i->in_inode;
>> -       int rc;
>> +       struct inode *old_ea_inode = NULL;
>> +       struct inode *new_ea_inode = NULL;
>> +       size_t old_size, new_size;
>> +       int ret;
>> +
>> +       /* Space used by old and new values. */
>> +       old_size = (!s->not_found && !here->e_value_inum) ?
>> +                       EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
>> +       new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
>> +
>> +       /*
>> +        * Optimization for the simple case when old and new values have the
>> +        * same padded sizes. Not applicable if external inodes are involved.
>> +        */
>> +       if (new_size && new_size == old_size) {
>> +               size_t offs = le16_to_cpu(here->e_value_offs);
>> +               void *val = s->base + offs;
>> +
>> +               here->e_value_size = cpu_to_le32(i->value_len);
>> +               if (i->value == EXT4_ZERO_XATTR_VALUE) {
>> +                       memset(val, 0, new_size);
>> +               } else {
>> +                       memcpy(val, i->value, i->value_len);
>> +                       /* Clear padding bytes. */
>> +                       memset(val + i->value_len, 0, new_size - i->value_len);
>> +               }
>> +               return 0;
>> +       }
>> 
>>        /* Compute min_offs and last. */
>>        last = s->first;
>> @@ -1047,122 +1439,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>>                                min_offs = offs;
>>                }
>>        }
>> -       free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>> -       if (!s->not_found) {
>> -               if (!in_inode &&
>> -                   !s->here->e_value_inum && s->here->e_value_size) {
>> -                       size_t size = le32_to_cpu(s->here->e_value_size);
>> -                       free += EXT4_XATTR_SIZE(size);
>> -               }
>> -               free += EXT4_XATTR_LEN(name_len);
>> -       }
>> +
>> +       /* Check whether we have enough space. */
>>        if (i->value) {
>> -               size_t value_len = EXT4_XATTR_SIZE(i->value_len);
>> +               size_t free;
>> 
>> -               if (in_inode)
>> -                       value_len = 0;
>> +               free = min_offs - ((void *)last - s->base) - sizeof(__u32);
>> +               if (!s->not_found)
>> +                       free += EXT4_XATTR_LEN(name_len) + old_size;
>> 
>> -               if (free < EXT4_XATTR_LEN(name_len) + value_len)
>> -                       return -ENOSPC;
>> +               if (free < EXT4_XATTR_LEN(name_len) + new_size) {
>> +                       ret = -ENOSPC;
>> +                       goto out;
>> +               }
>>        }
>> 
>> -       if (i->value && s->not_found) {
>> -               /* Insert the new name. */
>> -               size_t size = EXT4_XATTR_LEN(name_len);
>> -               size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
>> -               memmove((void *)s->here + size, s->here, rest);
>> -               memset(s->here, 0, size);
>> -               s->here->e_name_index = i->name_index;
>> -               s->here->e_name_len = name_len;
>> -               memcpy(s->here->e_name, i->name, name_len);
>> -       } else {
>> -               if (!s->here->e_value_inum && s->here->e_value_size &&
>> -                   s->here->e_value_offs > 0) {
>> -                       void *first_val = s->base + min_offs;
>> -                       size_t offs = le16_to_cpu(s->here->e_value_offs);
>> -                       void *val = s->base + offs;
>> -                       size_t size = EXT4_XATTR_SIZE(
>> -                               le32_to_cpu(s->here->e_value_size));
>> -
>> -                       if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
>> -                               /* The old and the new value have the same
>> -                                  size. Just replace. */
>> -                               s->here->e_value_size =
>> -                                       cpu_to_le32(i->value_len);
>> -                               if (i->value == EXT4_ZERO_XATTR_VALUE) {
>> -                                       memset(val, 0, size);
>> -                               } else {
>> -                                       /* Clear pad bytes first. */
>> -                                       memset(val + size - EXT4_XATTR_PAD, 0,
>> -                                              EXT4_XATTR_PAD);
>> -                                       memcpy(val, i->value, i->value_len);
>> -                               }
>> -                               return 0;
>> -                       }
>> +       /*
>> +        * Getting access to old and new ea inodes is subject to failures.
>> +        * Finish that work before doing any modifications to the xattr data.
>> +        */
>> +       if (!s->not_found && here->e_value_inum) {
>> +               ret = ext4_xattr_inode_iget(inode,
>> +                                           le32_to_cpu(here->e_value_inum),
>> +                                           &old_ea_inode);
>> +               if (ret) {
>> +                       old_ea_inode = NULL;
>> +                       goto out;
>> +               }
>> +       }
>> +       if (i->value && in_inode) {
>> +               WARN_ON_ONCE(!i->value_len);
>> 
>> -                       /* Remove the old value. */
>> -                       memmove(first_val + size, first_val, val - first_val);
>> -                       memset(first_val, 0, size);
>> -                       s->here->e_value_size = 0;
>> -                       s->here->e_value_offs = 0;
>> -                       min_offs += size;
>> -
>> -                       /* Adjust all value offsets. */
>> -                       last = s->first;
>> -                       while (!IS_LAST_ENTRY(last)) {
>> -                               size_t o = le16_to_cpu(last->e_value_offs);
>> -                               if (!last->e_value_inum &&
>> -                                   last->e_value_size && o < offs)
>> -                                       last->e_value_offs =
>> -                                               cpu_to_le16(o + size);
>> -                               last = EXT4_XATTR_NEXT(last);
>> -                       }
>> +               ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
>> +               if (ret)
>> +                       goto out;
>> +
>> +               ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
>> +                                                    i->value_len,
>> +                                                    &new_ea_inode);
>> +               if (ret) {
>> +                       new_ea_inode = NULL;
>> +                       ext4_xattr_inode_free_quota(inode, i->value_len);
>> +                       goto out;
>>                }
>> -               if (s->here->e_value_inum) {
>> -                       ext4_xattr_inode_unlink(inode,
>> -                                           le32_to_cpu(s->here->e_value_inum));
>> -                       s->here->e_value_inum = 0;
>> +       }
>> +
>> +       if (old_ea_inode) {
>> +               /* We are ready to release ref count on the old_ea_inode. */
>> +               ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
>> +               if (ret) {
>> +                       /* Release newly acquired ref count on new_ea_inode. */
>> +                       if (new_ea_inode) {
>> +                               int err;
>> +
>> +                               err = ext4_xattr_inode_dec_ref(handle,
>> +                                                              new_ea_inode);
>> +                               if (err)
>> +                                       ext4_warning_inode(new_ea_inode,
>> +                                                 "dec ref new_ea_inode err=%d",
>> +                                                 err);
>> +                               ext4_xattr_inode_free_quota(inode,
>> +                                                           i->value_len);
>> +                       }
>> +                       goto out;
>>                }
>> -               if (!i->value) {
>> -                       /* Remove the old name. */
>> -                       size_t size = EXT4_XATTR_LEN(name_len);
>> -                       last = ENTRY((void *)last - size);
>> -                       memmove(s->here, (void *)s->here + size,
>> -                               (void *)last - (void *)s->here + sizeof(__u32));
>> -                       memset(last, 0, size);
>> +
>> +               ext4_xattr_inode_free_quota(inode,
>> +                                           le32_to_cpu(here->e_value_size));
>> +       }
>> +
>> +       /* No failures allowed past this point. */
>> +
>> +       if (!s->not_found && here->e_value_offs) {
>> +               /* Remove the old value. */
>> +               void *first_val = s->base + min_offs;
>> +               size_t offs = le16_to_cpu(here->e_value_offs);
>> +               void *val = s->base + offs;
>> +
>> +               memmove(first_val + old_size, first_val, val - first_val);
>> +               memset(first_val, 0, old_size);
>> +               min_offs += old_size;
>> +
>> +               /* Adjust all value offsets. */
>> +               last = s->first;
>> +               while (!IS_LAST_ENTRY(last)) {
>> +                       size_t o = le16_to_cpu(last->e_value_offs);
>> +
>> +                       if (!last->e_value_inum &&
>> +                           last->e_value_size && o < offs)
>> +                               last->e_value_offs = cpu_to_le16(o + old_size);
>> +                       last = EXT4_XATTR_NEXT(last);
>>                }
>>        }
>> 
>> +       if (!i->value) {
>> +               /* Remove old name. */
>> +               size_t size = EXT4_XATTR_LEN(name_len);
>> +
>> +               last = ENTRY((void *)last - size);
>> +               memmove(here, (void *)here + size,
>> +                       (void *)last - (void *)here + sizeof(__u32));
>> +               memset(last, 0, size);
>> +       } else if (s->not_found) {
>> +               /* Insert new name. */
>> +               size_t size = EXT4_XATTR_LEN(name_len);
>> +               size_t rest = (void *)last - (void *)here + sizeof(__u32);
>> +
>> +               memmove((void *)here + size, here, rest);
>> +               memset(here, 0, size);
>> +               here->e_name_index = i->name_index;
>> +               here->e_name_len = name_len;
>> +               memcpy(here->e_name, i->name, name_len);
>> +       } else {
>> +               /* This is an update, reset value info. */
>> +               here->e_value_inum = 0;
>> +               here->e_value_offs = 0;
>> +               here->e_value_size = 0;
>> +       }
>> +
>>        if (i->value) {
>> -               /* Insert the new value. */
>> +               /* Insert new value. */
>>                if (in_inode) {
>> -                       unsigned long ea_ino =
>> -                               le32_to_cpu(s->here->e_value_inum);
>> -                       rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
>> -                                                 i->value, i->value_len);
>> -                       if (rc)
>> -                               goto out;
>> -                       s->here->e_value_inum = cpu_to_le32(ea_ino);
>> -                       s->here->e_value_offs = 0;
>> +                       here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
>>                } else if (i->value_len) {
>> -                       size_t size = EXT4_XATTR_SIZE(i->value_len);
>> -                       void *val = s->base + min_offs - size;
>> -                       s->here->e_value_offs = cpu_to_le16(min_offs - size);
>> -                       s->here->e_value_inum = 0;
>> +                       void *val = s->base + min_offs - new_size;
>> +
>> +                       here->e_value_offs = cpu_to_le16(min_offs - new_size);
>>                        if (i->value == EXT4_ZERO_XATTR_VALUE) {
>> -                               memset(val, 0, size);
>> +                               memset(val, 0, new_size);
>>                        } else {
>> -                               /* Clear the pad bytes first. */
>> -                               memset(val + size - EXT4_XATTR_PAD, 0,
>> -                                      EXT4_XATTR_PAD);
>>                                memcpy(val, i->value, i->value_len);
>> +                               /* Clear padding bytes. */
>> +                               memset(val + i->value_len, 0,
>> +                                      new_size - i->value_len);
>>                        }
>>                }
>> -               s->here->e_value_size = cpu_to_le32(i->value_len);
>> +               here->e_value_size = cpu_to_le32(i->value_len);
>>        }
>> -
>> +       ret = 0;
>> out:
>> -       return rc;
>> +       iput(old_ea_inode);
>> +       iput(new_ea_inode);
>> +       return ret;
>> }
>> 
>> struct ext4_xattr_block_find {
>> @@ -1224,6 +1642,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>>        struct mb_cache_entry *ce = NULL;
>>        int error = 0;
>>        struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
>> +       struct inode *ea_inode = NULL;
>> +       size_t old_ea_inode_size = 0;
>> 
>> #define header(x) ((struct ext4_xattr_header *)(x))
>> 
>> @@ -1278,6 +1698,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>>                        header(s->base)->h_refcount = cpu_to_le32(1);
>>                        s->here = ENTRY(s->base + offset);
>>                        s->end = s->base + bs->bh->b_size;
>> +
>> +                       /*
>> +                        * If existing entry points to an xattr inode, we need
>> +                        * to prevent ext4_xattr_set_entry() from decrementing
>> +                        * ref count on it because the reference belongs to the
>> +                        * original block. In this case, make the entry look
>> +                        * like it has an empty value.
>> +                        */
>> +                       if (!s->not_found && s->here->e_value_inum) {
>> +                               /*
>> +                                * Defer quota free call for previous inode
>> +                                * until success is guaranteed.
>> +                                */
>> +                               old_ea_inode_size = le32_to_cpu(
>> +                                                       s->here->e_value_size);
>> +                               s->here->e_value_inum = 0;
>> +                               s->here->e_value_size = 0;
>> +                       }
>>                }
>>        } else {
>>                /* Allocate a buffer where we construct the new block. */
>> @@ -1299,6 +1737,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>>                goto bad_block;
>>        if (error)
>>                goto cleanup;
>> +
>> +       if (i->value && s->here->e_value_inum) {
>> +               unsigned int ea_ino;
>> +
>> +               /*
>> +                * A ref count on ea_inode has been taken as part of the call to
>> +                * ext4_xattr_set_entry() above. We would like to drop this
>> +                * extra ref but we have to wait until the xattr block is
>> +                * initialized and has its own ref count on the ea_inode.
>> +                */
>> +               ea_ino = le32_to_cpu(s->here->e_value_inum);
>> +               error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
>> +               if (error) {
>> +                       ea_inode = NULL;
>> +                       goto cleanup;
>> +               }
>> +       }
>> +
>>        if (!IS_LAST_ENTRY(s->first))
>>                ext4_xattr_rehash(header(s->base), s->here);
>> 
>> @@ -1409,6 +1865,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>>                                                 EXT4_FREE_BLOCKS_METADATA);
>>                                goto cleanup;
>>                        }
>> +                       error = ext4_xattr_inode_inc_ref_all(handle, inode,
>> +                                                     ENTRY(header(s->base)+1));
>> +                       if (error)
>> +                               goto getblk_failed;
>> +                       if (ea_inode) {
>> +                               /* Drop the extra ref on ea_inode. */
>> +                               error = ext4_xattr_inode_dec_ref(handle,
>> +                                                                ea_inode);
>> +                               if (error)
>> +                                       ext4_warning_inode(ea_inode,
>> +                                                          "dec ref error=%d",
>> +                                                          error);
>> +                               iput(ea_inode);
>> +                               ea_inode = NULL;
>> +                       }
>> +
>>                        lock_buffer(new_bh);
>>                        error = ext4_journal_get_create_access(handle, new_bh);
>>                        if (error) {
>> @@ -1428,15 +1900,38 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>>                }
>>        }
>> 
>> +       if (old_ea_inode_size)
>> +               ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
>> +
>>        /* Update the inode. */
>>        EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
>> 
>>        /* Drop the previous xattr block. */
>> -       if (bs->bh && bs->bh != new_bh)
>> -               ext4_xattr_release_block(handle, inode, bs->bh);
>> +       if (bs->bh && bs->bh != new_bh) {
>> +               struct ext4_xattr_inode_array *ea_inode_array = NULL;
>> +
>> +               ext4_xattr_release_block(handle, inode, bs->bh,
>> +                                        &ea_inode_array,
>> +                                        0 /* extra_credits */);
>> +               ext4_xattr_inode_array_free(ea_inode_array);
>> +       }
>>        error = 0;
>> 
>> cleanup:
>> +       if (ea_inode) {
>> +               int error2;
>> +
>> +               error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
>> +               if (error2)
>> +                       ext4_warning_inode(ea_inode, "dec ref error=%d",
>> +                                          error2);
>> +
>> +               /* If there was an error, revert the quota charge. */
>> +               if (error)
>> +                       ext4_xattr_inode_free_quota(inode,
>> +                                                   i_size_read(ea_inode));
>> +               iput(ea_inode);
>> +       }
>>        if (ce)
>>                mb_cache_entry_put(ext4_mb_cache, ce);
>>        brelse(new_bh);
>> @@ -1561,6 +2056,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
>>        return !memcmp(value, i->value, i->value_len);
>> }
>> 
>> +static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
>> +{
>> +       struct buffer_head *bh;
>> +       int error;
>> +
>> +       if (!EXT4_I(inode)->i_file_acl)
>> +               return NULL;
>> +       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
>> +       if (!bh)
>> +               return ERR_PTR(-EIO);
>> +       error = ext4_xattr_check_block(inode, bh);
>> +       if (error)
>> +               return ERR_PTR(error);
>> +       return bh;
>> +}
>> +
>> /*
>>  * ext4_xattr_set_handle()
>>  *
>> @@ -1603,9 +2114,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>> 
>>        /* Check journal credits under write lock. */
>>        if (ext4_handle_valid(handle)) {
>> +               struct buffer_head *bh;
>>                int credits;
>> 
>> -               credits = ext4_xattr_set_credits(inode, value_len);
>> +               bh = ext4_xattr_get_block(inode);
>> +               if (IS_ERR(bh)) {
>> +                       error = PTR_ERR(bh);
>> +                       goto cleanup;
>> +               }
>> +
>> +               credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
>> +               brelse(bh);
>> +
>>                if (!ext4_handle_has_enough_credits(handle, credits)) {
>>                        error = -ENOSPC;
>>                        goto cleanup;
>> @@ -1641,6 +2161,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>>                if (flags & XATTR_CREATE)
>>                        goto cleanup;
>>        }
>> +
>>        if (!value) {
>>                if (!is.s.not_found)
>>                        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>> @@ -1709,34 +2230,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
>>        return error;
>> }
>> 
>> -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
>> +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
>> {
>> -       struct super_block *sb = inode->i_sb;
>> -       int credits;
>> -
>> -       if (!EXT4_SB(sb)->s_journal)
>> -               return 0;
>> +       struct buffer_head *bh;
>> +       int err;
>> 
>> -       credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
>> +       *credits = 0;
>> 
>> -       /*
>> -        * In case of inline data, we may push out the data to a block,
>> -        * so we need to reserve credits for this eventuality
>> -        */
>> -       if (ext4_has_inline_data(inode))
>> -               credits += ext4_writepage_trans_blocks(inode) + 1;
>> -
>> -       if (ext4_has_feature_ea_inode(sb)) {
>> -               int nrblocks = (value_len + sb->s_blocksize - 1) >>
>> -                                       sb->s_blocksize_bits;
>> +       if (!EXT4_SB(inode->i_sb)->s_journal)
>> +               return 0;
>> 
>> -               /* For new inode */
>> -               credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
>> +       down_read(&EXT4_I(inode)->xattr_sem);
>> 
>> -               /* For data blocks of EA inode */
>> -               credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
>> +       bh = ext4_xattr_get_block(inode);
>> +       if (IS_ERR(bh)) {
>> +               err = PTR_ERR(bh);
>> +       } else {
>> +               *credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
>> +               brelse(bh);
>> +               err = 0;
>>        }
>> -       return credits;
>> +
>> +       up_read(&EXT4_I(inode)->xattr_sem);
>> +       return err;
>> }
>> 
>> /*
>> @@ -1761,7 +2277,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
>>                return error;
>> 
>> retry:
>> -       credits = ext4_xattr_set_credits(inode, value_len);
>> +       error = ext4_xattr_set_credits(inode, value_len, &credits);
>> +       if (error)
>> +               return error;
>> +
>>        handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
>>        if (IS_ERR(handle)) {
>>                error = PTR_ERR(handle);
>> @@ -2067,10 +2586,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>>        return error;
>> }
>> 
>> -
>> #define EIA_INCR 16 /* must be 2^n */
>> #define EIA_MASK (EIA_INCR - 1)
>> -/* Add the large xattr @inode into @ea_inode_array for later deletion.
>> +
>> +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
>>  * If @ea_inode_array is new or full it will be grown and the old
>>  * contents copied over.
>>  */
>> @@ -2115,21 +2634,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
>>  * ext4_xattr_delete_inode()
>>  *
>>  * Free extended attribute resources associated with this inode. Traverse
>> - * all entries and unlink any xattr inodes associated with this inode. This
>> - * is called immediately before an inode is freed. We have exclusive
>> - * access to the inode. If an orphan inode is deleted it will also delete any
>> - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
>> - * to ensure they belong to the parent inode and were not deleted already.
>> + * all entries and decrement reference on any xattr inodes associated with this
>> + * inode. This is called immediately before an inode is freed. We have exclusive
>> + * access to the inode. If an orphan inode is deleted it will also release its
>> + * references on xattr block and xattr inodes.
>>  */
>> -int
>> -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>> -                       struct ext4_xattr_inode_array **ea_inode_array,
>> -                       int extra_credits)
>> +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>> +                           struct ext4_xattr_inode_array **ea_inode_array,
>> +                           int extra_credits)
>> {
>>        struct buffer_head *bh = NULL;
>>        struct ext4_xattr_ibody_header *header;
>> -       struct ext4_inode *raw_inode;
>>        struct ext4_iloc iloc = { .bh = NULL };
>> +       struct ext4_xattr_entry *entry;
>>        int error;
>> 
>>        error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
>> @@ -2141,66 +2658,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>>                goto cleanup;
>>        }
>> 
>> -       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
>> -               goto delete_external_ea;
>> +       if (ext4_has_feature_ea_inode(inode->i_sb) &&
>> +           ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
>> 
>> -       error = ext4_get_inode_loc(inode, &iloc);
>> -       if (error)
>> -               goto cleanup;
>> -
>> -       error = ext4_journal_get_write_access(handle, iloc.bh);
>> -       if (error)
>> -               goto cleanup;
>> +               error = ext4_get_inode_loc(inode, &iloc);
>> +               if (error) {
>> +                       EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
>> +                       goto cleanup;
>> +               }
>> 
>> -       raw_inode = ext4_raw_inode(&iloc);
>> -       header = IHDR(inode, raw_inode);
>> -       ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
>> -                                   false /* block_csum */, ea_inode_array,
>> -                                   extra_credits);
>> +               error = ext4_journal_get_write_access(handle, iloc.bh);
>> +               if (error) {
>> +                       EXT4_ERROR_INODE(inode, "write access (error %d)",
>> +                                        error);
>> +                       goto cleanup;
>> +               }
>> 
>> -delete_external_ea:
>> -       if (!EXT4_I(inode)->i_file_acl) {
>> -               error = 0;
>> -               goto cleanup;
>> -       }
>> -       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
>> -       if (!bh) {
>> -               EXT4_ERROR_INODE(inode, "block %llu read error",
>> -                                EXT4_I(inode)->i_file_acl);
>> -               error = -EIO;
>> -               goto cleanup;
>> -       }
>> -       if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
>> -           BHDR(bh)->h_blocks != cpu_to_le32(1)) {
>> -               EXT4_ERROR_INODE(inode, "bad block %llu",
>> -                                EXT4_I(inode)->i_file_acl);
>> -               error = -EFSCORRUPTED;
>> -               goto cleanup;
>> +               header = IHDR(inode, ext4_raw_inode(&iloc));
>> +               if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
>> +                       ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
>> +                                                    IFIRST(header),
>> +                                                    false /* block_csum */,
>> +                                                    ea_inode_array,
>> +                                                    extra_credits,
>> +                                                    false /* skip_quota */);
>>        }
>> 
>> -       if (ext4_has_feature_ea_inode(inode->i_sb)) {
>> -               error = ext4_journal_get_write_access(handle, bh);
>> -               if (error) {
>> -                       EXT4_ERROR_INODE(inode, "write access %llu",
>> +       if (EXT4_I(inode)->i_file_acl) {
>> +               bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
>> +               if (!bh) {
>> +                       EXT4_ERROR_INODE(inode, "block %llu read error",
>>                                         EXT4_I(inode)->i_file_acl);
>> +                       error = -EIO;
>> +                       goto cleanup;
>> +               }
>> +               error = ext4_xattr_check_block(inode, bh);
>> +               if (error) {
>> +                       EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
>> +                                        EXT4_I(inode)->i_file_acl, error);
>>                        goto cleanup;
>>                }
>> -               ext4_xattr_inode_remove_all(handle, inode, bh,
>> -                                           BFIRST(bh),
>> -                                           true /* block_csum */,
>> -                                           ea_inode_array,
>> -                                           extra_credits);
>> -       }
>> 
>> -       ext4_xattr_release_block(handle, inode, bh);
>> -       /* Update i_file_acl within the same transaction that releases block. */
>> -       EXT4_I(inode)->i_file_acl = 0;
>> -       error = ext4_mark_inode_dirty(handle, inode);
>> -       if (error) {
>> -               EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
>> -                                error);
>> -               goto cleanup;
>> +               if (ext4_has_feature_ea_inode(inode->i_sb)) {
>> +                       for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
>> +                            entry = EXT4_XATTR_NEXT(entry))
>> +                               if (entry->e_value_inum)
>> +                                       ext4_xattr_inode_free_quota(inode,
>> +                                             le32_to_cpu(entry->e_value_size));
>> +
>> +               }
>> +
>> +               ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
>> +                                        extra_credits);
>> +               /*
>> +                * Update i_file_acl value in the same transaction that releases
>> +                * block.
>> +                */
>> +               EXT4_I(inode)->i_file_acl = 0;
>> +               error = ext4_mark_inode_dirty(handle, inode);
>> +               if (error) {
>> +                       EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
>> +                                        error);
>> +                       goto cleanup;
>> +               }
>>        }
>> +       error = 0;
>> cleanup:
>>        brelse(iloc.bh);
>>        brelse(bh);
>> @@ -2209,17 +2731,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>> 
>> void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
>> {
>> -       struct inode    *ea_inode;
>> -       int             idx = 0;
>> +       int idx;
>> 
>>        if (ea_inode_array == NULL)
>>                return;
>> 
>> -       for (; idx < ea_inode_array->count; ++idx) {
>> -               ea_inode = ea_inode_array->inodes[idx];
>> -               clear_nlink(ea_inode);
>> -               iput(ea_inode);
>> -       }
>> +       for (idx = 0; idx < ea_inode_array->count; ++idx)
>> +               iput(ea_inode_array->inodes[idx]);
>>        kfree(ea_inode_array);
>> }
>> 
>> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
>> index b2005a2716d9..67616cb9a059 100644
>> --- a/fs/ext4/xattr.h
>> +++ b/fs/ext4/xattr.h
>> @@ -69,19 +69,6 @@ struct ext4_xattr_entry {
>>                EXT4_I(inode)->i_extra_isize))
>> #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>> 
>> -/*
>> - * Link EA inode back to parent one using i_mtime field.
>> - * Extra integer type conversion added to ignore higher
>> - * bits in i_mtime.tv_sec which might be set by ext4_get()
>> - */
>> -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
>> -do {                                                  \
>> -      (inode)->i_mtime.tv_sec = inum;                 \
>> -} while(0)
>> -
>> -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
>> -((__u32)(inode)->i_mtime.tv_sec)
>> -
>> /*
>>  * The minimum size of EA value when you start storing it in an external inode
>>  * size of block - size of header - size of 1 entry - 4 null bytes
>> @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
>> extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
>> extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
>> extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
>> -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
>> +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
>> +                                 int *credits);
>> 
>> -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
>> extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
>>                                   struct ext4_xattr_inode_array **array,
>>                                   int extra_credits);
>> diff --git a/fs/mbcache.c b/fs/mbcache.c
>> index 45a8d52dc991..d818fd236787 100644
>> --- a/fs/mbcache.c
>> +++ b/fs/mbcache.c
>> @@ -13,10 +13,11 @@
>>  * mb_cache_entry_delete()).
>>  *
>>  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
>> - * They use hash of a block contents as a key and block number as a value.
>> - * That's why keys need not be unique (different xattr blocks may end up having
>> - * the same hash). However block number always uniquely identifies a cache
>> - * entry.
>> + * Ext4 also uses it for deduplication of xattr values stored in inodes.
>> + * They use hash of data as a key and provide a value that may represent a
>> + * block or inode number. That's why keys need not be unique (hash of different
>> + * data may be the same). However user provided value always uniquely
>> + * identifies a cache entry.
>>  *
>>  * We provide functions for creation and removal of entries, search by key,
>>  * and a special "delete entry with given key-value pair" operation. Fixed
>> --
>> 2.13.1.518.g3df882009-goog
>> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 23/31] mbcache: make mbcache naming more generic
  2017-06-20  9:01         ` [PATCH v2 23/31] mbcache: make mbcache naming " Tahsin Erdogan
@ 2017-06-21 17:43           ` Andreas Dilger
  2017-06-21 18:33           ` Andreas Dilger
  1 sibling, 0 replies; 100+ messages in thread
From: Andreas Dilger @ 2017-06-21 17:43 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jan Kara, linux-ext4, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 11405 bytes --]

On Jun 20, 2017, at 3:01 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> Make names more generic so that mbcache usage is not limited to
> block sharing. In a subsequent patch in the series
> ("ext4: xattr inode deduplication"), we start using the mbcache code
> for sharing xattr inodes. With that patch, the old mb_cache_entry.e_block
> field could hold either a block number or an inode number.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> v2: updated commit title and description
> 
> fs/ext2/xattr.c         | 18 +++++++++---------
> fs/ext4/xattr.c         | 10 +++++-----
> fs/mbcache.c            | 43 +++++++++++++++++++++----------------------
> include/linux/mbcache.h | 11 +++++------
> 4 files changed, 40 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
> index fbdb8f171893..1e5f76070580 100644
> --- a/fs/ext2/xattr.c
> +++ b/fs/ext2/xattr.c
> @@ -493,8 +493,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
> 			 * This must happen under buffer lock for
> 			 * ext2_xattr_set2() to reliably detect modified block
> 			 */
> -			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
> -						    hash, bh->b_blocknr);
> +			mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash,
> +					      bh->b_blocknr);
> 
> 			/* keep the buffer locked while modifying it. */
> 		} else {
> @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
> 			 * This must happen under buffer lock for
> 			 * ext2_xattr_set2() to reliably detect freed block
> 			 */
> -			mb_cache_entry_delete_block(ext2_mb_cache,
> -						    hash, old_bh->b_blocknr);
> +			mb_cache_entry_delete(ext2_mb_cache, hash,
> +					      old_bh->b_blocknr);
> 			/* Free the old block. */
> 			ea_bdebug(old_bh, "freeing");
> 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
> @@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode)
> 		 * This must happen under buffer lock for ext2_xattr_set2() to
> 		 * reliably detect freed block
> 		 */
> -		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
> -					    hash, bh->b_blocknr);
> +		mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash,
> +				      bh->b_blocknr);
> 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
> 		get_bh(bh);
> 		bforget(bh);
> @@ -907,11 +907,11 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
> 	while (ce) {
> 		struct buffer_head *bh;
> 
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
> 		if (!bh) {
> 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
> 				"inode %ld: block %ld read error",
> -				inode->i_ino, (unsigned long) ce->e_block);
> +				inode->i_ino, (unsigned long) ce->e_value);
> 		} else {
> 			lock_buffer(bh);
> 			/*
> @@ -931,7 +931,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
> 			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
> 				   EXT2_XATTR_REFCOUNT_MAX) {
> 				ea_idebug(inode, "block %ld refcount %d>%d",
> -					  (unsigned long) ce->e_block,
> +					  (unsigned long) ce->e_value,
> 					  le32_to_cpu(HDR(bh)->h_refcount),
> 					  EXT2_XATTR_REFCOUNT_MAX);
> 			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index c09fcffb0878..0b43e0e52e26 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> 		 * This must happen under buffer lock for
> 		 * ext4_xattr_block_set() to reliably detect freed block
> 		 */
> -		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
> +		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
> 		get_bh(bh);
> 		unlock_buffer(bh);
> 		ext4_free_blocks(handle, inode, bh, 0, 1,
> @@ -1115,8 +1115,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
> 			 * ext4_xattr_block_set() to reliably detect modified
> 			 * block
> 			 */
> -			mb_cache_entry_delete_block(ext4_mb_cache, hash,
> -						    bs->bh->b_blocknr);
> +			mb_cache_entry_delete(ext4_mb_cache, hash,
> +					      bs->bh->b_blocknr);
> 			ea_bdebug(bs->bh, "modifying in-place");
> 			error = ext4_xattr_set_entry(i, s, handle, inode);
> 			if (!error) {
> @@ -2238,10 +2238,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
> 	while (ce) {
> 		struct buffer_head *bh;
> 
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
> 		if (!bh) {
> 			EXT4_ERROR_INODE(inode, "block %lu read error",
> -					 (unsigned long) ce->e_block);
> +					 (unsigned long) ce->e_value);
> 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
> 			*pce = ce;
> 			return bh;
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index b19be429d655..45a8d52dc991 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -10,7 +10,7 @@
> /*
>  * Mbcache is a simple key-value store. Keys need not be unique, however
>  * key-value pairs are expected to be unique (we use this fact in
> - * mb_cache_entry_delete_block()).
> + * mb_cache_entry_delete()).
>  *
>  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
>  * They use hash of a block contents as a key and block number as a value.
> @@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
>  * @cache - cache where the entry should be created
>  * @mask - gfp mask with which the entry should be allocated
>  * @key - key of the entry
> - * @block - block that contains data
> - * @reusable - is the block reusable by other inodes?
> + * @value - value of the entry
> + * @reusable - is the entry reusable by others?
>  *
> - * Creates entry in @cache with key @key and records that data is stored in
> - * block @block. The function returns -EBUSY if entry with the same key
> - * and for the same block already exists in cache. Otherwise 0 is returned.
> + * Creates entry in @cache with key @key and value @value. The function returns
> + * -EBUSY if entry with the same key and value already exists in cache.
> + * Otherwise 0 is returned.
>  */
> int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable)
> +			  u64 value, bool reusable)
> {
> 	struct mb_cache_entry *entry, *dup;
> 	struct hlist_bl_node *dup_node;
> @@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> 	/* One ref for hash, one ref returned */
> 	atomic_set(&entry->e_refcnt, 1);
> 	entry->e_key = key;
> -	entry->e_block = block;
> +	entry->e_value = value;
> 	entry->e_reusable = reusable;
> 	head = mb_cache_entry_head(cache, key);
> 	hlist_bl_lock(head);
> 	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
> -		if (dup->e_key == key && dup->e_block == block) {
> +		if (dup->e_key == key && dup->e_value == value) {
> 			hlist_bl_unlock(head);
> 			kmem_cache_free(mb_entry_cache, entry);
> 			return -EBUSY;
> @@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
> EXPORT_SYMBOL(mb_cache_entry_find_next);
> 
> /*
> - * mb_cache_entry_get - get a cache entry by block number (and key)
> + * mb_cache_entry_get - get a cache entry by value (and key)
>  * @cache - cache we work with
> - * @key - key of block number @block
> - * @block - block number
> + * @key - key
> + * @value - value
>  */
> struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block)
> +					  u64 value)
> {
> 	struct hlist_bl_node *node;
> 	struct hlist_bl_head *head;
> @@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> 	head = mb_cache_entry_head(cache, key);
> 	hlist_bl_lock(head);
> 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
> 			atomic_inc(&entry->e_refcnt);
> 			goto out;
> 		}
> @@ -214,15 +214,14 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> }
> EXPORT_SYMBOL(mb_cache_entry_get);
> 
> -/* mb_cache_entry_delete_block - remove information about block from cache
> +/* mb_cache_entry_delete - remove a cache entry
>  * @cache - cache we work with
> - * @key - key of block @block
> - * @block - block number
> + * @key - key
> + * @value - value
>  *
> - * Remove entry from cache @cache with key @key with data stored in @block.
> + * Remove entry from cache @cache with key @key and value @value.
>  */
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				 sector_t block)
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
> {
> 	struct hlist_bl_node *node;
> 	struct hlist_bl_head *head;
> @@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> 	head = mb_cache_entry_head(cache, key);
> 	hlist_bl_lock(head);
> 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
> 			/* We keep hash list reference to keep entry alive */
> 			hlist_bl_del_init(&entry->e_hash_list);
> 			hlist_bl_unlock(head);
> @@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> 	}
> 	hlist_bl_unlock(head);
> }
> -EXPORT_SYMBOL(mb_cache_entry_delete_block);
> +EXPORT_SYMBOL(mb_cache_entry_delete);
> 
> /* mb_cache_entry_touch - cache entry got used
>  * @cache - cache the entry belongs to
> diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> index 86c9a8b480c5..e1bc73414983 100644
> --- a/include/linux/mbcache.h
> +++ b/include/linux/mbcache.h
> @@ -19,15 +19,15 @@ struct mb_cache_entry {
> 	u32			e_key;
> 	u32			e_referenced:1;
> 	u32			e_reusable:1;
> -	/* Block number of hashed block - stable during lifetime of the entry */
> -	sector_t		e_block;
> +	/* User provided value - stable during lifetime of the entry */
> +	u64			e_value;
> };
> 
> struct mb_cache *mb_cache_create(int bucket_bits);
> void mb_cache_destroy(struct mb_cache *cache);
> 
> int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable);
> +			  u64 value, bool reusable);
> void __mb_cache_entry_free(struct mb_cache_entry *entry);
> static inline int mb_cache_entry_put(struct mb_cache *cache,
> 				     struct mb_cache_entry *entry)
> @@ -38,10 +38,9 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
> 	return 1;
> }
> 
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				  sector_t block);
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
> struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block);
> +					  u64 value);
> struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
> 						 u32 key);
> struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
> --
> 2.13.1.518.g3df882009-goog
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 23/31] mbcache: make mbcache naming more generic
  2017-06-20  9:01         ` [PATCH v2 23/31] mbcache: make mbcache naming " Tahsin Erdogan
  2017-06-21 17:43           ` Andreas Dilger
@ 2017-06-21 18:33           ` Andreas Dilger
  2017-06-21 21:39             ` Tahsin Erdogan
  1 sibling, 1 reply; 100+ messages in thread
From: Andreas Dilger @ 2017-06-21 18:33 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jan Kara, linux-ext4, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 12260 bytes --]

On Jun 20, 2017, at 3:01 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> Make names more generic so that mbcache usage is not limited to
> block sharing. In a subsequent patch in the series
> ("ext4: xattr inode deduplication"), we start using the mbcache code
> for sharing xattr inodes. With that patch, old mb_cache_entry.e_block
> field could be holding either a block number or an inode number.

Actually, looking at the later 28/31 patch made me come back to add a few
comments to this patch.  Not strictly required, but I think it makes the
code a bit more clear.

> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2: updated commit title and description
> 
> fs/ext2/xattr.c         | 18 +++++++++---------
> fs/ext4/xattr.c         | 10 +++++-----
> fs/mbcache.c            | 43 +++++++++++++++++++++----------------------
> include/linux/mbcache.h | 11 +++++------
> 4 files changed, 40 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
> index fbdb8f171893..1e5f76070580 100644
> --- a/fs/ext2/xattr.c
> +++ b/fs/ext2/xattr.c
> @@ -493,8 +493,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
> 			 * This must happen under buffer lock for
> 			 * ext2_xattr_set2() to reliably detect modified block
> 			 */
> -			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
> -						    hash, bh->b_blocknr);
> +			mb_cache_entry_delete(EXT2_SB(sb)->s_mb_cache, hash,
> +					      bh->b_blocknr);

Since we now also have the ea_inode cache, it would be better to rename
s_mb_cache to s_mb_block_cache to make it more clear what it is.  That
isn't strictly needed for ext2, but better to keep it consistent with ext4.

> 			/* keep the buffer locked while modifying it. */
> 		} else {
> @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
> 			 * This must happen under buffer lock for
> 			 * ext2_xattr_set2() to reliably detect freed block
> 			 */
> -			mb_cache_entry_delete_block(ext2_mb_cache,
> -						    hash, old_bh->b_blocknr);
> +			mb_cache_entry_delete(ext2_mb_cache, hash,
> +					      old_bh->b_blocknr);

Minor nit - having a function-local variable named "ext2_mb_cache" and
"ext4_mb_cache" makes it look like this is a global variable.  I think
this name is left over from when it _was_ actually a single global cache
rather than a per-filesystem cache.  It would be better to use a name
like "mb_block_cache" to make it clear that this is the local block cache.

> 			/* Free the old block. */
> 			ea_bdebug(old_bh, "freeing");
> 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
> @@ -795,8 +795,8 @@ ext2_xattr_delete_inode(struct inode *inode)
> 		 * This must happen under buffer lock for ext2_xattr_set2() to
> 		 * reliably detect freed block
> 		 */
> -		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
> -					    hash, bh->b_blocknr);
> +		mb_cache_entry_delete(EXT2_SB(inode->i_sb)->s_mb_cache, hash,
> +				      bh->b_blocknr);
> 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
> 		get_bh(bh);
> 		bforget(bh);
> @@ -907,11 +907,11 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
> 	while (ce) {
> 		struct buffer_head *bh;
> 
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
> 		if (!bh) {
> 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
> 				"inode %ld: block %ld read error",
> -				inode->i_ino, (unsigned long) ce->e_block);
> +				inode->i_ino, (unsigned long) ce->e_value);
> 		} else {
> 			lock_buffer(bh);
> 			/*
> @@ -931,7 +931,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
> 			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
> 				   EXT2_XATTR_REFCOUNT_MAX) {
> 				ea_idebug(inode, "block %ld refcount %d>%d",
> -					  (unsigned long) ce->e_block,
> +					  (unsigned long) ce->e_value,
> 					  le32_to_cpu(HDR(bh)->h_refcount),
> 					  EXT2_XATTR_REFCOUNT_MAX);
> 			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index c09fcffb0878..0b43e0e52e26 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -678,7 +678,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
> 		 * This must happen under buffer lock for
> 		 * ext4_xattr_block_set() to reliably detect freed block
> 		 */
> -		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
> +		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);

This local variable should be named "mb_block_cache" or similar.

> 		get_bh(bh);
> 		unlock_buffer(bh);
> 		ext4_free_blocks(handle, inode, bh, 0, 1,
> @@ -1115,8 +1115,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
> 			 * ext4_xattr_block_set() to reliably detect modified
> 			 * block
> 			 */
> -			mb_cache_entry_delete_block(ext4_mb_cache, hash,
> -						    bs->bh->b_blocknr);
> +			mb_cache_entry_delete(ext4_mb_cache, hash,
> +					      bs->bh->b_blocknr);

s/ext4_mb_cache/mb_block_cache/

> 			ea_bdebug(bs->bh, "modifying in-place");
> 			error = ext4_xattr_set_entry(i, s, handle, inode);
> 			if (!error) {
> @@ -2238,10 +2238,10 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
> 	while (ce) {
> 		struct buffer_head *bh;
> 
> -		bh = sb_bread(inode->i_sb, ce->e_block);
> +		bh = sb_bread(inode->i_sb, ce->e_value);
> 		if (!bh) {
> 			EXT4_ERROR_INODE(inode, "block %lu read error",
> -					 (unsigned long) ce->e_block);
> +					 (unsigned long) ce->e_value);

(style) no space after typecast

> 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
> 			*pce = ce;
> 			return bh;
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index b19be429d655..45a8d52dc991 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -10,7 +10,7 @@
> /*
>  * Mbcache is a simple key-value store. Keys need not be unique, however
>  * key-value pairs are expected to be unique (we use this fact in
> - * mb_cache_entry_delete_block()).
> + * mb_cache_entry_delete()).
>  *
>  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
>  * They use hash of a block contents as a key and block number as a value.
> @@ -62,15 +62,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
>  * @cache - cache where the entry should be created
>  * @mask - gfp mask with which the entry should be allocated
>  * @key - key of the entry
> - * @block - block that contains data
> - * @reusable - is the block reusable by other inodes?
> + * @value - value of the entry
> + * @reusable - is the entry reusable by others?
>  *
> - * Creates entry in @cache with key @key and records that data is stored in
> - * block @block. The function returns -EBUSY if entry with the same key
> - * and for the same block already exists in cache. Otherwise 0 is returned.
> + * Creates entry in @cache with key @key and value @value. The function returns
> + * -EBUSY if entry with the same key and value already exists in cache.
> + * Otherwise 0 is returned.
>  */
> int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable)
> +			  u64 value, bool reusable)
> {
> 	struct mb_cache_entry *entry, *dup;
> 	struct hlist_bl_node *dup_node;
> @@ -91,12 +91,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> 	/* One ref for hash, one ref returned */
> 	atomic_set(&entry->e_refcnt, 1);
> 	entry->e_key = key;
> -	entry->e_block = block;
> +	entry->e_value = value;
> 	entry->e_reusable = reusable;
> 	head = mb_cache_entry_head(cache, key);
> 	hlist_bl_lock(head);
> 	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
> -		if (dup->e_key == key && dup->e_block == block) {
> +		if (dup->e_key == key && dup->e_value == value) {
> 			hlist_bl_unlock(head);
> 			kmem_cache_free(mb_entry_cache, entry);
> 			return -EBUSY;
> @@ -187,13 +187,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
> EXPORT_SYMBOL(mb_cache_entry_find_next);
> 
> /*
> - * mb_cache_entry_get - get a cache entry by block number (and key)
> + * mb_cache_entry_get - get a cache entry by value (and key)
>  * @cache - cache we work with
> - * @key - key of block number @block
> - * @block - block number
> + * @key - key
> + * @value - value
>  */
> struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block)
> +					  u64 value)
> {
> 	struct hlist_bl_node *node;
> 	struct hlist_bl_head *head;
> @@ -202,7 +202,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> 	head = mb_cache_entry_head(cache, key);
> 	hlist_bl_lock(head);
> 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
> 			atomic_inc(&entry->e_refcnt);
> 			goto out;
> 		}
> @@ -214,15 +214,14 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> }
> EXPORT_SYMBOL(mb_cache_entry_get);
> 
> -/* mb_cache_entry_delete_block - remove information about block from cache
> +/* mb_cache_entry_delete - remove a cache entry
>  * @cache - cache we work with
> - * @key - key of block @block
> - * @block - block number
> + * @key - key
> + * @value - value
>  *
> - * Remove entry from cache @cache with key @key with data stored in @block.
> + * Remove entry from cache @cache with key @key and value @value.
>  */
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				 sector_t block)
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
> {
> 	struct hlist_bl_node *node;
> 	struct hlist_bl_head *head;
> @@ -231,7 +230,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> 	head = mb_cache_entry_head(cache, key);
> 	hlist_bl_lock(head);
> 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> -		if (entry->e_key == key && entry->e_block == block) {
> +		if (entry->e_key == key && entry->e_value == value) {
> 			/* We keep hash list reference to keep entry alive */
> 			hlist_bl_del_init(&entry->e_hash_list);
> 			hlist_bl_unlock(head);
> @@ -248,7 +247,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> 	}
> 	hlist_bl_unlock(head);
> }
> -EXPORT_SYMBOL(mb_cache_entry_delete_block);
> +EXPORT_SYMBOL(mb_cache_entry_delete);
> 
> /* mb_cache_entry_touch - cache entry got used
>  * @cache - cache the entry belongs to
> diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> index 86c9a8b480c5..e1bc73414983 100644
> --- a/include/linux/mbcache.h
> +++ b/include/linux/mbcache.h
> @@ -19,15 +19,15 @@ struct mb_cache_entry {
> 	u32			e_key;
> 	u32			e_referenced:1;
> 	u32			e_reusable:1;
> -	/* Block number of hashed block - stable during lifetime of the entry */
> -	sector_t		e_block;
> +	/* User provided value - stable during lifetime of the entry */
> +	u64			e_value;
> };
> 
> struct mb_cache *mb_cache_create(int bucket_bits);
> void mb_cache_destroy(struct mb_cache *cache);
> 
> int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> -			  sector_t block, bool reusable);
> +			  u64 value, bool reusable);
> void __mb_cache_entry_free(struct mb_cache_entry *entry);
> static inline int mb_cache_entry_put(struct mb_cache *cache,
> 				     struct mb_cache_entry *entry)
> @@ -38,10 +38,9 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
> 	return 1;
> }
> 
> -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
> -				  sector_t block);
> +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
> struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> -					  sector_t block);
> +					  u64 value);
> struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
> 						 u32 key);
> struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
> --
> 2.13.1.518.g3df882009-goog
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-20  9:07                   ` [PATCH v5 " Tahsin Erdogan
  2017-06-20  9:49                     ` Tahsin Erdogan
@ 2017-06-21 21:14                     ` Andreas Dilger
  2017-06-21 21:34                       ` Tahsin Erdogan
  2017-07-04 18:39                     ` Theodore Ts'o
  2 siblings, 1 reply; 100+ messages in thread
From: Andreas Dilger @ 2017-06-21 21:14 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Darrick J . Wong, linux-ext4, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2295 bytes --]

On Jun 20, 2017, at 3:07 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> Ext4 now supports xattr values that are up to 64k in size (vfs limit).
> Large xattr values are stored in external inodes each one holding a
> single value. Once written the data blocks of these inodes are immutable.
> 
> The real world use cases are expected to have a lot of value duplication
> such as inherited acls etc. To reduce data duplication on disk, this patch
> implements a deduplicator that allows sharing of xattr inodes.
> 
> The deduplication is based on an in-memory hash lookup that is a best
> effort sharing scheme. When a xattr inode is read from disk (i.e.
> getxattr() call), its crc32c hash is added to a hash table. Before
> creating a new xattr inode for a value being set, the hash table is
> checked to see if an existing inode holds an identical value. If such an
> inode is found, the ref count on that inode is incremented. On value
> removal the ref count is decremented and if it reaches zero the inode is
> deleted.
> 
> The quota charging for such inodes is manually managed. Every reference
> holder is charged the full size as if there was no sharing happening.
> This is consistent with how xattr blocks are also charged.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d79d8d7bee88..59e9488c4876 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
> 	long s_es_nr_inode;
> 	struct ext4_es_stats s_es_stats;
> 	struct mb_cache *s_mb_cache;
> +	struct mb_cache *s_ea_inode_cache;

These names should be consistent, like "s_ea_block_cache".

> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 0484df8dadd1..d7e60358ec91 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
> #define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
> 				inode->i_sb->s_fs_info)->s_mb_cache)
> 
> +#define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
> +				inode->i_sb->s_fs_info)->s_ea_inode_cache)

These names should be consistent, like EXT4_GET_EA_CACHE() or maybe
EXT4_GET_EA_BLOCK_CACHE() and EXT4_GET_EA_INODE_CACHE().

The rest of the changes look reasonable.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-21 21:14                     ` Andreas Dilger
@ 2017-06-21 21:34                       ` Tahsin Erdogan
  2017-06-21 21:42                         ` Andreas Dilger
  0 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-21 21:34 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Darrick J . Wong, linux-ext4, linux-kernel

> Tahsin, we are already using the "no_mbcache" option name, so would prefer
> to keep that working.  It would be OK to accept both option names to mean
> the same thing, and only document the "nombcache" option.

Updated patch to accept both nombcache and no_mbcache.

>>       struct mb_cache *s_mb_cache;
>> +     struct mb_cache *s_ea_inode_cache;
>
> These names should be consistent, like "s_ea_block_cache".

Yes, I will rename s_mb_cache to s_ea_block_cache.

>> #define EXT4_GET_MB_CACHE(inode)      (((struct ext4_sb_info *) \
>>                               inode->i_sb->s_fs_info)->s_mb_cache)
>>
>> +#define EA_INODE_CACHE(inode)        (((struct ext4_sb_info *) \
>> +                             inode->i_sb->s_fs_info)->s_ea_inode_cache)
>
> These names should be consistent, like EXT4_GET_EA_CACHE() or maybe
> EXT4_GET_EA_BLOCK_CACHE() and EXT4_GET_EA_INODE_CACHE().

How about EA_BLOCK_CACHE() and EA_INODE_CACHE() to keep them short?

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v2 23/31] mbcache: make mbcache naming more generic
  2017-06-21 18:33           ` Andreas Dilger
@ 2017-06-21 21:39             ` Tahsin Erdogan
  0 siblings, 0 replies; 100+ messages in thread
From: Tahsin Erdogan @ 2017-06-21 21:39 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Jan Kara, linux-ext4, linux-kernel

Hi Andreas, I have incorporated your suggestions into another patch
that renames things in ext2/ext4 ("[PATCH 24/32] ext2, ext4: make mb
block cache names more explicit").

> Since we now also have the ea_inode cache, it would be better to rename
> s_mb_cache to s_mb_block_cache to make it more clear what it is.  That
> isn't strictly needed for ext2, but better to keep it consistent with ext4.

Done

> Minor nit - having a function-local variable named "ext2_mb_cache" and
> "ext4_mb_cache" makes it look like this is a global variable.  I think
> this name is left over from when it _was_ actually a single global cache
> rather than a per-filesystem cache.  It would be better to use a name
> like "mb_block_cache" to make it clear that this is the local block cache.

Done

> This local variable should be named "mb_block_cache" or similar.

Done

> s/ext4_mb_cache/mb_block_cache/
Done

> (style) no space after typecast
Fixed

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-21 21:34                       ` Tahsin Erdogan
@ 2017-06-21 21:42                         ` Andreas Dilger
  0 siblings, 0 replies; 100+ messages in thread
From: Andreas Dilger @ 2017-06-21 21:42 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Darrick J . Wong, linux-ext4, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1141 bytes --]


> On Jun 21, 2017, at 3:34 PM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
>> Tahsin, we are already using the "no_mbcache" option name, so would prefer
>> to keep that working.  It would be OK to accept both option names to mean
>> the same thing, and only document the "nombcache" option.
> 
> Updated patch to accept both nombcache and no_mbcache.
> 
>>>      struct mb_cache *s_mb_cache;
>>> +     struct mb_cache *s_ea_inode_cache;
>> 
>> These names should be consistent, like "s_ea_block_cache".
> 
> Yes, I will rename this to s_ea_block_cache.
> 
>>> #define EXT4_GET_MB_CACHE(inode)      (((struct ext4_sb_info *) \
>>>                              inode->i_sb->s_fs_info)->s_mb_cache)
>>> 
>>> +#define EA_INODE_CACHE(inode)        (((struct ext4_sb_info *) \
>>> +                             inode->i_sb->s_fs_info)->s_ea_inode_cache)
>> 
>> These names should be consistent, like EXT4_GET_EA_CACHE() or maybe
>> EXT4_GET_EA_BLOCK_CACHE() and EXT4_GET_EA_INODE_CACHE().
> 
> How about EA_BLOCK_CACHE() and EA_INODE_CACHE() to keep them short?

Sure, that is fine since these macros are local to xattr.c.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-06-20  9:07                   ` [PATCH v5 " Tahsin Erdogan
  2017-06-20  9:49                     ` Tahsin Erdogan
  2017-06-21 21:14                     ` Andreas Dilger
@ 2017-07-04 18:39                     ` Theodore Ts'o
  2017-07-05 17:30                       ` Tahsin Erdogan
  2 siblings, 1 reply; 100+ messages in thread
From: Theodore Ts'o @ 2017-07-04 18:39 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Andreas Dilger, Darrick J . Wong, linux-ext4

I'm going to fold in the following patch to the xattr inode
deduplication patch to fix up some corner cases which
__ext4_xattr_set_credits() wasn't quite getting right.

				- Ted

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 4befc7369c0d..435a49811218 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -762,10 +762,11 @@ static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
 	dquot_free_inode(inode);
 }
 
-static int __ext4_xattr_set_credits(struct super_block *sb,
+static int __ext4_xattr_set_credits(struct inode *inode,
 				    struct buffer_head *block_bh,
 				    size_t value_len)
 {
+	struct super_block *sb = inode->i_sb;
 	int credits;
 	int blocks;
 
@@ -775,12 +776,25 @@ static int __ext4_xattr_set_credits(struct super_block *sb,
 	 * 3) new xattr block
 	 * 4) block bitmap update for new xattr block
 	 * 5) group descriptor for new xattr block
+	 * 6) block bitmap update for old xattr block
+	 * 7) group descriptor for new old block
+	 *
+	 * 6 & 7 can happen if we have two racing threads T_a and T_b
+	 * which are each trying to set an xattr on inodes I_a and I_b
+	 * which were both initially sharing an xattr block.
 	 */
-	credits = 5;
+	credits = 7;
 
 	/* Quota updates. */
 	credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
 
+	/*
+	 * In case of inline data, we may push out the data to a block,
+	 * so we need to reserve credits for this eventuality
+	 */
+	if (ext4_has_inline_data(inode))
+		credits += ext4_writepage_trans_blocks(inode) + 1;
+
 	/* We are done if ea_inode feature is not enabled. */
 	if (!ext4_has_feature_ea_inode(sb))
 		return credits;
@@ -2120,7 +2134,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 			goto cleanup;
 		}
 
-		credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		credits = __ext4_xattr_set_credits(inode, bh, value_len);
 		brelse(bh);
 
 		if (!ext4_handle_has_enough_credits(handle, credits)) {
@@ -2243,7 +2257,7 @@ int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
 	if (IS_ERR(bh)) {
 		err = PTR_ERR(bh);
 	} else {
-		*credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
+		*credits = __ext4_xattr_set_credits(inode, bh, value_len);
 		brelse(bh);
 		err = 0;
 	}

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-07-04 18:39                     ` Theodore Ts'o
@ 2017-07-05 17:30                       ` Tahsin Erdogan
  2017-07-06  4:19                         ` Theodore Ts'o
  0 siblings, 1 reply; 100+ messages in thread
From: Tahsin Erdogan @ 2017-07-05 17:30 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Andreas Dilger, Darrick J . Wong, Ext4 Developers List

>          * 5) group descriptor for new xattr block
> +        * 6) block bitmap update for old xattr block
> +        * 7) group descriptor for new old block

s/new old block/old block/

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH v5 27/28] ext4: xattr inode deduplication
  2017-07-05 17:30                       ` Tahsin Erdogan
@ 2017-07-06  4:19                         ` Theodore Ts'o
  0 siblings, 0 replies; 100+ messages in thread
From: Theodore Ts'o @ 2017-07-06  4:19 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Andreas Dilger, Darrick J . Wong, Ext4 Developers List

On Wed, Jul 05, 2017 at 10:30:17AM -0700, Tahsin Erdogan wrote:
> >          * 5) group descriptor for new xattr block
> > +        * 6) block bitmap update for old xattr block
> > +        * 7) group descriptor for new old block
> 
> s/new old block/old block/

Thanks, I've fixed that up in my tree.

	     	   	      - Ted

^ permalink raw reply	[flat|nested] 100+ messages in thread

end of thread, other threads:[~2017-07-06  4:19 UTC | newest]

Thread overview: 100+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-31  8:14 [PATCH 01/28] ext4: xattr-in-inode support Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 02/28] ext4: fix lockdep warning about recursive inode locking Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 03/28] ext4: lock inode before calling ext4_orphan_add() Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 04/28] ext4: do not set posix acls on xattr inodes Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 05/28] ext4: attach jinode after creation of xattr inode Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 06/28] ext4: ea_inode owner should be the same as the inode owner Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 07/28] ext4: call journal revoke when freeing ea_inode blocks Tahsin Erdogan
2017-05-31 16:12   ` Darrick J. Wong
2017-05-31 16:12     ` [Ocfs2-devel] " Darrick J. Wong
2017-05-31 16:12     ` Darrick J. Wong
2017-05-31 21:01     ` Tahsin Erdogan
2017-06-05 22:08     ` Andreas Dilger
2017-05-31  8:14 ` [PATCH 08/28] ext4: fix ref counting for ea_inode Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 09/28] ext4: extended attribute value size limit is enforced by vfs Tahsin Erdogan
2017-05-31 16:03   ` Darrick J. Wong
2017-05-31 16:03     ` [Ocfs2-devel] " Darrick J. Wong
2017-05-31 16:03     ` Darrick J. Wong
2017-05-31 16:13     ` Tahsin Erdogan
2017-05-31  8:14 ` [PATCH 10/28] ext4: change ext4_xattr_inode_iget() signature Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 11/28] ext4: clean up ext4_xattr_inode_get() Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 12/28] ext4: add missing le32_to_cpu(e_value_inum) conversions Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 13/28] ext4: ext4_xattr_value_same() should return false for external data Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 14/28] ext4: fix ext4_xattr_make_inode_space() value size calculation Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 15/28] ext4: fix ext4_xattr_move_to_block() Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 16/28] ext4: fix ext4_xattr_cmp() Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 17/28] ext4: fix credits calculation for xattr inode Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 18/28] ext4: retry storing value in external inode with xattr block too Tahsin Erdogan
2017-06-20  8:56   ` [PATCH v2 18/31] " Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 19/28] ext4: ext4_xattr_delete_inode() should return accurate errors Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 20/28] ext4: improve journal credit handling in set xattr paths Tahsin Erdogan
2017-06-20  8:59   ` [PATCH v2 20/31] " Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 21/28] ext4: modify ext4_xattr_ino_array to hold struct inode * Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 22/28] ext4: move struct ext4_xattr_inode_array to xattr.h Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 23/28] mbcache: make mbcache more generic Tahsin Erdogan
2017-06-15  7:41   ` Jan Kara
2017-06-15  7:41     ` [Ocfs2-devel] " Jan Kara
2017-06-15 18:25     ` Tahsin Erdogan
2017-06-19  8:50       ` Jan Kara
2017-06-19  8:50         ` [Ocfs2-devel] " Jan Kara
2017-06-20  9:01         ` [PATCH v2 23/31] mbcache: make mbcache naming " Tahsin Erdogan
2017-06-21 17:43           ` Andreas Dilger
2017-06-21 18:33           ` Andreas Dilger
2017-06-21 21:39             ` Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 24/28] ext4: rename mb block cache functions Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 25/28] ext4: add ext4_is_quota_file() Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 26/28] ext4: cleanup transaction restarts during inode deletion Tahsin Erdogan
2017-06-14 14:17   ` [PATCH v2 " Tahsin Erdogan
2017-06-15  0:11     ` Andreas Dilger
2017-06-15  0:11       ` [Ocfs2-devel] " Andreas Dilger
2017-06-20  9:04       ` [PATCH v3 " Tahsin Erdogan
2017-06-20  9:29         ` Tahsin Erdogan
2017-05-31  8:15 ` [PATCH 27/28] ext4: xattr inode deduplication Tahsin Erdogan
2017-05-31 15:40   ` kbuild test robot
2017-05-31 15:40     ` [Ocfs2-devel] " kbuild test robot
2017-05-31 15:50   ` kbuild test robot
2017-05-31 15:50     ` [Ocfs2-devel] " kbuild test robot
2017-05-31 16:00   ` Darrick J. Wong
2017-05-31 16:00     ` [Ocfs2-devel] " Darrick J. Wong
2017-05-31 16:00     ` Darrick J. Wong
2017-05-31 22:33     ` [PATCH v2 " Tahsin Erdogan
2017-06-02  5:41       ` Darrick J. Wong
2017-06-02  5:41         ` [Ocfs2-devel] " Darrick J. Wong
2017-06-02  5:41         ` Darrick J. Wong
2017-06-02 12:46         ` Tahsin Erdogan
2017-06-02 17:59           ` Darrick J. Wong
2017-06-02 17:59             ` [Ocfs2-devel] " Darrick J. Wong
2017-06-02 17:59             ` Darrick J. Wong
2017-06-02 23:35             ` [PATCH v3 " Tahsin Erdogan
2017-06-14 14:34               ` [PATCH v4 " Tahsin Erdogan
2017-06-14 23:26                 ` Andreas Dilger
2017-06-20  9:07                   ` [PATCH v5 " Tahsin Erdogan
2017-06-20  9:49                     ` Tahsin Erdogan
2017-06-21 17:42                       ` Andreas Dilger
2017-06-21 21:14                     ` Andreas Dilger
2017-06-21 21:34                       ` Tahsin Erdogan
2017-06-21 21:42                         ` Andreas Dilger
2017-07-04 18:39                     ` Theodore Ts'o
2017-07-05 17:30                       ` Tahsin Erdogan
2017-07-06  4:19                         ` Theodore Ts'o
2017-05-31  8:15 ` [PATCH 28/28] quota: add extra inode count to dquot transfer functions Tahsin Erdogan
2017-06-15  7:57   ` Jan Kara
2017-06-15  7:57     ` [Ocfs2-devel] " Jan Kara
2017-06-17  1:50     ` Tahsin Erdogan
2017-06-19  9:03       ` Jan Kara
2017-06-19  9:03         ` [Ocfs2-devel] " Jan Kara
2017-06-19 11:46         ` Tahsin Erdogan
2017-06-19 12:36           ` Jan Kara
2017-06-19 12:36             ` [Ocfs2-devel] " Jan Kara
2017-06-20  9:12             ` [PATCH v2 28/31] quota: add get_inode_usage callback to transfer multi-inode charges Tahsin Erdogan
2017-06-20 12:01               ` Tahsin Erdogan
2017-06-20 15:28               ` Jan Kara
2017-06-20 18:08                 ` [PATCH v3 " Tahsin Erdogan
2017-06-21  4:48                   ` Theodore Ts'o
2017-06-21 11:22                     ` Tahsin Erdogan
2017-06-20  9:53             ` [PATCH 28/28] quota: add extra inode count to dquot transfer functions Tahsin Erdogan
2017-05-31 16:42 ` [PATCH 01/28] ext4: xattr-in-inode support Darrick J. Wong
2017-05-31 16:42   ` [Ocfs2-devel] " Darrick J. Wong
2017-05-31 16:42   ` Darrick J. Wong
2017-05-31 19:59   ` Tahsin Erdogan
2017-06-01 15:50     ` [PATCH v2 " Tahsin Erdogan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.