All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Cc: linux-ext4@vger.kernel.org, adilger.kernel@dilger.ca,
	Andreas Dilger <andreas.dilger@intel.com>
Subject: Re: [RFC PATCH v2 1/2] ext4: dirdata feature
Date: Tue, 7 Nov 2017 10:53:33 -0800	[thread overview]
Message-ID: <20171107185333.GA6233@magnolia> (raw)
In-Reply-To: <20171101212455.47964-2-artem.blagodarenko@gmail.com>

On Thu, Nov 02, 2017 at 12:24:54AM +0300, Artem Blagodarenko wrote:
> From: Andreas Dilger <andreas.dilger@intel.com>
> 
> This patch implements feature which allows ext4 fs users (e.g. Lustre)
> to store data in ext4 dirent. Data is stored in ext4 dirent after
> file-name, this space is accounted in de->rec_len.
> Flag EXT4_DIRENT_LUFID added to d_type if extra data
> is present.
> 
> Make use of dentry->d_fsdata to pass fid to ext4. so no
> changes in ext4_add_entry() interface required.
> 
> Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
> ---
>  fs/ext4/dir.c    |  17 +++++---
>  fs/ext4/ext4.h   |  85 ++++++++++++++++++++++++++++++++++---
>  fs/ext4/inline.c |  18 ++++----
>  fs/ext4/namei.c  | 126 ++++++++++++++++++++++++++++++++++++++++++-------------
>  fs/ext4/super.c  |   3 +-
>  5 files changed, 200 insertions(+), 49 deletions(-)
> 
> diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
> index b04e882179c6..46fcb8ec47a6 100644
> --- a/fs/ext4/dir.c
> +++ b/fs/ext4/dir.c
> @@ -67,11 +67,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
>  	const int rlen = ext4_rec_len_from_disk(de->rec_len,
>  						dir->i_sb->s_blocksize);
>  
> -	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
> +	if (unlikely(rlen < __EXT4_DIR_REC_LEN(1)))
>  		error_msg = "rec_len is smaller than minimal";
>  	else if (unlikely(rlen % 4 != 0))
>  		error_msg = "rec_len % 4 != 0";
> -	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
> +	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de)))
>  		error_msg = "rec_len is too small for name_len";
>  	else if (unlikely(((char *) de - buf) + rlen > size))
>  		error_msg = "directory entry across range";
> @@ -218,7 +218,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
>  				 * failure will be detected in the
>  				 * dirent test below. */
>  				if (ext4_rec_len_from_disk(de->rec_len,
> -					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
> +						sb->s_blocksize) <
> +						__EXT4_DIR_REC_LEN(1))
>  					break;
>  				i += ext4_rec_len_from_disk(de->rec_len,
>  							    sb->s_blocksize);
> @@ -441,12 +442,18 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
>  	struct fname *fname, *new_fn;
>  	struct dir_private_info *info;
>  	int len;
> +	int extra_data = 0;
>  
>  	info = dir_file->private_data;
>  	p = &info->root.rb_node;
>  
>  	/* Create and allocate the fname structure */
> -	len = sizeof(struct fname) + ent_name->len + 1;
> +	if (dirent->file_type & ~EXT4_FT_MASK)
> +		extra_data = ext4_get_dirent_data_len(dirent);
> +
> +	len = sizeof(struct fname) + dirent->name_len + extra_data + 1;
> +
> +
>  	new_fn = kzalloc(len, GFP_KERNEL);
>  	if (!new_fn)
>  		return -ENOMEM;
> @@ -455,7 +462,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
>  	new_fn->inode = le32_to_cpu(dirent->inode);
>  	new_fn->name_len = ent_name->len;
>  	new_fn->file_type = dirent->file_type;
> -	memcpy(new_fn->name, ent_name->name, ent_name->len);
> +	memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data);
>  	new_fn->name[ent_name->len] = 0;
>  
>  	while (*p) {
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index e2abe01c8c6b..9a9b01b0956a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1111,6 +1111,7 @@ struct ext4_inode_info {
>   * Mount flags set via mount options or defaults
>   */
>  #define EXT4_MOUNT_NO_MBCACHE		0x00001 /* Do not use mbcache */
> +#define EXT4_MOUNT_DIRDATA		0x00002 /* Data in directory entries*/
>  #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
>  #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
>  #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
> @@ -1804,7 +1805,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
>  					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>  					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>  					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
> -					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
> +					 EXT4_FEATURE_INCOMPAT_LARGEDIR | \
> +					 EXT4_FEATURE_INCOMPAT_DIRDATA)
>  #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>  					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>  					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
> @@ -1965,6 +1967,45 @@ struct ext4_dir_entry_tail {
>  
>  #define EXT4_FT_DIR_CSUM	0xDE
>  
> +#define EXT4_FT_MASK		0xf
> +
> +#if EXT4_FT_MAX > EXT4_FT_MASK
> +#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
> +#endif
> +
> +/*
> + * d_type has 4 unused bits, so it can hold four types data. these different
> + * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be
> + * stored, in flag order, after file-name in ext4 dirent.
> + */
> +/*
> + * this flag is added to d_type if ext4 dirent has extra data after
> + * filename. this data length is variable and length is stored in first byte
> + * of data. data start after filename NUL byte.
> + * This is used by Lustre FS.
> + */
> +#define EXT4_DIRENT_LUFID		0x10
> +#define EXT4_DIRENT_INODE		0x20
> +#define DIRENT_INODE_LEN		2

Unrelated addition, since large inodes are the next patch?

> +
> +#define EXT4_LUFID_MAGIC    0xAD200907UL
> +struct ext4_dentry_param {
> +	__u32  edp_magic;	/* EXT4_LUFID_MAGIC */

If this is an on-disk data structure, this field type should be __le32.

> +	char   edp_len;		/* size of edp_data in bytes */

Don't we already have a length byte preceding edp_magic that tells us
the length of the data?  I guess it's necessary for the incore buffer to
track the length of edp_data, but since this gets memcpy'd into the
dirent that means we store redundant size information.

> +	char   edp_data[0];	/* packed array of data */

(and these should be __u8, not char)

> +} __packed;
> +
> +static inline unsigned char *ext4_dentry_get_data(struct super_block *sb,
> +		struct ext4_dentry_param *p)
> +{
> +	if (!ext4_has_feature_dirdata(sb))
> +		return NULL;
> +	if (p && p->edp_magic == EXT4_LUFID_MAGIC)
> +		return &p->edp_len;
> +	else
> +		return NULL;
> +}
> +
>  /*
>   * EXT4_DIR_PAD defines the directory entries boundaries
>   *
> @@ -1972,8 +2013,11 @@ struct ext4_dir_entry_tail {
>   */
>  #define EXT4_DIR_PAD			4
>  #define EXT4_DIR_ROUND			(EXT4_DIR_PAD - 1)
> -#define EXT4_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT4_DIR_ROUND) & \
> +#define __EXT4_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT4_DIR_ROUND) & \
>  					 ~EXT4_DIR_ROUND)
> +#define EXT4_DIR_REC_LEN(de)		(__EXT4_DIR_REC_LEN(de->name_len +\
> +					ext4_get_dirent_data_len(de)))

Now that we have __EXT4_DIR_REC_LEN and EXT4_DIR_REC_LEN, how about a
comment to describe how they differ from each other?

> +
>  #define EXT4_MAX_REC_LEN		((1<<16)-1)
>  
>  /*
> @@ -2376,7 +2420,10 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
>  			     struct buffer_head *bh,
>  			     void *buf, int buf_size,
>  			     struct ext4_filename *fname,
> -			     struct ext4_dir_entry_2 **dest_de);
> +			     struct ext4_dir_entry_2 **dest_de,
> +			     bool is_dotdot,
> +			     bool *write_short_dotdot,
> +			     unsigned short dotdot_reclen);
>  void ext4_insert_dentry(struct inode *inode,
>  			struct ext4_dir_entry_2 *de,
>  			int buf_size,
> @@ -2392,10 +2439,16 @@ static const unsigned char ext4_filetype_table[] = {
>  
>  static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
>  {
> -	if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
> +	int fl_index = filetype & EXT4_FT_MASK;
> +
> +	if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX)
>  		return DT_UNKNOWN;
>  
> -	return ext4_filetype_table[filetype];
> +	if (!test_opt(sb, DIRDATA))
> +		return (ext4_filetype_table[fl_index]);

What's the use case for having the incompat feature flag set on disk but
no mount option?

> +	return (ext4_filetype_table[fl_index]) |
> +		(filetype & ~EXT4_FT_MASK);

So I guess this just overrides DT_*?  Is the high nibble of de->filetype
(the new EXT4_DIRENT_* flags) exposed to userspace?  It would seem to
be, since the return value is passed to dir_emit(), in which case
userland readdir callers are in for a surprise.

>  }
>  extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
>  			     void *buf, int buf_size);
> @@ -3271,6 +3324,28 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>  
>  extern const struct iomap_ops ext4_iomap_ops;
>  
> +/*
> + * Compute the total directory entry data length.
> + * This includes the filename and an implicit NUL terminator (always present),
> + * and optional extensions.  Each extension has a bit set in the high 4 bits of
> + * de->file_type, and the extension length is the first byte in each entry.
> + */
> +static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de)
> +{
> +	char *len = de->name + de->name_len + 1 /* NUL terminator */;
> +	int dlen = 0;
> +	__u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
> +
> +	while (extra_data_flags) {
> +		if (extra_data_flags & 1) {
> +			dlen += *len + (dlen == 0);
> +			len += *len;

Ugh, dereferencing a char pointer to get the length.  See later rant
about adding struct ext4_dirent_data_header to avoid this raw byte
interpretation stuff.

> +		}
> +		extra_data_flags >>= 1;
> +	}
> +	return dlen;
> +}
> +
>  #endif	/* __KERNEL__ */
>  
>  #define EFSBADCRC	EBADMSG		/* Bad CRC detected */
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 28c5c3abddb3..ea46735e18c6 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -1026,7 +1026,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
>  	struct ext4_dir_entry_2 *de;
>  
>  	err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
> -				inline_size, fname, &de);
> +				inline_size, fname, &de, 0, NULL, 0);
>  	if (err)
>  		return err;
>  
> @@ -1103,7 +1103,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
>  	int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
>  	int new_size = get_max_inline_xattr_value_size(dir, iloc);
>  
> -	if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
> +	if (new_size - old_size <= __EXT4_DIR_REC_LEN(1))
>  		return -ENOSPC;
>  
>  	ret = ext4_update_inline_data(handle, dir,
> @@ -1384,8 +1384,8 @@ int htree_inlinedir_to_tree(struct file *dir_file,
>  			fake.name_len = 1;
>  			strcpy(fake.name, ".");
>  			fake.rec_len = ext4_rec_len_to_disk(
> -						EXT4_DIR_REC_LEN(fake.name_len),
> -						inline_size);
> +					__EXT4_DIR_REC_LEN(fake.name_len),
> +					inline_size);
>  			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
>  			de = &fake;
>  			pos = EXT4_INLINE_DOTDOT_OFFSET;
> @@ -1394,8 +1394,8 @@ int htree_inlinedir_to_tree(struct file *dir_file,
>  			fake.name_len = 2;
>  			strcpy(fake.name, "..");
>  			fake.rec_len = ext4_rec_len_to_disk(
> -						EXT4_DIR_REC_LEN(fake.name_len),
> -						inline_size);
> +					__EXT4_DIR_REC_LEN(fake.name_len),
> +					inline_size);

Unrelated indenting changes...

>  			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
>  			de = &fake;
>  			pos = EXT4_INLINE_DOTDOT_SIZE;
> @@ -1492,8 +1492,8 @@ int ext4_read_inline_dir(struct file *file,
>  	 * So we will use extra_offset and extra_size to indicate them
>  	 * during the inline dir iteration.
>  	 */
> -	dotdot_offset = EXT4_DIR_REC_LEN(1);
> -	dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
> +	dotdot_offset = __EXT4_DIR_REC_LEN(1);
> +	dotdot_size = dotdot_offset + __EXT4_DIR_REC_LEN(2);
>  	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
>  	extra_size = extra_offset + inline_size;
>  
> @@ -1528,7 +1528,7 @@ int ext4_read_inline_dir(struct file *file,
>  			 * failure will be detected in the
>  			 * dirent test below. */
>  			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
> -				< EXT4_DIR_REC_LEN(1))
> +				< __EXT4_DIR_REC_LEN(1))
>  				break;
>  			i += ext4_rec_len_from_disk(de->rec_len,
>  						    extra_size);
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index c1cf020d1889..b09e73100e14 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -249,7 +249,8 @@ static unsigned dx_get_count(struct dx_entry *entries);
>  static unsigned dx_get_limit(struct dx_entry *entries);
>  static void dx_set_count(struct dx_entry *entries, unsigned value);
>  static void dx_set_limit(struct dx_entry *entries, unsigned value);
> -static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
> +static inline unsigned int dx_root_limit(struct inode *dir,
> +		struct ext4_dir_entry_2 *dot_de, unsigned int infosize);
>  static unsigned dx_node_limit(struct inode *dir);
>  static struct dx_frame *dx_probe(struct ext4_filename *fname,
>  				 struct inode *dir,
> @@ -551,10 +552,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
>  	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
>  }
>  
> -static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
> +static inline unsigned int dx_root_limit(struct inode *dir,
> +		struct ext4_dir_entry_2 *dot_de, unsigned int infosize)
>  {
> -	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
> -		EXT4_DIR_REC_LEN(2) - infosize;
> +	struct ext4_dir_entry_2 *dotdot_de;
> +	unsigned int entry_space;
> +
> +	BUG_ON(dot_de->name_len != 1);

Yikes, this will crash the kernel when someone feeds us malicious
metadata!

> +	dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize);
> +	entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(dot_de) -
> +			 EXT4_DIR_REC_LEN(dotdot_de) - infosize;
>  
>  	if (ext4_has_metadata_csum(dir->i_sb))
>  		entry_space -= sizeof(struct dx_tail);
> @@ -563,7 +570,8 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
>  
>  static inline unsigned dx_node_limit(struct inode *dir)
>  {
> -	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
> +	unsigned int entry_space = dir->i_sb->s_blocksize -
> +					__EXT4_DIR_REC_LEN(0);
>  
>  	if (ext4_has_metadata_csum(dir->i_sb))
>  		entry_space -= sizeof(struct dx_tail);
> @@ -675,7 +683,7 @@ static struct stats dx_show_leaf(struct inode *dir,
>  				       (unsigned) ((char *) de - base));
>  #endif
>  			}
> -			space += EXT4_DIR_REC_LEN(de->name_len);
> +			space += EXT4_DIR_REC_LEN(de);
>  			names++;
>  		}
>  		de = ext4_next_entry(de, size);
> @@ -785,10 +793,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
>  				      root->info.info_length);
>  
>  	if (dx_get_limit(entries) != dx_root_limit(dir,
> -						   root->info.info_length)) {
> +				(struct ext4_dir_entry_2 *) frame->bh->b_data,
> +				root->info.info_length)) {
>  		ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
>  				   dx_get_limit(entries),
> -				   dx_root_limit(dir, root->info.info_length));
> +				   dx_root_limit(dir,
> +						 (struct ext4_dir_entry_2 *)
> +						 frame->bh->b_data,
> +						 root->info.info_length));
>  		goto fail;
>  	}
>  
> @@ -980,7 +992,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
>  	de = (struct ext4_dir_entry_2 *) bh->b_data;
>  	top = (struct ext4_dir_entry_2 *) ((char *) de +
>  					   dir->i_sb->s_blocksize -
> -					   EXT4_DIR_REC_LEN(0));
> +					   __EXT4_DIR_REC_LEN(0));
>  #ifdef CONFIG_EXT4_FS_ENCRYPTION
>  	/* Check if the directory is encrypted */
>  	if (ext4_encrypted_inode(dir)) {
> @@ -1563,6 +1575,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
>  	inode = NULL;
>  	if (bh) {
>  		__u32 ino = le32_to_cpu(de->inode);
> +
>  		brelse(bh);
>  		if (!ext4_valid_inum(dir->i_sb, ino)) {
>  			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
> @@ -1631,7 +1644,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
>  	while (count--) {
>  		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
>  						(from + (map->offs<<2));
> -		rec_len = EXT4_DIR_REC_LEN(de->name_len);
> +		rec_len = EXT4_DIR_REC_LEN(de);
>  		memcpy (to, de, rec_len);
>  		((struct ext4_dir_entry_2 *) to)->rec_len =
>  				ext4_rec_len_to_disk(rec_len, blocksize);
> @@ -1655,7 +1668,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
>  	while ((char*)de < base + blocksize) {
>  		next = ext4_next_entry(de, blocksize);
>  		if (de->inode && de->name_len) {
> -			rec_len = EXT4_DIR_REC_LEN(de->name_len);
> +			rec_len = EXT4_DIR_REC_LEN(de);
>  			if (de > to)
>  				memmove(to, de, rec_len);
>  			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
> @@ -1786,10 +1799,13 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
>  		      struct buffer_head *bh,
>  		      void *buf, int buf_size,
>  		      struct ext4_filename *fname,
> -		      struct ext4_dir_entry_2 **dest_de)
> +		      struct ext4_dir_entry_2 **dest_de,
> +		      bool is_dotdot,
> +		      bool *write_short_dotdot,
> +		      unsigned short dotdot_reclen)
>  {
>  	struct ext4_dir_entry_2 *de;
> -	unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
> +	unsigned short reclen = __EXT4_DIR_REC_LEN(fname_len(fname));
>  	int nlen, rlen;
>  	unsigned int offset = 0;
>  	char *top;
> @@ -1802,10 +1818,28 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
>  			return -EFSCORRUPTED;
>  		if (ext4_match(fname, de))
>  			return -EEXIST;
> -		nlen = EXT4_DIR_REC_LEN(de->name_len);
> +		nlen = EXT4_DIR_REC_LEN(de);
>  		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
> +		/* Check first for enough space for the full entry */
>  		if ((de->inode ? rlen - nlen : rlen) >= reclen)
>  			break;
> +		/* Then for dotdot entries, check for the smaller space
> +		 * required for just the entry, no FID
> +		 */
> +		if (is_dotdot) {
> +			if ((de->inode ? rlen - nlen : rlen) >=
> +			    dotdot_reclen) {
> +				*write_short_dotdot = true;
> +				break;
> +			}
> +			/* The new ".." entry must be written over the
> +			 * previous ".." entry, which is the first
> +			 * entry traversed by this scan.  If it doesn't
> +			 * fit, something is badly wrong, so -EIO.
> +			 */
> +			return -EIO;
> +		}
> +
>  		de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
>  		offset += rlen;
>  	}
> @@ -1824,7 +1858,8 @@ void ext4_insert_dentry(struct inode *inode,
>  
>  	int nlen, rlen;
>  
> -	nlen = EXT4_DIR_REC_LEN(de->name_len);
> +	nlen = EXT4_DIR_REC_LEN(de);
> +
>  	rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
>  	if (de->inode) {
>  		struct ext4_dir_entry_2 *de1 =
> @@ -1848,21 +1883,46 @@ void ext4_insert_dentry(struct inode *inode,
>   * space.  It will return -ENOSPC if no space is available, and -EIO
>   * and -EEXIST if directory entry already exists.
>   */
> -static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
> +static int add_dirent_to_buf(handle_t *handle,
> +			     struct dentry *dentry,
> +			     struct ext4_filename *fname,
>  			     struct inode *dir,
>  			     struct inode *inode, struct ext4_dir_entry_2 *de,
>  			     struct buffer_head *bh)
>  {
>  	unsigned int	blocksize = dir->i_sb->s_blocksize;
>  	int		csum_size = 0;
> -	int		err;
> +	unsigned short	reclen, dotdot_reclen = 0;
> +	int		 err, dlen = 0;
> +	bool		is_dotdot = false, write_short_dotdot = false;
> +	unsigned char	*data;
> +	int namelen = dentry->d_name.len;
>  
>  	if (ext4_has_metadata_csum(inode->i_sb))
>  		csum_size = sizeof(struct ext4_dir_entry_tail);
>  
> +	data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *)
> +						dentry->d_fsdata);
> +	if (data)
> +		dlen = (*data) + 1;

Ok, now I /really/ want this to be some kind of data structure instead
of raw dereferencing of an unsigned char pointer to find the length.

struct ext4_dirent_data_header {
	/* length of this header + the whole data blob */
	__u8				ddh_length;
} __packed;

struct ext4_dirent_lufid {
	struct ext4_dirent_data_header	dl_header; /* 6+ */
	__le32				dl_magic; /* 0xAD200907 */
	__u8				dl_datalen;
	__u8				dl_data[0];
} __packed;

struct ext4_dirent_inohi {
	struct ext4_dirent_data_header	di_header; /* 5 */
	__le32				di_inohi;
} __packed;


...and then:

struct ext4_dirent_lufid *dl = ext4_dentry_get_data(...);

if (dl)
	dlen = dl->dl_header.ddh_length + 1;

> +
> +	is_dotdot = (namelen == 2 &&
> +		     memcmp(dentry->d_name.name, "..", 2) == 0);
> +
> +	/* dotdot entries must be in the second place in a directory block,
> +	 * so calculate an alternate length without the dirdata so they can
> +	 * always be made to fit in the existing slot
> +	 */
> +	if (is_dotdot)
> +		dotdot_reclen = __EXT4_DIR_REC_LEN(namelen);
> +
> +	reclen = __EXT4_DIR_REC_LEN(namelen + dlen + 3);
> +
>  	if (!de) {
>  		err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
> -					blocksize - csum_size, fname, &de);
> +					blocksize - csum_size, fname, &de,
> +					is_dotdot,
> +					&write_short_dotdot, dotdot_reclen);
>  		if (err)
>  			return err;
>  	}
> @@ -1876,6 +1936,13 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>  	/* By now the buffer is marked for journaling */
>  	ext4_insert_dentry(inode, de, blocksize, fname);
>  
> +	/* If we're writing short form of "dotdot", don't add data section */
> +	if (data && !write_short_dotdot) {

What if we're writing a long dotdot entry and write_short_dotdot is true?
We're not just dropping the LUFID on the floor, are we?

> +		de->name[namelen] = 0;

Not sure why we suddenly need this extra null byte in the name; we've
gotten along just fine without it.

> +		memcpy(&de->name[namelen + 1], data, *(char *)data);

memcpy(&de->name[namelen + 1], dl, dl->dl_header.ddh_length);

(Endian conversions?)

--D

> +		de->file_type |= EXT4_DIRENT_LUFID;
> +	}
> +
>  	/*
>  	 * XXX shouldn't update any times until successful
>  	 * completion of syscall, but too many callers depend
> @@ -1970,7 +2037,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>  
>  	/* Initialize the root; the dot dirents already exist */
>  	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
> -	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
> +	de->rec_len = ext4_rec_len_to_disk(blocksize - __EXT4_DIR_REC_LEN(2),
>  					   blocksize);
>  	memset (&root->info, 0, sizeof(root->info));
>  	root->info.info_length = sizeof(root->info);
> @@ -1978,7 +2045,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>  	entries = root->entries;
>  	dx_set_block(entries, 1);
>  	dx_set_count(entries, 1);
> -	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
> +	dx_set_limit(entries, dx_root_limit(dir,
> +					 fde, sizeof(root->info)));
>  
>  	/* Initialize as for dx_probe */
>  	fname->hinfo.hash_version = root->info.hash_version;
> @@ -2006,7 +2074,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>  		goto out_frames;
>  	}
>  
> -	retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
> +	retval = add_dirent_to_buf(handle, NULL, fname, dir, inode, de, bh2);
>  out_frames:
>  	/*
>  	 * Even if the block split failed, we have to properly write
> @@ -2083,7 +2151,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>  			bh = NULL;
>  			goto out;
>  		}
> -		retval = add_dirent_to_buf(handle, &fname, dir, inode,
> +		retval = add_dirent_to_buf(handle, dentry, &fname, dir, inode,
>  					   NULL, bh);
>  		if (retval != -ENOSPC)
>  			goto out;
> @@ -2112,7 +2180,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>  		initialize_dirent_tail(t, blocksize);
>  	}
>  
> -	retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
> +	retval = add_dirent_to_buf(handle, dentry, &fname, dir, inode, de, bh);
>  out:
>  	ext4_fname_free_filename(&fname);
>  	brelse(bh);
> @@ -2154,7 +2222,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>  	if (err)
>  		goto journal_error;
>  
> -	err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
> +	err = add_dirent_to_buf(handle, NULL, fname, dir, inode, NULL, bh);
>  	if (err != -ENOSPC)
>  		goto cleanup;
>  
> @@ -2279,7 +2347,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>  		err = PTR_ERR(de);
>  		goto cleanup;
>  	}
> -	err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
> +	err = add_dirent_to_buf(handle, NULL, fname, dir, inode, de, bh);
>  	goto cleanup;
>  
>  journal_error:
> @@ -2545,7 +2613,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
>  {
>  	de->inode = cpu_to_le32(inode->i_ino);
>  	de->name_len = 1;
> -	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
> +	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de),
>  					   blocksize);
>  	strcpy(de->name, ".");
>  	ext4_set_de_type(inode->i_sb, de, S_IFDIR);
> @@ -2555,11 +2623,11 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
>  	de->name_len = 2;
>  	if (!dotdot_real_len)
>  		de->rec_len = ext4_rec_len_to_disk(blocksize -
> -					(csum_size + EXT4_DIR_REC_LEN(1)),
> +					(csum_size + __EXT4_DIR_REC_LEN(1)),
>  					blocksize);
>  	else
>  		de->rec_len = ext4_rec_len_to_disk(
> -				EXT4_DIR_REC_LEN(de->name_len), blocksize);
> +				EXT4_DIR_REC_LEN(de), blocksize);
>  	strcpy(de->name, "..");
>  	ext4_set_de_type(inode->i_sb, de, S_IFDIR);
>  
> @@ -2688,7 +2756,7 @@ bool ext4_empty_dir(struct inode *inode)
>  	}
>  
>  	sb = inode->i_sb;
> -	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
> +	if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2)) {
>  		EXT4_ERROR_INODE(inode, "invalid size");
>  		return true;
>  	}
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b0915b734a38..ead9406d9cff 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1339,7 +1339,7 @@ enum {
>  	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
>  	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
>  	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
> -	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
> +	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata,
>  	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
>  	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
>  	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
> @@ -1400,6 +1400,7 @@ static const match_table_t tokens = {
>  	{Opt_noquota, "noquota"},
>  	{Opt_quota, "quota"},
>  	{Opt_usrquota, "usrquota"},
> +	{Opt_dirdata, "dirdata"},
>  	{Opt_prjquota, "prjquota"},
>  	{Opt_barrier, "barrier=%u"},
>  	{Opt_barrier, "barrier"},
> -- 
> 2.13.5 (Apple Git-94)
> 

  reply	other threads:[~2017-11-07 18:53 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-11-01 21:24 [RFC PATCH v2 0/2] 64 bit inode counter support Artem Blagodarenko
2017-11-01 21:24 ` [RFC PATCH v2 1/2] ext4: dirdata feature Artem Blagodarenko
2017-11-07 18:53   ` Darrick J. Wong [this message]
2017-11-08  5:40     ` Andreas Dilger
2017-11-01 21:24 ` [RFC PATCH v2 2/2] ext4: Add 64-bit inode number support Artem Blagodarenko
2017-11-07 19:04   ` Darrick J. Wong
2017-11-08  5:51     ` Andreas Dilger
2017-11-07 23:37   ` Andreas Dilger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171107185333.GA6233@magnolia \
    --to=darrick.wong@oracle.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=andreas.dilger@intel.com \
    --cc=artem.blagodarenko@gmail.com \
    --cc=linux-ext4@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.