All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] fat (exportfs): fix NFS file handle decode
@ 2012-07-03 19:09 Steven J. Magnani
  2012-07-03 19:09 ` [PATCH 1/2] fat (exportfs): drop ineffective get_parent code Steven J. Magnani
  2012-07-03 19:09 ` [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries Steven J. Magnani
  0 siblings, 2 replies; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-03 19:09 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

When the system evicts inodes and dentries under memory pressure, the FAT
driver is unable to map NFS file handles back to the objects they reference.
In many cases this causes client operations to fail with ENOENT. This is
partially due to ineffectiveness of the current FAT NFS implementation,
and partially due to export_operations that have not yet been implemented for
FAT. For example, the lack of a fh_to_parent method can cause file accesses to
fail on shares exported with subtree_check.

This series depends on the following patches:
* fat: Fix non-atomic NFS i_pos read
* fat: Accessors for msdos_dir_entry 'start' fields
* fat: Refactor shortname parsing


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/2] fat (exportfs): drop ineffective get_parent code
  2012-07-03 19:09 [PATCH 0/2] fat (exportfs): fix NFS file handle decode Steven J. Magnani
@ 2012-07-03 19:09 ` Steven J. Magnani
  2012-07-04 10:30   ` OGAWA Hirofumi
  2012-07-03 19:09 ` [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries Steven J. Magnani
  1 sibling, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-03 19:09 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel, Steven J. Magnani

This patch reorganizes existing code for reuse by the next patch in the series.

The only functional change is to remove ineffective code from 
fat_get_dotdot_entry() and limit fat_get_parent() to immediate children of the
root (the only disconnected directory case that can be handled without a 
search). The limitation will be removed in the next patch in the series.

These changes are a slight improvement over the current situation, in that they
allow reconnection of immediate subdirectories of the root, and reporting of
ESTALE for other cases.

The reason the i_pos code in fat_get_dotdot_entry() is ineffective is
because the value is the on-disk position of a ".." directory entry.
What is needed in order to map NFS file handles back to objects is the on-disk
position of the named entry to which ".." refers.

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
---
diff -uprN a/fs/fat/dir.c b/fs/fat/dir.c
--- a/fs/fat/dir.c	2012-07-03 07:21:58.918610460 -0500
+++ b/fs/fat/dir.c	2012-07-03 10:07:58.315967863 -0500
@@ -40,14 +40,6 @@ static inline unsigned char fat_tolower(
 	return ((c >= 'A') && (c <= 'Z')) ? c+32 : c;
 }
 
-static inline loff_t fat_make_i_pos(struct super_block *sb,
-				    struct buffer_head *bh,
-				    struct msdos_dir_entry *de)
-{
-	return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits)
-		| (de - (struct msdos_dir_entry *)bh->b_data);
-}
-
 static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
 				     sector_t phys)
 {
@@ -876,17 +868,14 @@ static int fat_get_short_entry(struct in
  * for inode. So, this function provide the some informations only.
  */
 int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
-			 struct msdos_dir_entry **de, loff_t *i_pos)
+			 struct msdos_dir_entry **de)
 {
-	loff_t offset;
+	loff_t offset = 0;
 
-	offset = 0;
-	*bh = NULL;
+	*de = NULL;
 	while (fat_get_short_entry(dir, &offset, bh, de) >= 0) {
-		if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) {
-			*i_pos = fat_make_i_pos(dir->i_sb, *bh, *de);
+		if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME))
 			return 0;
-		}
 	}
 	return -ENOENT;
 }
diff -uprN a/fs/fat/fat.h b/fs/fat/fat.h
--- a/fs/fat/fat.h	2012-07-03 07:21:53.723637293 -0500
+++ b/fs/fat/fat.h	2012-07-03 10:07:58.325967754 -0500
@@ -169,6 +169,15 @@ static inline umode_t fat_make_mode(stru
 		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
 }
 
+/* Construct i_pos from on-disk position of directory entry */
+static inline loff_t fat_make_i_pos(struct super_block *sb,
+				    struct buffer_head *bh,
+				    struct msdos_dir_entry *de)
+{
+	return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits)
+		| (de - (struct msdos_dir_entry *)bh->b_data);
+}
+
 /* Return the FAT attribute byte for this inode */
 static inline u8 fat_make_attrs(struct inode *inode)
 {
@@ -262,7 +271,7 @@ extern int fat_subdirs(struct inode *dir
 extern int fat_scan(struct inode *dir, const unsigned char *name,
 		    struct fat_slot_info *sinfo);
 extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
-				struct msdos_dir_entry **de, loff_t *i_pos);
+				struct msdos_dir_entry **de);
 extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
 extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 			   struct fat_slot_info *sinfo);
diff -uprN a/fs/fat/inode.c b/fs/fat/inode.c
--- a/fs/fat/inode.c	2012-07-03 07:21:53.733637243 -0500
+++ b/fs/fat/inode.c	2012-07-03 10:07:58.335967643 -0500
@@ -411,31 +411,48 @@ static int fat_fill_inode(struct inode *
 	return 0;
 }
 
+/**
+ * Create inode from specified directory entry.
+ * Do not call this directly unless intentionally bypassing the FAT dir cache.
+ */
+static struct inode *fat_build_unhashed_inode(struct super_block *sb,
+			struct msdos_dir_entry *de)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode) {
+		int err;
+
+		inode->i_ino = iunique(sb, MSDOS_ROOT_INO);
+		inode->i_version = 1;
+		err = fat_fill_inode(inode, de);
+		if (err) {
+			iput(inode);
+			inode = ERR_PTR(err);
+		}
+	} else
+		inode = ERR_PTR(-ENOMEM);
+
+	return inode;
+}
+
+/**
+ * Find or create FAT dir cache inode for i_pos.
+ */
 struct inode *fat_build_inode(struct super_block *sb,
 			struct msdos_dir_entry *de, loff_t i_pos)
 {
 	struct inode *inode;
-	int err;
 
 	inode = fat_iget(sb, i_pos);
-	if (inode)
-		goto out;
-	inode = new_inode(sb);
 	if (!inode) {
-		inode = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	inode->i_ino = iunique(sb, MSDOS_ROOT_INO);
-	inode->i_version = 1;
-	err = fat_fill_inode(inode, de);
-	if (err) {
-		iput(inode);
-		inode = ERR_PTR(err);
-		goto out;
+		inode = fat_build_unhashed_inode(sb, de);
+		if (!IS_ERR(inode)) {
+			fat_attach(inode, i_pos);
+			insert_inode_hash(inode);
+		}
 	}
-	fat_attach(inode, i_pos);
-	insert_inode_hash(inode);
-out:
 	return inode;
 }
 
@@ -677,25 +694,59 @@ static const struct super_operations fat
  * of i_logstart is used to store the directory entry offset.
  */
 
+static int
+fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
+{
+	int len = *lenp;
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	loff_t i_pos;
+
+	if (len < 5) {
+		*lenp = 5;
+		return 255; /* no room */
+	}
+
+	i_pos = fat_i_pos_read(sbi, inode);
+	*lenp = 5;
+	fh[0] = inode->i_ino;
+	fh[1] = inode->i_generation;
+	fh[2] = i_pos >> 8;
+	fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart;
+	fh[4] = (i_pos & 0x0f) << 28;
+	if (parent)
+		fh[4] |= MSDOS_I(parent)->i_logstart;
+	return 3;
+}
+
+static int fat_is_valid_fh(int fh_len, int fh_type)
+{
+	return ((fh_len >= 5) && (fh_type == 3));
+}
+
 static struct dentry *fat_fh_to_dentry(struct super_block *sb,
 		struct fid *fid, int fh_len, int fh_type)
 {
 	struct inode *inode = NULL;
 	u32 *fh = fid->raw;
+	loff_t i_pos;
+	unsigned long i_ino;
+	__u32 i_generation;
+	int i_logstart;
 
-	if (fh_len < 5 || fh_type != 3)
+	if (!fat_is_valid_fh(fh_len, fh_type))
 		return NULL;
 
-	inode = ilookup(sb, fh[0]);
-	if (!inode || inode->i_generation != fh[1]) {
+	i_ino = fh[0];
+	i_generation = fh[1];
+	i_logstart = fh[3] & 0x0fffffff;
+
+	inode = ilookup(sb, i_ino);
+	if (!inode || inode->i_generation != i_generation) {
 		if (inode)
 			iput(inode);
 		inode = NULL;
 	}
 	if (!inode) {
-		loff_t i_pos;
-		int i_logstart = fh[3] & 0x0fffffff;
-
 		i_pos = (loff_t)fh[2] << 8;
 		i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28);
 
@@ -728,52 +779,28 @@ static struct dentry *fat_fh_to_dentry(s
 	return d_obtain_alias(inode);
 }
 
-static int
-fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
-{
-	int len = *lenp;
-	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-	loff_t i_pos;
-
-	if (len < 5) {
-		*lenp = 5;
-		return 255; /* no room */
-	}
-
-	i_pos = fat_i_pos_read(sbi, inode);
-	*lenp = 5;
-	fh[0] = inode->i_ino;
-	fh[1] = inode->i_generation;
-	fh[2] = i_pos >> 8;
-	fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart;
-	fh[4] = (i_pos & 0x0f) << 28;
-	if (parent)
-		fh[4] |= MSDOS_I(parent)->i_logstart;
-	return 3;
-}
-
-static struct dentry *fat_get_parent(struct dentry *child)
+/*
+ * NFS support:  Find the parent for a disconnected directory.
+ */
+static struct dentry *fat_get_parent(struct dentry *child_dir)
 {
-	struct super_block *sb = child->d_sb;
-	struct buffer_head *bh;
+	struct super_block *sb = child_dir->d_sb;
+	struct buffer_head *bh = NULL;
 	struct msdos_dir_entry *de;
-	loff_t i_pos;
-	struct dentry *parent;
-	struct inode *inode;
-	int err;
+	struct dentry *parent = ERR_PTR(-ESTALE);
+	int parent_logstart;
 
 	lock_super(sb);
 
-	err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
-	if (err) {
-		parent = ERR_PTR(err);
+	if (fat_get_dotdot_entry(child_dir->d_inode, &bh, &de))
 		goto out;
-	}
-	inode = fat_build_inode(sb, de, i_pos);
-	brelse(bh);
 
-	parent = d_obtain_alias(inode);
+	parent_logstart = fat_get_start(MSDOS_SB(sb), de);
+	if (!parent_logstart)
+		parent = sb->s_root;
+
 out:
+	brelse(bh);
 	unlock_super(sb);
 
 	return parent;
diff -uprN a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
--- a/fs/fat/namei_msdos.c	2012-07-03 07:21:53.750637154 -0500
+++ b/fs/fat/namei_msdos.c	2012-07-03 10:07:58.351967464 -0500
@@ -440,7 +440,7 @@ static int do_msdos_rename(struct inode
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
+	loff_t new_i_pos;
 	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
 
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -456,8 +456,8 @@ static int do_msdos_rename(struct inode
 	is_dir = S_ISDIR(old_inode->i_mode);
 	update_dotdot = (is_dir && old_dir != new_dir);
 	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
+		err = fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de);
+		if (err < 0) {
 			err = -EIO;
 			goto out;
 		}
diff -uprN a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
--- a/fs/fat/namei_vfat.c	2012-07-03 07:21:53.758637114 -0500
+++ b/fs/fat/namei_vfat.c	2012-07-03 10:07:58.359967376 -0500
@@ -914,7 +914,7 @@ static int vfat_rename(struct inode *old
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
+	loff_t new_i_pos;
 	int err, is_dir, update_dotdot, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
@@ -929,8 +929,8 @@ static int vfat_rename(struct inode *old
 	is_dir = S_ISDIR(old_inode->i_mode);
 	update_dotdot = (is_dir && old_dir != new_dir);
 	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
+		err = fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de);
+		if (err < 0) {
 			err = -EIO;
 			goto out;
 		}


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-03 19:09 [PATCH 0/2] fat (exportfs): fix NFS file handle decode Steven J. Magnani
  2012-07-03 19:09 ` [PATCH 1/2] fat (exportfs): drop ineffective get_parent code Steven J. Magnani
@ 2012-07-03 19:09 ` Steven J. Magnani
  2012-07-04 11:07   ` OGAWA Hirofumi
  1 sibling, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-03 19:09 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel, Steven J. Magnani

This patch adds code to support reconstruction of evicted inodes.
We walk the on-disk structures where necessary to fill in information
not available in the NFS file handle.

One important point is that when reconstructing an inode, in order to avoid the
*client* declaring ESTALE we have to ensure that the NFS file handle of the
reconstruction is identical to that of the original. 

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
---
diff -uprN a/fs/fat/dir.c b/fs/fat/dir.c
--- a/fs/fat/dir.c	2012-07-03 08:38:17.077404182 -0500
+++ b/fs/fat/dir.c	12:57:02.506715436 -0500
@@ -532,6 +532,194 @@ end_of_dir:
 
 EXPORT_SYMBOL_GPL(fat_search_long);
 
+/**
+ * Fetch the name associated with the specified location (i_pos or i_logstart)
+ * and parent dir.
+ *
+ * On entry, the superblock is assumed locked.
+ *
+ * Returns 0 on success, -ENOENT if child cannot be found,
+ * -ENOMEM on malloc failure
+ */
+static int fat_name_for_loc(struct inode *parent, char *name, int name_size,
+			    loff_t child_loc, int is_logstart)
+{
+	struct super_block *sb = parent->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int isvfat = sbi->options.isvfat;
+	struct buffer_head *bh = NULL;
+	wchar_t *unicode = NULL;
+	struct msdos_dir_entry *de;
+	loff_t cpos = 0;
+	int err = -ENOENT;
+	loff_t cur_loc;
+
+	if (!is_logstart && !child_loc) {
+		fat_msg(sb, KERN_WARNING, "%s: i_pos == 0", __func__);
+		return -ENOENT;
+	}
+
+	while (1) {
+		unsigned char nr_slots = 0;
+
+		if (fat_get_entry(parent, &cpos, &bh, &de))
+			break;
+
+parse_record:
+		if (isvfat) {
+			if (de->name[0] == DELETED_FLAG)
+				continue;
+			if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
+				continue;
+			if (de->attr != ATTR_EXT && IS_FREE(de->name))
+				continue;
+		} else {
+			/* Subtle point: also skips over extended entries */
+			if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name))
+				continue;
+		}
+
+		if (de->attr == ATTR_EXT) {
+			int status = fat_parse_long(parent, &cpos, &bh, &de,
+						    &unicode, &nr_slots);
+			if (status < 0) {
+				err = status;
+				break;
+			} else if (status == PARSE_INVALID)
+				continue;
+			else if (status == PARSE_NOT_LONGNAME) {
+				nr_slots = 0;
+				goto parse_record;
+			} else if (status == PARSE_EOF)
+				break;
+
+			/* At this point, we
+			 * (a) have a long name, and
+			 * (2) are hopefully gazing at the matching shortname
+			 */
+			goto parse_record;
+		}
+
+		/* Here, we have a shortname entry */
+
+		cur_loc = is_logstart ? fat_get_start(sbi, de)
+				      : fat_make_i_pos(sb, bh, de);
+
+		if (cur_loc == child_loc) {
+			if (nr_slots) {
+				fat_uni_to_x8(sb, unicode, name, name_size);
+				err = 0;
+			} else {
+				int short_len;
+				if (name_size < FAT_MAX_SHORT_SIZE) {
+					err = -ENOMEM;
+					break;
+				}
+
+				short_len = fat_parse_short(sb, de, name,
+							sbi->options.dotsOK);
+				if (short_len) {
+					name[short_len] = '\0';
+					err = 0;
+				}
+			}
+			break;
+		}
+	}
+
+	brelse(bh);
+
+	if (unicode)
+		__putname(unicode);
+
+	return err;
+}
+
+static int fat_name_for_ipos(struct inode *parent, char *name,
+			     int name_size, loff_t child_ipos)
+{
+	return fat_name_for_loc(parent, name, name_size, child_ipos, 0);
+}
+
+static int fat_name_for_logstart(struct inode *parent, char *name,
+				 int name_size, loff_t child_logstart)
+{
+	return fat_name_for_loc(parent, name, name_size, child_logstart, 1);
+}
+
+/**
+ * NFS helper: retrieve the name of an anonymous (disconnected) child using
+ * its i_pos or i_logstart and knowledge of its parent
+ *
+ * Returns 0 on success, -ENOENT if child cannot be found,
+ * -ENOMEM on malloc failure
+ */
+int fat_get_name(struct dentry *parent, char *name,
+		 struct dentry *child)
+{
+	struct super_block *sb = parent->d_inode->i_sb;
+
+	loff_t child_loc = MSDOS_I(child->d_inode)->i_pos;
+	int err;
+
+	lock_super(sb);
+
+	if (child_loc) {
+		err = fat_name_for_ipos(parent->d_inode, name, NAME_MAX+1,
+					child_loc);
+	} else {
+		child_loc = MSDOS_I(child->d_inode)->i_logstart;
+		err = fat_name_for_logstart(parent->d_inode, name, NAME_MAX+1,
+					    child_loc);
+	}
+
+	unlock_super(sb);
+	return err;
+}
+
+/**
+ * Find the directory entry that specifies a particular location
+ * (start cluster or i_pos).
+ * On entry, the superblock is assumed locked.
+ * The caller is responsible for releasing the buffer_head.
+ */
+int fat_lookup_loc(struct inode *parent, loff_t loc,
+		   struct buffer_head **bh, struct msdos_dir_entry **de,
+		   int is_logstart)
+{
+	struct super_block *sb = parent->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	loff_t cpos = 0;
+	int err = -ENOENT;
+
+	*de = NULL;	/* Force scan from beginning of directory */
+
+	while (1) {
+		if (fat_get_entry(parent, &cpos, bh, de))
+			break;
+
+		if (IS_FREE((*de)->name))
+			continue;
+
+		if ((*de)->attr == ATTR_EXT)
+			continue;
+
+		if (is_logstart) {
+			if (fat_get_start(sbi, *de) == (int) loc) {
+				err = 0;
+				break;
+			}
+		} else {
+			if (fat_make_i_pos(sb, *bh, *de) == loc) {
+				err = 0;
+				break;
+			}
+		}
+	}
+
+	return err;
+}
+
 struct fat_ioctl_filldir_callback {
 	void __user *dirent;
 	int result;
diff -uprN a/fs/fat/fat.h b/fs/fat/fat.h
--- a/fs/fat/fat.h	2012-07-03 08:37:51.890543883 -0500
+++ b/fs/fat/fat.h	2012-07-03 12:56:58.154737489 -0500
@@ -276,6 +276,25 @@ extern int fat_alloc_new_dir(struct inod
 extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 			   struct fat_slot_info *sinfo);
 extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+extern int fat_get_name(struct dentry *parent, char *name,
+			struct dentry *child);
+extern int fat_lookup_loc(struct inode *parent, loff_t loc,
+			       struct buffer_head **bh,
+			       struct msdos_dir_entry **de, int is_logstart);
+
+static inline int fat_lookup_logstart(struct inode *parent, int i_logstart,
+				      struct buffer_head **bh,
+				      struct msdos_dir_entry **de)
+{
+	return fat_lookup_loc(parent, (loff_t)i_logstart, bh, de, 1);
+}
+
+static inline int fat_lookup_ipos(struct inode *parent, loff_t i_pos,
+				  struct buffer_head **bh,
+				  struct msdos_dir_entry **de)
+{
+	return fat_lookup_loc(parent, i_pos, bh, de, 0);
+}
 
 /* fat/fatent.c */
 struct fat_entry {
diff -uprN a/fs/fat/inode.c b/fs/fat/inode.c
--- a/fs/fat/inode.c	2012-07-03 10:07:37.494122152 -0500
+++ b/fs/fat/inode.c	2012-07-03 12:56:51.242772517 -0500
@@ -723,6 +723,67 @@ static int fat_is_valid_fh(int fh_len, i
 	return ((fh_len >= 5) && (fh_type == 3));
 }
 
+/**
+ * NFS helper: try to rebuild an inode that has been evicted from the caches,
+ * using some of the original information.
+ *
+ * It is important that the rebuilt inode have the same NFS file handle
+ * (signature) as the evicted one, otherwise NFS clients will detect the
+ * mismatch and report ESTALE.
+ *
+ * NOTE: This function must NOT be called to reconstitute a cached inode.
+ *
+ * On entry, the superblock is assumed locked.
+ */
+static struct inode *fat_reconstitute_inode(struct inode *parent,
+					    unsigned long i_ino, loff_t i_pos,
+					    int i_logstart, __u32 i_generation)
+{
+	struct super_block   *sb  = parent->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct msdos_dir_entry *de;
+	struct inode *inode = NULL;
+	int found_logstart;
+	int err;
+
+	/* Find the directory entry (on-disk inode)
+	 * NOTE: must use i_pos here in case we have been called for a
+	 * zero-length file, since all zero-length files have logstart == 0
+	 */
+	if (fat_lookup_ipos(parent, i_pos, &bh, &de))
+		goto out;
+
+	found_logstart = fat_get_start(sbi, de);
+	if (found_logstart != i_logstart)
+		goto out;
+
+	/* Now do the reconstruction */
+	inode = new_inode(sb);
+	if (!inode)
+		goto out;
+
+	inode->i_ino = i_ino;
+	inode->i_version = 1;
+	err = fat_fill_inode(inode, de);
+	if (err || ((inode->i_generation ^ i_generation) & 1)) {
+		/* Error calculating directory size,
+		 * or found a file where we expected a directory,
+		 * or found a directory where we expected a file
+		 */
+		iput(inode);
+		inode = NULL;
+		goto out;
+	}
+	inode->i_generation = i_generation;
+	fat_attach(inode, i_pos);
+	insert_inode_hash(inode);
+out:
+	brelse(bh);
+
+	return inode;
+}
+
 static struct dentry *fat_fh_to_dentry(struct super_block *sb,
 		struct fid *fid, int fh_len, int fh_type)
 {
@@ -732,6 +793,7 @@ static struct dentry *fat_fh_to_dentry(s
 	unsigned long i_ino;
 	__u32 i_generation;
 	int i_logstart;
+	int cache_hit;
 
 	if (!fat_is_valid_fh(fh_len, fh_type))
 		return NULL;
@@ -741,6 +803,7 @@ static struct dentry *fat_fh_to_dentry(s
 	i_logstart = fh[3] & 0x0fffffff;
 
 	inode = ilookup(sb, i_ino);
+	cache_hit = !!inode;
 	if (!inode || inode->i_generation != i_generation) {
 		if (inode)
 			iput(inode);
@@ -756,30 +819,152 @@ static struct dentry *fat_fh_to_dentry(s
 		 */
 
 		inode = fat_iget(sb, i_pos);
+		cache_hit |= !!inode;
 		if (inode && MSDOS_I(inode)->i_logstart != i_logstart) {
 			iput(inode);
 			inode = NULL;
 		}
 	}
 
-	/*
-	 * For now, do nothing if the inode is not found.
-	 *
-	 * What we could do is:
-	 *
-	 *	- follow the file starting at fh[4], and record the ".." entry,
-	 *	  and the name of the fh[2] entry.
-	 *	- then follow the ".." file finding the next step up.
-	 *
-	 * This way we build a path to the root of the tree. If this works, we
-	 * lookup the path and so get this inode into the cache.  Finally try
-	 * the fat_iget lookup again.  If that fails, then we are totally out
-	 * of luck.  But all that is for another day
-	 */
+	if (!inode && !cache_hit) {
+		/* Last chance:
+		 * Try to reconstitute the inode using the information
+		 * available in the file handle.
+		 */
+		struct msdos_dir_entry parent_de;
+		struct inode *parent;
+		int parent_logstart = fh[4] & 0x0fffffff;
+
+		memset(&parent_de, 0, sizeof(parent_de));
+		parent_de.name[0] = 'X';  /* Anything to make it !IS_FREE() */
+		parent_de.attr    = ATTR_DIR;
+		fat_set_start(&parent_de, parent_logstart);
+
+		lock_super(sb);
+
+		parent = fat_build_unhashed_inode(sb, &parent_de);
+		if (IS_ERR(parent))
+			inode = parent;		/* Relay the error code */
+		else {
+			inode = fat_reconstitute_inode(parent, i_ino, i_pos,
+						       i_logstart,
+						       i_generation);
+			iput(parent);
+		}
+
+		unlock_super(sb);
+	}
 	return d_obtain_alias(inode);
 }
 
 /*
+ * NFS support: try to get the dentry for a directory.
+ *
+ * @sb:		superblock
+ * @de:		on-disk directory entry of interest
+ * @logstart:	Start cluster of the directory
+ * @bh:		Buffer space
+ */
+struct dentry *fat_lookup_dir(struct super_block *sb,
+			      struct msdos_dir_entry *de,
+			      int logstart, struct buffer_head **bh)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct inode *root_inode = sb->s_root->d_inode;
+	struct inode *temp_inode, *inode, *parent = NULL;
+	struct dentry *dir = NULL;
+	struct msdos_dir_entry *parent_de;
+	int parent_logstart;
+	loff_t i_pos;
+	int err;
+
+	temp_inode = fat_build_unhashed_inode(sb, de);
+	if (IS_ERR(temp_inode)) {
+		err = PTR_ERR(temp_inode);
+		temp_inode = NULL;
+		goto out;
+	}
+
+	err = fat_get_dotdot_entry(temp_inode, bh, &parent_de);
+	if (err)
+		goto out;
+
+	parent_logstart = fat_get_start(sbi, parent_de);
+	if (!parent_logstart)
+		parent = root_inode;
+	else {
+		parent = fat_build_unhashed_inode(sb, parent_de);
+		if (IS_ERR(parent)) {
+			err = PTR_ERR(parent);
+			parent = NULL;
+			goto out;
+		}
+	}
+
+	err = fat_lookup_logstart(parent, logstart, bh, &parent_de);
+	if (err)
+		goto out;
+
+	/* Only a cached inode will do.
+	 * There is no point in building an inode from scratch because it will
+	 * have a different NFS file handle than the last one reported to the
+	 * client. Clients don't like that and fail operations with ESTALE.
+	 */
+	i_pos = fat_make_i_pos(sb, *bh, parent_de);
+	inode = fat_iget(sb, i_pos);
+
+	dir = d_obtain_alias(inode);
+	if (!IS_ERR(dir))
+		dir->d_op = sb->s_root->d_op;
+
+out:
+	if (parent != root_inode)
+		iput(parent);
+	iput(temp_inode);
+
+	if (err) {
+		if (err == -ENOENT)
+			err = -ESTALE;
+		return ERR_PTR(err);
+	} else
+		return dir;
+}
+
+/*
+ * NFS support: Find the parent for a file specified by NFS handle.
+ */
+struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+				int fh_len, int fh_type)
+{
+	u32 *fh = fid->raw;
+	struct msdos_dir_entry parent_de;
+	struct buffer_head *bh = NULL;
+	struct dentry *parent = NULL;
+	int parent_logstart;
+
+	if (!fat_is_valid_fh(fh_len, fh_type))
+		return NULL;
+
+	parent_logstart = fh[4] & 0x0fffffff;
+	if (!parent_logstart)
+		return sb->s_root;
+
+	memset(&parent_de, 0, sizeof(parent_de));
+	parent_de.name[0] = 'X';  /* Anything to make it !IS_FREE() */
+	parent_de.attr    = ATTR_DIR;
+	fat_set_start(&parent_de, parent_logstart);
+
+	lock_super(sb);
+
+	parent = fat_lookup_dir(sb, &parent_de, parent_logstart, &bh);
+	brelse(bh);
+
+	unlock_super(sb);
+
+	return parent;
+}
+
+/*
  * NFS support:  Find the parent for a disconnected directory.
  */
 static struct dentry *fat_get_parent(struct dentry *child_dir)
@@ -798,7 +983,8 @@ static struct dentry *fat_get_parent(str
 	parent_logstart = fat_get_start(MSDOS_SB(sb), de);
 	if (!parent_logstart)
 		parent = sb->s_root;
-
+	else
+		parent = fat_lookup_dir(sb, de, parent_logstart, &bh);
 out:
 	brelse(bh);
 	unlock_super(sb);
@@ -809,7 +995,9 @@ out:
 static const struct export_operations fat_export_ops = {
 	.encode_fh	= fat_encode_fh,
 	.fh_to_dentry	= fat_fh_to_dentry,
+	.fh_to_parent	= fat_fh_to_parent,
 	.get_parent	= fat_get_parent,
+	.get_name	= fat_get_name
 };
 
 static int fat_show_options(struct seq_file *m, struct dentry *root)


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] fat (exportfs): drop ineffective get_parent code
  2012-07-03 19:09 ` [PATCH 1/2] fat (exportfs): drop ineffective get_parent code Steven J. Magnani
@ 2012-07-04 10:30   ` OGAWA Hirofumi
  0 siblings, 0 replies; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-04 10:30 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

> This patch reorganizes existing code for reuse by the next patch in the series.

Please separate cleanup and logic change. This patch seems to be almost
all changes are cleanup.

Maybe it would be the cleanup and nfs improvement patches.

> The only functional change is to remove ineffective code from 
> fat_get_dotdot_entry() and limit fat_get_parent() to immediate children of the
> root (the only disconnected directory case that can be handled without a 
> search). The limitation will be removed in the next patch in the series.
>
> These changes are a slight improvement over the current situation, in that they
> allow reconnection of immediate subdirectories of the root, and reporting of
> ESTALE for other cases.

I can't see why we need the limited get_parent(). I guess, it is better
to just do in next patch at once.

> The reason the i_pos code in fat_get_dotdot_entry() is ineffective is
> because the value is the on-disk position of a ".." directory entry.
> What is needed in order to map NFS file handles back to objects is the on-disk
> position of the named entry to which ".." refers.

[...]

> +/**
> + * Create inode from specified directory entry.
> + * Do not call this directly unless intentionally bypassing the FAT dir cache.
> + */
> +static struct inode *fat_build_unhashed_inode(struct super_block *sb,
> +			struct msdos_dir_entry *de)

fat_new_inode() would be better name.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-03 19:09 ` [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries Steven J. Magnani
@ 2012-07-04 11:07   ` OGAWA Hirofumi
  2012-07-04 18:03     ` Steve Magnani
  2012-07-06 20:33     ` Steven J. Magnani
  0 siblings, 2 replies; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-04 11:07 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

> This patch adds code to support reconstruction of evicted inodes.
> We walk the on-disk structures where necessary to fill in information
> not available in the NFS file handle.
>
> One important point is that when reconstructing an inode, in order to avoid the
> *client* declaring ESTALE we have to ensure that the NFS file handle of the
> reconstruction is identical to that of the original. 

Sorry, I didn't review fully yet though.

We would like to add the comment of [0/2] explanation here. Here is
missing the explanation of the problem.

And the codes of nfs support became larger, and I think we should give
the place for it. I.e. we would like to add new file (maybe, export.c,
nfs.c, or something), move nfs code into it.

With it, we don't need to add "NFS ..." annotation in the comment, and
it would make more clear.

And can you add explanation the test of this? What were tested?

> +/**
> + * NFS helper: retrieve the name of an anonymous (disconnected) child using
> + * its i_pos or i_logstart and knowledge of its parent
> + *
> + * Returns 0 on success, -ENOENT if child cannot be found,
> + * -ENOMEM on malloc failure
> + */
> +int fat_get_name(struct dentry *parent, char *name,
> +		 struct dentry *child)
> +{
> +	struct super_block *sb = parent->d_inode->i_sb;
> +
> +	loff_t child_loc = MSDOS_I(child->d_inode)->i_pos;
> +	int err;
> +
> +	lock_super(sb);
> +
> +	if (child_loc) {
> +		err = fat_name_for_ipos(parent->d_inode, name, NAME_MAX+1,
> +					child_loc);
> +	} else {
> +		child_loc = MSDOS_I(child->d_inode)->i_logstart;
> +		err = fat_name_for_logstart(parent->d_inode, name, NAME_MAX+1,
> +					    child_loc);
> +	}
> +
> +	unlock_super(sb);
> +	return err;
> +}

Please don't add new lock_super() usage if it is not necessary. Almost
all of lock_super() just replaced lock_kernel() usage. It rather should
be removed in future.  Probably, this should use inode->i_mutex instead.

BTW, the above issue is same with all of directory read.

And although this is using i_pos, is there no possibility to be passed
the detached inode (i.e. open but unlinked inode, i_pos == 0)?

What happens in the case of detached inode?

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-04 11:07   ` OGAWA Hirofumi
@ 2012-07-04 18:03     ` Steve Magnani
  2012-07-05  3:59       ` OGAWA Hirofumi
  2012-07-06 20:33     ` Steven J. Magnani
  1 sibling, 1 reply; 27+ messages in thread
From: Steve Magnani @ 2012-07-04 18:03 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> > This patch adds code to support reconstruction of evicted inodes.
> > We walk the on-disk structures where necessary to fill in information
> > not available in the NFS file handle.
> >
> > One important point is that when reconstructing an inode, in order to
> > avoid the *client* declaring ESTALE we have to ensure that the NFS
> > file handle of the reconstruction is identical to that of the
> > original. 

> We would like to add the comment of [0/2] explanation here. Here is
> missing the explanation of the problem.

OK

> And the codes of nfs support became larger, and I think we should give
> the place for it. I.e. we would like to add new file (maybe, export.c,
> nfs.c, or something), move nfs code into it.
> 
> With it, we don't need to add "NFS ..." annotation in the comment, and
> it would make more clear.

Sure.

> And can you add explanation the test of this? What were tested?

I set up a memory-limited virtual machine with a 2 GB FAT partition 
containing a kernel tree (~770 MB, ~40000 files, 9 levels) and did some 
'cp -r' and 'ls -lR' operations on it, some overlapping, some not.

> > +int fat_get_name(struct dentry *parent, char *name,
> > +		 struct dentry *child)
> > +{
> > +	struct super_block *sb = parent->d_inode->i_sb;
> > +
> > +	loff_t child_loc = MSDOS_I(child->d_inode)->i_pos;
> > +	int err;
> > +
> > +	lock_super(sb);
> > +
> > +	if (child_loc) {
> > +		err = fat_name_for_ipos(parent->d_inode, name, NAME_MAX+1,
> > +					child_loc);
> > +	} else {
> > +		child_loc = MSDOS_I(child->d_inode)->i_logstart;
> > +		err = fat_name_for_logstart(parent->d_inode, name, 
NAME_MAX+1,
> > +					    child_loc);
> > +	}
> > +
> > +	unlock_super(sb);
> > +	return err;
> > +}
> 
> Please don't add new lock_super() usage if it is not necessary. Almost
> all of lock_super() just replaced lock_kernel() usage. It rather should
> be removed in future.  Probably, this should use inode->i_mutex
> instead.

I will look into this. My concern was freezing the filesystem while we 
walk the on-disk structures. Also I developed this patch against 2.6.35 
(the Bad Old BKL days) and ported it forward to 3.5.

> BTW, the above issue is same with all of directory read.
> 
> And although this is using i_pos, is there no possibility to be passed
> the detached inode (i.e. open but unlinked inode, i_pos == 0)?

It is possible, that's why I added code to fall back to using logstart.

I may yet rip out the get_name code. The testing I did before posting the 
patch seemed to indicate that it was needed - I saw ESTALE errors without 
get_name support that I did not see with it present. But I've been 
digging into this some more and I think that was just a coincidence; 
probably I just generated more extreme memory pressure when testing 
without get_name. I should know more tomorrow.

Thanks for the quick feedback.
Steve



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-04 18:03     ` Steve Magnani
@ 2012-07-05  3:59       ` OGAWA Hirofumi
  2012-07-05 20:03         ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-05  3:59 UTC (permalink / raw)
  To: Steve Magnani; +Cc: linux-kernel

"Steve Magnani" <steve@digidescorp.com> writes:

>> And can you add explanation the test of this? What were tested?
>
> I set up a memory-limited virtual machine with a 2 GB FAT partition 
> containing a kernel tree (~770 MB, ~40000 files, 9 levels) and did some 
> 'cp -r' and 'ls -lR' operations on it, some overlapping, some not.

Sounds good. It would be useful to add to changelog.

>> Please don't add new lock_super() usage if it is not necessary. Almost
>> all of lock_super() just replaced lock_kernel() usage. It rather should
>> be removed in future.  Probably, this should use inode->i_mutex
>> instead.
>
> I will look into this. My concern was freezing the filesystem while we 
> walk the on-disk structures. Also I developed this patch against 2.6.35 
> (the Bad Old BKL days) and ported it forward to 3.5.

I see.

>> BTW, the above issue is same with all of directory read.
>> 
>> And although this is using i_pos, is there no possibility to be passed
>> the detached inode (i.e. open but unlinked inode, i_pos == 0)?
>
> It is possible, that's why I added code to fall back to using logstart.
>
> I may yet rip out the get_name code. The testing I did before posting the 
> patch seemed to indicate that it was needed - I saw ESTALE errors without 
> get_name support that I did not see with it present. But I've been 
> digging into this some more and I think that was just a coincidence; 
> probably I just generated more extreme memory pressure when testing 
> without get_name. I should know more tomorrow.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-05  3:59       ` OGAWA Hirofumi
@ 2012-07-05 20:03         ` Steven J. Magnani
  0 siblings, 0 replies; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-05 20:03 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Thu, 2012-07-05 at 12:59 +0900, OGAWA Hirofumi wrote: 
> "Steve Magnani" <steve@digidescorp.com> writes:
> > I may yet rip out the get_name code. The testing I did before posting the 
> > patch seemed to indicate that it was needed - I saw ESTALE errors without 
> > get_name support that I did not see with it present. But I've been 
> > digging into this some more and I think that was just a coincidence; 
> > probably I just generated more extreme memory pressure when testing 
> > without get_name. I should know more tomorrow.

The get_name code can go away.

It turns out that it is much harder to generate severe memory pressure
in my virtual 3.5 kernel than in my embedded 2.6.35 kernel.
fat_reconstitute_inode() was not being exercised in 3.5 as much as I
thought.

And, there was a change made in 3.5
(b0b0382bb4904965a9e9fca77ad87514dfda0d1c) that causes the parent
i_logstart field not to be populated when building NFS file handles for
shares marked 'no_subtree_check'. That causes fat_reconstitute_inode()
to fail, because it interprets the parent of such objects as 'root'
rather than as 'unknown'.

I'm reworking the patch to cope with the 'unknown' case, and to address
your other comments. Still need to look into lock_super() vs.
inode->i_mutex.

Steve
------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-04 11:07   ` OGAWA Hirofumi
  2012-07-04 18:03     ` Steve Magnani
@ 2012-07-06 20:33     ` Steven J. Magnani
  2012-07-06 21:07       ` OGAWA Hirofumi
  1 sibling, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-06 20:33 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Wed, 2012-07-04 at 20:07 +0900, OGAWA Hirofumi wrote: 
> Please don't add new lock_super() usage if it is not necessary. Almost
> all of lock_super() just replaced lock_kernel() usage. It rather should
> be removed in future.  Probably, this should use inode->i_mutex instead.
> 
> BTW, the above issue is same with all of directory read.

I don't think there's really an alternative here. The cases addressed by
this patch all involve walking on-disk structures via
unofficial/temporary inodes created from information in the NFS handle
(i.e., outside the normal inode creation paths). When this process is
successful we end up with "official" connected inodes/dentries, but
getting there is really a "bottom up" strategy instead of the usual "top
down" approach.

Because the "bottom up" method is lacking guarantees that "top down"
takes for granted - i.e., that a cluster on disk that's supposed to be a
directory actually *is* a directory -  I am adding some defensive code
in the next spin of the patch.

Steve



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-06 20:33     ` Steven J. Magnani
@ 2012-07-06 21:07       ` OGAWA Hirofumi
  2012-07-07  1:16         ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-06 21:07 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

> On Wed, 2012-07-04 at 20:07 +0900, OGAWA Hirofumi wrote: 
>> Please don't add new lock_super() usage if it is not necessary. Almost
>> all of lock_super() just replaced lock_kernel() usage. It rather should
>> be removed in future.  Probably, this should use inode->i_mutex instead.
>> 
>> BTW, the above issue is same with all of directory read.
>
> I don't think there's really an alternative here. The cases addressed by
> this patch all involve walking on-disk structures via
> unofficial/temporary inodes created from information in the NFS handle
> (i.e., outside the normal inode creation paths). When this process is
> successful we end up with "official" connected inodes/dentries, but
> getting there is really a "bottom up" strategy instead of the usual "top
> down" approach.
>
> Because the "bottom up" method is lacking guarantees that "top down"
> takes for granted - i.e., that a cluster on disk that's supposed to be a
> directory actually *is* a directory -  I am adding some defensive code
> in the next spin of the patch.

I'm not sure what you meant. Where is the problem? ->get_name()? If so,
it has parent dentry parameter. What is the wrong if we take
mutex_lock(parent->d_inode)?

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-06 21:07       ` OGAWA Hirofumi
@ 2012-07-07  1:16         ` Steven J. Magnani
  2012-07-07  6:03           ` OGAWA Hirofumi
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-07  1:16 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Sat, 2012-07-07 at 06:07 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> > On Wed, 2012-07-04 at 20:07 +0900, OGAWA Hirofumi wrote: 
> >> Please don't add new lock_super() usage if it is not necessary. Almost
> >> all of lock_super() just replaced lock_kernel() usage. It rather should
> >> be removed in future.  Probably, this should use inode->i_mutex instead.
> >> 
> >> BTW, the above issue is same with all of directory read.
> >
> > I don't think there's really an alternative here. The cases addressed by
> > this patch all involve walking on-disk structures via
> > unofficial/temporary inodes created from information in the NFS handle
> > (i.e., outside the normal inode creation paths). When this process is
> > successful we end up with "official" connected inodes/dentries, but
> > getting there is really a "bottom up" strategy instead of the usual "top
> > down" approach.
> >
> > Because the "bottom up" method is lacking guarantees that "top down"
> > takes for granted - i.e., that a cluster on disk that's supposed to be a
> > directory actually *is* a directory -  I am adding some defensive code
> > in the next spin of the patch.
> 
> I'm not sure what you meant. Where is the problem? ->get_name()? If so,
> it has parent dentry parameter. What is the wrong if we take
> mutex_lock(parent->d_inode)?

The temporary/"unofficial" inodes are unhashed and thus not visible
outside of the functions using them. They exist only to support access
to directory contents when we can't gain that access via other means
(because we only have "bottom up" information, about an object and
perhaps its parent, in a form that can't be used to look up hashed
inodes/dentries). Hashing them wouldn't help, because they don't have
the correct key (for instance, in the case of a ".." entry).

Am I missing something?

Steve
------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-07  1:16         ` Steven J. Magnani
@ 2012-07-07  6:03           ` OGAWA Hirofumi
  2012-07-07 16:41             ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-07  6:03 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

> On Sat, 2012-07-07 at 06:07 +0900, OGAWA Hirofumi wrote: 
>> "Steven J. Magnani" <steve@digidescorp.com> writes:
>> 
>> > On Wed, 2012-07-04 at 20:07 +0900, OGAWA Hirofumi wrote: 
>> >> Please don't add new lock_super() usage if it is not necessary. Almost
>> >> all of lock_super() just replaced lock_kernel() usage. It rather should
>> >> be removed in future.  Probably, this should use inode->i_mutex instead.
>> >> 
>> >> BTW, the above issue is same with all of directory read.
>> >
>> > I don't think there's really an alternative here. The cases addressed by
>> > this patch all involve walking on-disk structures via
>> > unofficial/temporary inodes created from information in the NFS handle
>> > (i.e., outside the normal inode creation paths). When this process is
>> > successful we end up with "official" connected inodes/dentries, but
>> > getting there is really a "bottom up" strategy instead of the usual "top
>> > down" approach.
>> >
>> > Because the "bottom up" method is lacking guarantees that "top down"
>> > takes for granted - i.e., that a cluster on disk that's supposed to be a
>> > directory actually *is* a directory -  I am adding some defensive code
>> > in the next spin of the patch.
>> 
>> I'm not sure what you meant. Where is the problem? ->get_name()? If so,
>> it has parent dentry parameter. What is the wrong if we take
>> mutex_lock(parent->d_inode)?
>
> The temporary/"unofficial" inodes are unhashed and thus not visible
> outside of the functions using them. They exist only to support access
> to directory contents when we can't gain that access via other means
> (because we only have "bottom up" information, about an object and
> perhaps its parent, in a form that can't be used to look up hashed
> inodes/dentries). Hashing them wouldn't help, because they don't have
> the correct key (for instance, in the case of a ".." entry).
>
> Am I missing something?

You mean the unhashed inode is created by ->get_parent()? If so, the
root cause sounds like ->get_parent() itself. If not, I'm not
understanding the meaning of the temporary/unofficial inode here.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-07  6:03           ` OGAWA Hirofumi
@ 2012-07-07 16:41             ` Steven J. Magnani
  2012-07-07 17:00               ` OGAWA Hirofumi
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-07 16:41 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Sat, 2012-07-07 at 15:03 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> > On Sat, 2012-07-07 at 06:07 +0900, OGAWA Hirofumi wrote: 
> >> "Steven J. Magnani" <steve@digidescorp.com> writes:
> >> 
> >> > On Wed, 2012-07-04 at 20:07 +0900, OGAWA Hirofumi wrote: 
> >> >> Please don't add new lock_super() usage if it is not necessary. Almost
> >> >> all of lock_super() just replaced lock_kernel() usage. It rather should
> >> >> be removed in future.  Probably, this should use inode->i_mutex instead.
> >> >> 
> >> >> BTW, the above issue is same with all of directory read.
> >> >
> >> > I don't think there's really an alternative here. The cases addressed by
> >> > this patch all involve walking on-disk structures via
> >> > unofficial/temporary inodes created from information in the NFS handle
> >> > (i.e., outside the normal inode creation paths). When this process is
> >> > successful we end up with "official" connected inodes/dentries, but
> >> > getting there is really a "bottom up" strategy instead of the usual "top
> >> > down" approach.
> >> >
> >> > Because the "bottom up" method is lacking guarantees that "top down"
> >> > takes for granted - i.e., that a cluster on disk that's supposed to be a
> >> > directory actually *is* a directory -  I am adding some defensive code
> >> > in the next spin of the patch.
> >> 
> >> I'm not sure what you meant. Where is the problem? ->get_name()? If so,
> >> it has parent dentry parameter. What is the wrong if we take
> >> mutex_lock(parent->d_inode)?
> >
> > The temporary/"unofficial" inodes are unhashed and thus not visible
> > outside of the functions using them. They exist only to support access
> > to directory contents when we can't gain that access via other means
> > (because we only have "bottom up" information, about an object and
> > perhaps its parent, in a form that can't be used to look up hashed
> > inodes/dentries). Hashing them wouldn't help, because they don't have
> > the correct key (for instance, in the case of a ".." entry).
> >
> > Am I missing something?
> 
> You mean the unhashed inode is created by ->get_parent()? If so, the
> root cause sounds like ->get_parent() itself. If not, I'm not
> understanding the meaning of the temporary/unofficial inode here.

Maybe "private" is a better word than "unofficial". Private inodes are
created anywhere fat_new_inode (nee fat_build_unhashed_inode) is called
directly, instead of through fat_build_inode. So yes, this is on the
get_parent paths (via fat_lookup_dir), and also on the fh_to_dentry path
when inode reconstruction is necessary.

With private inodes, I don't see how anyone but the code that created
them could find them to lock them. The reason they're private is that
they're temporary aliases; at the time they're created, we don't have
enough information to register them in a way that others could find
them. A lookup, etc. operation will look for the inode of the "drivers"
directory, not the ".." of the "usb" directory. We do need these private
inodes in order to walk directory entries. I don't think they're a
problem that needs solving; if we didn't use private inodes, we'd still
need a way to walk directory entries in the context of these NFS
operations, and there would still be potential races between that and
other operations on the filesystem.

------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-07 16:41             ` Steven J. Magnani
@ 2012-07-07 17:00               ` OGAWA Hirofumi
  2012-07-09 12:03                 ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-07 17:00 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

>> You mean the unhashed inode is created by ->get_parent()? If so, the
>> root cause sounds like ->get_parent() itself. If not, I'm not
>> understanding the meaning of the temporary/unofficial inode here.
>
> Maybe "private" is a better word than "unofficial". Private inodes are
> created anywhere fat_new_inode (nee fat_build_unhashed_inode) is called
> directly, instead of through fat_build_inode. So yes, this is on the
> get_parent paths (via fat_lookup_dir), and also on the fh_to_dentry path
> when inode reconstruction is necessary.
>
> With private inodes, I don't see how anyone but the code that created
> them could find them to lock them. The reason they're private is that
> they're temporary aliases; at the time they're created, we don't have
> enough information to register them in a way that others could find
> them. A lookup, etc. operation will look for the inode of the "drivers"
> directory, not the ".." of the "usb" directory. We do need these private
> inodes in order to walk directory entries. I don't think they're a
> problem that needs solving; if we didn't use private inodes, we'd still
> need a way to walk directory entries in the context of these NFS
> operations, and there would still be potential races between that and
> other operations on the filesystem.

How do you prevent to modify or free the those inode/blocks from other
path?  Yeah, it is racy. And if races is not solved, that's simply wrong
and not solution.

Although I'm not thinking deeply about NFS support on FAT. Just a idea,
the one of possible solutions would be register it to hash, and find it
on all path. So, all path will use same inode and lock.

We need the key, possible key is - if it is only directory, FAT may be
able to use i_start as additional search key.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-07 17:00               ` OGAWA Hirofumi
@ 2012-07-09 12:03                 ` Steven J. Magnani
  2012-07-09 13:43                   ` OGAWA Hirofumi
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 12:03 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Sun, 2012-07-08 at 02:00 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> >> You mean the unhashed inode is created by ->get_parent()? If so, the
> >> root cause sounds like ->get_parent() itself. If not, I'm not
> >> understanding the meaning of the temporary/unofficial inode here.
> >
> > Maybe "private" is a better word than "unofficial". Private inodes are
> > created anywhere fat_new_inode (nee fat_build_unhashed_inode) is called
> > directly, instead of through fat_build_inode. So yes, this is on the
> > get_parent paths (via fat_lookup_dir), and also on the fh_to_dentry path
> > when inode reconstruction is necessary.
> >
> > With private inodes, I don't see how anyone but the code that created
> > them could find them to lock them. The reason they're private is that
> > they're temporary aliases; at the time they're created, we don't have
> > enough information to register them in a way that others could find
> > them. A lookup, etc. operation will look for the inode of the "drivers"
> > directory, not the ".." of the "usb" directory. We do need these private
> > inodes in order to walk directory entries. I don't think they're a
> > problem that needs solving; if we didn't use private inodes, we'd still
> > need a way to walk directory entries in the context of these NFS
> > operations, and there would still be potential races between that and
> > other operations on the filesystem.
> 
> How do you prevent to modify or free the those inode/blocks from other
> path?  Yeah, it is racy. And if races is not solved, that's simply wrong
> and not solution.
> 
> Although I'm not thinking deeply about NFS support on FAT. Just a idea,
> the one of possible solutions would be register it to hash, and find it
> on all path. So, all path will use same inode and lock.
> 
> We need the key, possible key is - if it is only directory, FAT may be
> able to use i_start as additional search key.

Interesting idea. I think this, and reformulating the FAT NFS file
handle to include the parent's i_ino, will greatly simplify (and speed
up) the code.

I am having a hard time seeing how inclusion of i_pos in the file handle
is useful. The only scenarios I can conceive where an i_ino lookup fails
and an i_pos lookup succeeds are the object referenced by the file
handle has:

    (A) been replaced by something else,
or  (2) had its inode evicted, but later re-instantiated

Perhaps the design was intended to support case (2), but since the file
handle of the re-instantiated inode will differ from the original (since
i_ino and i_generation will have changed), NFS clients will cry ESTALE
anyway.

Steve



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 12:03                 ` Steven J. Magnani
@ 2012-07-09 13:43                   ` OGAWA Hirofumi
  2012-07-09 14:47                     ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-09 13:43 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

>> How do you prevent to modify or free the those inode/blocks from other
>> path?  Yeah, it is racy. And if races is not solved, that's simply wrong
>> and not solution.
>> 
>> Although I'm not thinking deeply about NFS support on FAT. Just a idea,
>> the one of possible solutions would be register it to hash, and find it
>> on all path. So, all path will use same inode and lock.
>> 
>> We need the key, possible key is - if it is only directory, FAT may be
>> able to use i_start as additional search key.
>
> Interesting idea. I think this, and reformulating the FAT NFS file
> handle to include the parent's i_ino, will greatly simplify (and speed
> up) the code.

Does it work even if the inode was rename()'ed?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 13:43                   ` OGAWA Hirofumi
@ 2012-07-09 14:47                     ` Steven J. Magnani
  2012-07-09 16:10                       ` OGAWA Hirofumi
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 14:47 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Mon, 2012-07-09 at 22:43 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> >> We need the key, possible key is - if it is only directory, FAT may be
> >> able to use i_start as additional search key.
> >
> > Interesting idea. I think this, and reformulating the FAT NFS file
> > handle to include the parent's i_ino, will greatly simplify (and speed
> > up) the code.
> 
> Does it work even if the inode was rename()'ed?

AFAICT. I don't see why it wouldn't; on a rename, the inode's i_pos
changes but its i_ino stays the same, right?

Do you have any objection to making the use of a directory logstart
cache a mount option that defaults to off? It seems a shame to penalize
everyone - particularly embedded systems - with the overhead of such a
cache when FAT-backed-NFS seems to be such a small percentage of use
cases.

------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 14:47                     ` Steven J. Magnani
@ 2012-07-09 16:10                       ` OGAWA Hirofumi
  2012-07-09 16:27                         ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-09 16:10 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

> On Mon, 2012-07-09 at 22:43 +0900, OGAWA Hirofumi wrote: 
>> "Steven J. Magnani" <steve@digidescorp.com> writes:
>> 
>> >> We need the key, possible key is - if it is only directory, FAT may be
>> >> able to use i_start as additional search key.
>> >
>> > Interesting idea. I think this, and reformulating the FAT NFS file
>> > handle to include the parent's i_ino, will greatly simplify (and speed
>> > up) the code.
>> 
>> Does it work even if the inode was rename()'ed?
>
> AFAICT. I don't see why it wouldn't; on a rename, the inode's i_pos
> changes but its i_ino stays the same, right?

If the inode is not on cache anymore, is there the possibility that
selects the wrong parent? IIRC, NFS Server can be rebooted at any time
while the client using the same file handle.

> Do you have any objection to making the use of a directory logstart
> cache a mount option that defaults to off? It seems a shame to penalize
> everyone - particularly embedded systems - with the overhead of such a
> cache when FAT-backed-NFS seems to be such a small percentage of use
> cases.

I'm not sure what did it mean. It means to remove i_logstart from NFS
file handle?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 16:10                       ` OGAWA Hirofumi
@ 2012-07-09 16:27                         ` Steven J. Magnani
  2012-07-09 17:09                           ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 16:27 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Tue, 2012-07-10 at 01:10 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> > On Mon, 2012-07-09 at 22:43 +0900, OGAWA Hirofumi wrote: 
> >> "Steven J. Magnani" <steve@digidescorp.com> writes:
> >> 
> >> >> We need the key, possible key is - if it is only directory, FAT may be
> >> >> able to use i_start as additional search key.
> >> >
> >> > Interesting idea. I think this, and reformulating the FAT NFS file
> >> > handle to include the parent's i_ino, will greatly simplify (and speed
> >> > up) the code.
> >> 
> >> Does it work even if the inode was rename()'ed?
> >
> > AFAICT. I don't see why it wouldn't; on a rename, the inode's i_pos
> > changes but its i_ino stays the same, right?
> 
> If the inode is not on cache anymore, is there the possibility that
> selects the wrong parent? IIRC, NFS Server can be rebooted at any time
> while the client using the same file handle.

True, but it's looking like we can just use the default handle
constructed by export_encode_fh(), namely (i_ino, i_generation,
parent->i_ino, parent->i_generation). None of those components should
change in a server reboot.

Also, my thinking now is that there's no reliable way to reconstruct
evicted inodes. I was going to drop that portion of the patch, and stick
to fixing reconnection of cached inodes to dentries. Clients who are
sensitive to ESTALE should mount with subtree_check (so that parent
information is included in the NFS file handles, increasing our ability
to reconnect), and either handle ESTALE at the application level, or via
patches such as Jeff Layton's series
(https://lkml.org/lkml/2012/6/29/381 - I will be testing this shortly).

> > Do you have any objection to making the use of a directory logstart
> > cache a mount option that defaults to off? It seems a shame to penalize
> > everyone - particularly embedded systems - with the overhead of such a
> > cache when FAT-backed-NFS seems to be such a small percentage of use
> > cases.
> 
> I'm not sure what did it mean. It means to remove i_logstart from NFS
> file handle?

I am proposing to remove i_logstart from the file handle but what I was
asking here is whether the population of the new logstart index you've
proposed could be optional.

Maybe it's time to post a new spin of the patch so we are all talking
about the same concrete thing.
------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 16:27                         ` Steven J. Magnani
@ 2012-07-09 17:09                           ` Steven J. Magnani
  2012-07-09 17:23                             ` Steven J. Magnani
  2012-07-09 19:10                             ` OGAWA Hirofumi
  0 siblings, 2 replies; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 17:09 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Mon, 2012-07-09 at 11:27 -0500, Steven J. Magnani wrote: 
> On Tue, 2012-07-10 at 01:10 +0900, OGAWA Hirofumi wrote: 
> > "Steven J. Magnani" <steve@digidescorp.com> writes:
> > 
> > > On Mon, 2012-07-09 at 22:43 +0900, OGAWA Hirofumi wrote: 
> > >> "Steven J. Magnani" <steve@digidescorp.com> writes:
> > >> 
> > >> >> We need the key, possible key is - if it is only directory, FAT may be
> > >> >> able to use i_start as additional search key.
> > >> >
> > >> > Interesting idea. I think this, and reformulating the FAT NFS file
> > >> > handle to include the parent's i_ino, will greatly simplify (and speed
> > >> > up) the code.
> > >> 
> > >> Does it work even if the inode was rename()'ed?
> > >
> > > AFAICT. I don't see why it wouldn't; on a rename, the inode's i_pos
> > > changes but its i_ino stays the same, right?
> > 
> > If the inode is not on cache anymore, is there the possibility that
> > selects the wrong parent? IIRC, NFS Server can be rebooted at any time
> > while the client using the same file handle.
> 
> True, but it's looking like we can just use the default handle
> constructed by export_encode_fh(), namely (i_ino, i_generation,
> parent->i_ino, parent->i_generation). None of those components should
> change in a server reboot.

I think I misunderstood you when I wrote this. I assumed we were talking
about a restart of nfsd, not the entire machine it was running on. If
there is a danger of mismapping on a reboot isn't that present in the
existing mainline code, i.e. fat_fh_to_dentry()? Ideally, the (i_ino,
i_generation) signature would be different on a reboot, although with
only 2-second granularity in i_generation I suppose that's less likely
than we would prefer. Also I would think that many inodes simply
wouldn't exist in the cache, in which case we would fail the operation
with ESTALE.

------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 17:09                           ` Steven J. Magnani
@ 2012-07-09 17:23                             ` Steven J. Magnani
  2012-07-09 19:10                             ` OGAWA Hirofumi
  1 sibling, 0 replies; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 17:23 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Mon, 2012-07-09 at 12:09 -0500, Steven J. Magnani wrote: 
> On Mon, 2012-07-09 at 11:27 -0500, Steven J. Magnani wrote: 
> > On Tue, 2012-07-10 at 01:10 +0900, OGAWA Hirofumi wrote: 
> > > If the inode is not on cache anymore, is there the possibility that
> > > selects the wrong parent? IIRC, NFS Server can be rebooted at any time
> > > while the client using the same file handle.
> > 
> > True, but it's looking like we can just use the default handle
> > constructed by export_encode_fh(), namely (i_ino, i_generation,
> > parent->i_ino, parent->i_generation). None of those components should
> > change in a server reboot.
> 
> I think I misunderstood you when I wrote this. I assumed we were talking
> about a restart of nfsd, not the entire machine it was running on. If
> there is a danger of mismapping on a reboot isn't that present in the
> existing mainline code, i.e. fat_fh_to_dentry()? Ideally, the (i_ino,
> i_generation) signature would be different on a reboot, although with
> only 2-second granularity in i_generation I suppose that's less likely
> than we would prefer. Also I would think that many inodes simply
> wouldn't exist in the cache, in which case we would fail the operation
> with ESTALE.

On further investigation I don't think there is an issue here.
i_generation is set to the value returned by get_seconds(), which is
wall-clock time, not relative time since boot.

------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 17:09                           ` Steven J. Magnani
  2012-07-09 17:23                             ` Steven J. Magnani
@ 2012-07-09 19:10                             ` OGAWA Hirofumi
  2012-07-09 20:26                               ` Steven J. Magnani
  1 sibling, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-09 19:10 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

>> > >> > Interesting idea. I think this, and reformulating the FAT NFS file
>> > >> > handle to include the parent's i_ino, will greatly simplify (and speed
>> > >> > up) the code.
>> > >> 
>> > >> Does it work even if the inode was rename()'ed?
>> > >
>> > > AFAICT. I don't see why it wouldn't; on a rename, the inode's i_pos
>> > > changes but its i_ino stays the same, right?
>> > 
>> > If the inode is not on cache anymore, is there the possibility that
>> > selects the wrong parent? IIRC, NFS Server can be rebooted at any time
>> > while the client using the same file handle.
>> 
>> True, but it's looking like we can just use the default handle
>> constructed by export_encode_fh(), namely (i_ino, i_generation,
>> parent->i_ino, parent->i_generation). None of those components should
>> change in a server reboot.
>
> I think I misunderstood you when I wrote this. I assumed we were talking
> about a restart of nfsd, not the entire machine it was running on. If
> there is a danger of mismapping on a reboot isn't that present in the
> existing mainline code, i.e. fat_fh_to_dentry()? Ideally, the (i_ino,
> i_generation) signature would be different on a reboot, although with
> only 2-second granularity in i_generation I suppose that's less likely
> than we would prefer. Also I would think that many inodes simply
> wouldn't exist in the cache, in which case we would fail the operation
> with ESTALE.

Ah, i_ino. I was talking about i_pos. Well, so, what happens if the
child was renamed to other parent on NFS server machine (not via nfs
client)? The file handle would be including the old i_ino, and the old
i_ino on file handle is still vaild as old parent. So, it returns the
wrong parent?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 19:10                             ` OGAWA Hirofumi
@ 2012-07-09 20:26                               ` Steven J. Magnani
  2012-07-09 21:34                                 ` OGAWA Hirofumi
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 20:26 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Tue, 2012-07-10 at 04:10 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> >> > >> > Interesting idea. I think this, and reformulating the FAT NFS file
> >> > >> > handle to include the parent's i_ino, will greatly simplify (and speed
> >> > >> > up) the code.
> >> > >> 
> >> > >> Does it work even if the inode was rename()'ed?
> >> > >
> >> > > AFAICT. I don't see why it wouldn't; on a rename, the inode's i_pos
> >> > > changes but its i_ino stays the same, right?
> >> > 
> >> > If the inode is not on cache anymore, is there the possibility that
> >> > selects the wrong parent? IIRC, NFS Server can be rebooted at any time
> >> > while the client using the same file handle.
> >> 
> >> True, but it's looking like we can just use the default handle
> >> constructed by export_encode_fh(), namely (i_ino, i_generation,
> >> parent->i_ino, parent->i_generation). None of those components should
> >> change in a server reboot.
> >
> > I think I misunderstood you when I wrote this. I assumed we were talking
> > about a restart of nfsd, not the entire machine it was running on. If
> > there is a danger of mismapping on a reboot isn't that present in the
> > existing mainline code, i.e. fat_fh_to_dentry()? Ideally, the (i_ino,
> > i_generation) signature would be different on a reboot, although with
> > only 2-second granularity in i_generation I suppose that's less likely
> > than we would prefer. Also I would think that many inodes simply
> > wouldn't exist in the cache, in which case we would fail the operation
> > with ESTALE.
> 
> Ah, i_ino. I was talking about i_pos. Well, so, what happens if the
> child was renamed to other parent on NFS server machine (not via nfs
> client)? The file handle would be including the old i_ino, and the old
> i_ino on file handle is still vaild as old parent. So, it returns the
> wrong parent?

Yes, but I believe exportfs_decode_fh() handles that case:

  /*
   * Now that we've got both a well-connected parent and a
   * dentry for the inode we're after, make sure that our
   * inode is actually connected to the parent.
   */


Really, the FAT NFS code will pretty much parallel that of ext2.
------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 20:26                               ` Steven J. Magnani
@ 2012-07-09 21:34                                 ` OGAWA Hirofumi
  2012-07-09 22:03                                   ` Steven J. Magnani
  0 siblings, 1 reply; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-09 21:34 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

>> Ah, i_ino. I was talking about i_pos. Well, so, what happens if the
>> child was renamed to other parent on NFS server machine (not via nfs
>> client)? The file handle would be including the old i_ino, and the old
>> i_ino on file handle is still vaild as old parent. So, it returns the
>> wrong parent?
>
> Yes, but I believe exportfs_decode_fh() handles that case:
>
>   /*
>    * Now that we've got both a well-connected parent and a
>    * dentry for the inode we're after, make sure that our
>    * inode is actually connected to the parent.
>    */
>
>
> Really, the FAT NFS code will pretty much parallel that of ext2.

Hm, not really, if the file handle is including parent ino. ext2 will
get the latest parent ino, because it checks parent of inode of file
handle.

But if the file handle is including parent ino and we believe it is
parent, I think NFS server can be return the old parent. The difference
is the result of ->get_parent().
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 21:34                                 ` OGAWA Hirofumi
@ 2012-07-09 22:03                                   ` Steven J. Magnani
  2012-07-09 22:17                                     ` OGAWA Hirofumi
  0 siblings, 1 reply; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-09 22:03 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

On Tue, 2012-07-10 at 06:34 +0900, OGAWA Hirofumi wrote: 
> "Steven J. Magnani" <steve@digidescorp.com> writes:
> 
> >> Ah, i_ino. I was talking about i_pos. Well, so, what happens if the
> >> child was renamed to other parent on NFS server machine (not via nfs
> >> client)? The file handle would be including the old i_ino, and the old
> >> i_ino on file handle is still vaild as old parent. So, it returns the
> >> wrong parent?
> >
> > Yes, but I believe exportfs_decode_fh() handles that case:
> >
> >   /*
> >    * Now that we've got both a well-connected parent and a
> >    * dentry for the inode we're after, make sure that our
> >    * inode is actually connected to the parent.
> >    */
> >
> >
> > Really, the FAT NFS code will pretty much parallel that of ext2.
> 
> Hm, not really, if the file handle is including parent ino. ext2 will
> get the latest parent ino, because it checks parent of inode of file
> handle.

Can you point me to the code for this? The code I see looks pretty
congruent to what I think the FAT code would be.

> But if the file handle is including parent ino and we believe it is
> parent, I think NFS server can be return the old parent. The difference
> is the result of ->get_parent().

I'm a little confused about which function we're discussing here.
fat_get_parent() isn't called with a file handle. fat_fh_to_parent() is,
but it is only called by exportfs_decode_fh() and I am reasonably sure
that that function is handling the case you're concerned about.
------------------------------------------------------------------------
Steven J. Magnani               "I claim this network for MARS!
www.digidescorp.com              Earthling, return my space modulator!"

#include <standard.disclaimer>




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries
  2012-07-09 22:03                                   ` Steven J. Magnani
@ 2012-07-09 22:17                                     ` OGAWA Hirofumi
  0 siblings, 0 replies; 27+ messages in thread
From: OGAWA Hirofumi @ 2012-07-09 22:17 UTC (permalink / raw)
  To: Steven J. Magnani; +Cc: linux-kernel

"Steven J. Magnani" <steve@digidescorp.com> writes:

>> Hm, not really, if the file handle is including parent ino. ext2 will
>> get the latest parent ino, because it checks parent of inode of file
>> handle.
>
> Can you point me to the code for this? The code I see looks pretty
> congruent to what I think the FAT code would be.
>
>> But if the file handle is including parent ino and we believe it is
>> parent, I think NFS server can be return the old parent. The difference
>> is the result of ->get_parent().
>
> I'm a little confused about which function we're discussing here.
> fat_get_parent() isn't called with a file handle. fat_fh_to_parent() is,
> but it is only called by exportfs_decode_fh() and I am reasonably sure
> that that function is handling the case you're concerned about.

Oh, you are right. I should discuss about it based on the new patch, and
only if there is new usage of file handle (e.g. parent->i_no).
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/2] fat (exportfs): fix NFS file handle decode
@ 2012-07-03 19:06 Steven J. Magnani
  0 siblings, 0 replies; 27+ messages in thread
From: Steven J. Magnani @ 2012-07-03 19:06 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: linux-kernel

When the system evicts inodes and dentries under memory pressure, the FAT
driver is unable to map NFS file handles back to the objects they reference.
In many cases this causes client operations to fail with ENOENT. This is
partially due to ineffectiveness of the current FAT NFS implementation,
and partially due to export_operations that have not yet been implemented for
FAT. For example, the lack of a fh_to_parent method can cause file accesses to
fail on shares exported with subtree_check.

This series depends on the following patches:
* fat: Fix non-atomic NFS i_pos read
* fat: Accessors for msdos_dir_entry 'start' fields
* fat: Refactor shortname parsing



^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2012-07-09 22:17 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-03 19:09 [PATCH 0/2] fat (exportfs): fix NFS file handle decode Steven J. Magnani
2012-07-03 19:09 ` [PATCH 1/2] fat (exportfs): drop ineffective get_parent code Steven J. Magnani
2012-07-04 10:30   ` OGAWA Hirofumi
2012-07-03 19:09 ` [PATCH 2/2] fat (exportfs): reconnect file handles to evicted inodes/dentries Steven J. Magnani
2012-07-04 11:07   ` OGAWA Hirofumi
2012-07-04 18:03     ` Steve Magnani
2012-07-05  3:59       ` OGAWA Hirofumi
2012-07-05 20:03         ` Steven J. Magnani
2012-07-06 20:33     ` Steven J. Magnani
2012-07-06 21:07       ` OGAWA Hirofumi
2012-07-07  1:16         ` Steven J. Magnani
2012-07-07  6:03           ` OGAWA Hirofumi
2012-07-07 16:41             ` Steven J. Magnani
2012-07-07 17:00               ` OGAWA Hirofumi
2012-07-09 12:03                 ` Steven J. Magnani
2012-07-09 13:43                   ` OGAWA Hirofumi
2012-07-09 14:47                     ` Steven J. Magnani
2012-07-09 16:10                       ` OGAWA Hirofumi
2012-07-09 16:27                         ` Steven J. Magnani
2012-07-09 17:09                           ` Steven J. Magnani
2012-07-09 17:23                             ` Steven J. Magnani
2012-07-09 19:10                             ` OGAWA Hirofumi
2012-07-09 20:26                               ` Steven J. Magnani
2012-07-09 21:34                                 ` OGAWA Hirofumi
2012-07-09 22:03                                   ` Steven J. Magnani
2012-07-09 22:17                                     ` OGAWA Hirofumi
  -- strict thread matches above, loose matches on Subject: below --
2012-07-03 19:06 [PATCH 0/2] fat (exportfs): fix NFS file handle decode Steven J. Magnani

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.