linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] fat: editions to support fat_fallocate()
@ 2012-10-13 14:31 Namjae Jeon
  2012-10-14 16:20 ` OGAWA Hirofumi
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Namjae Jeon @ 2012-10-13 14:31 UTC (permalink / raw)
  To: hirofumi, akpm; +Cc: linux-kernel, Namjae Jeon, Ravishankar N, Amit Sahrawat

Implement preallocation via the fallocate syscall on VFAT partitions.
This patch is based on an earlier patch of the same name which had some
issues detailed below and did not get accepted. Refer
https://lkml.org/lkml/2007/12/22/130.

a)The preallocated space was not persistent across remounts when the
FALLOC_FL_KEEP_SIZE flag was set. Also, writes to the file allocated new
clusters instead of using the preallocated area.

Consider the scenario:
mount-->preallocate space for a file --> unmount.
In the old patch,the preallocated space was not reflected for that
file (verified using the 'du' command).

This is now fixed with modifications to fat_fill_inode().

b)There was no need to zero out the clusters when the flag was set.

Instead of doing an expanding truncate, just allocate clusters and add
them to the fat chain. This reduces preallocation time.If the file is
seeked beyond the current file size(i_size) at the time of writing, zero
out the bytes from i_size to the seek point at write time.

Compatibility with windows:
There are no issues when FALLOC_FL_KEEP_SIZE is not set
because it just does an expanding truncate. Thus reading from the
preallocated area on windows returns null until data is written to it.

When a file with preallocated area using the FALLOC_FL_KEEP_SIZE was
written to on windows, the windows driver freed-up the preallocated
clusters and allocated new clusters for the new data. The freed up
clusters gets reflected in the free space available for the partition
which can be seen from the Volume properties.

The windows chkdsk tool also does not report any errors on a
disk containing files with preallocated space.

Signed-off-by: Namjae Jeon <linkinjeon@gmail.com>
Signed-off-by: Ravishankar N <cyberax82@gmail.com>
Signed-off-by: Amit Sahrawat <amit.sahrawat83@gmail.com>
---
 fs/fat/file.c  |   82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fat/inode.c |   59 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)

diff --git a/fs/fat/file.c b/fs/fat/file.c
index 1f81cb4..cae2eec 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,8 +17,11 @@
 #include <linux/blkdev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include <linux/falloc.h>
 #include "fat.h"
 
+static long fat_fallocate(struct file *file, int mode,
+				loff_t offset, loff_t len);
 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 {
 	u32 attr;
@@ -174,6 +177,7 @@ const struct file_operations fat_file_operations = {
 #endif
 	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
+	.fallocate      = fat_fallocate,
 };
 
 static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -211,7 +215,85 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
 out:
 	return err;
 }
+/*
+ * preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset.If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out.Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+				loff_t offset, loff_t len)
+{
+	int err = 0;
+	struct inode *inode = file->f_mapping->host;
+	int cluster, nr_cluster, fclus, dclus, free_bytes, nr_bytes;
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
+	/* No support for hole punch or other fallocate flags. */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	if ((offset + len) <= MSDOS_I(inode)->mmu_private) {
+		fat_msg(sb, KERN_ERR,
+			"fat_fallocate():Blocks already allocated");
+		return -EINVAL;
+	}
+
+	if ((mode & FALLOC_FL_KEEP_SIZE)) {
+		/* First compute the number of clusters to be allocated */
+		if (inode->i_size > 0) {
+			err = fat_get_cluster(inode, FAT_ENT_EOF,
+					      &fclus, &dclus);
+			if (err < 0) {
+				fat_msg(sb, KERN_ERR,
+					"fat_fallocate():fat_get_cluster() error");
+				return err;
+			}
+			free_bytes = ((fclus+1) << sbi->cluster_bits)-
+				     (inode->i_size);
+			nr_bytes = (offset + len - inode->i_size) - free_bytes;
+		} else
+			nr_bytes = (offset + len - inode->i_size);
+		nr_cluster = (nr_bytes + (sbi->cluster_size - 1)) >>
+			     sbi->cluster_bits;
+		mutex_lock(&inode->i_mutex);
+		/* Start the allocation.We are not zeroing out the clusters */
+		while (nr_cluster-- > 0) {
+			err = fat_alloc_clusters(inode, &cluster, 1);
+			if (err) {
+				fat_msg(sb, KERN_ERR,
+					"fat_fallocate():fat_alloc_clusters() error");
+				goto error;
+			}
+			err = fat_chain_add(inode, cluster, 1);
+			if (err) {
+				fat_free_clusters(inode, cluster);
+				goto error;
+			}
+		}
+		/* update mmu_private to allow writing to allocated clusters */
+		err = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus);
+		if (err < 0) {
+			fat_msg(sb, KERN_ERR,
+				"fat_fallocate():fat_get_cluster() error");
+			goto error;
+		}
+		MSDOS_I(inode)->mmu_private = (fclus + 1) << sbi->cluster_bits;
+	} else {
+		mutex_lock(&inode->i_mutex);
+		/* This is just an expanding truncate */
+		err = fat_cont_expand(inode, (offset + len));
+		if (err) {
+			fat_msg(sb, KERN_ERR,
+				"fat_fallocate():fat_cont_expand() error");
+		}
+	}
+error:
+	mutex_unlock(&inode->i_mutex);
+	return err;
+}
 /* Free all clusters after the skip'th cluster. */
 static int fat_free(struct inode *inode, int skip)
 {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 80c6fdd..4a2d929 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -151,11 +151,58 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
+static int fat_zero_falloc_area(struct file *file,
+				struct address_space *mapping, loff_t pos)
+{
+	struct page *page;
+	struct inode *inode = mapping->host;
+	loff_t curpos = inode->i_size;
+	size_t count = pos-curpos;
+	int err;
+	do {
+		unsigned offset, bytes;
+		void *fsdata;
+
+		offset = (curpos & (PAGE_CACHE_SIZE - 1));
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		err = pagecache_write_begin(NULL, mapping, curpos, bytes,
+					AOP_FLAG_UNINTERRUPTIBLE,
+					&page, &fsdata);
+		if (err)
+			break;
+
+		zero_user(page, offset, bytes);
+
+		err = pagecache_write_end(NULL, mapping, curpos, bytes, bytes,
+					page, fsdata);
+		WARN_ON(err <= 0);
+		curpos += bytes;
+		count -= bytes;
+		err = 0;
+	} while (count);
+
+	return -err;
+}
+
 static int fat_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
 	int err;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	loff_t mmu_private_actual = MSDOS_I(inode)->mmu_private;
+	loff_t mmu_private_ideal = (inode->i_size + (sb->s_blocksize-1)) &
+					 ~(sb->s_blocksize-1);
+
+	if ((mmu_private_actual > mmu_private_ideal) && (pos > inode->i_size)) {
+		err = fat_zero_falloc_area(file, mapping, pos);
+		if (err)
+			fat_msg(sb, KERN_ERR, "error zeroing fallocated area");
+	}
 
 	*pagep = NULL;
 	err = cont_write_begin(file, mapping, pos, len, flags,
@@ -422,6 +469,15 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		inode->i_op = &fat_file_inode_operations;
 		inode->i_fop = &fat_file_operations;
 		inode->i_mapping->a_ops = &fat_aops;
+		/*
+		 * calculate mmu_private and i_blocks from the actual number of
+		 * allocated clusters instead of doing it from file size.This
+		 * ensures that the preallocated disk space with
+		 * FALLOC_FL_KEEP_SIZE is persistent across remounts and writes
+		 * go into the preallocated clusters.Doing this changes i_size
+		 * which we restore below.
+		 */
+		fat_calc_dir_size(inode);
 		MSDOS_I(inode)->mmu_private = inode->i_size;
 	}
 	if (de->attr & ATTR_SYS) {
@@ -432,6 +488,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
+	/* restore i_size */
+	if (!(de->attr & ATTR_DIR))
+		inode->i_size = le32_to_cpu(de->size);
 
 	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
 	if (sbi->options.isvfat) {
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-13 14:31 [PATCH v2] fat: editions to support fat_fallocate() Namjae Jeon
@ 2012-10-14 16:20 ` OGAWA Hirofumi
  2012-10-16  4:12   ` Namjae Jeon
  2013-02-14  2:40 ` Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate()) Andrew Bartlett
  2013-02-14  2:48 ` Andrew Bartlett
  2 siblings, 1 reply; 20+ messages in thread
From: OGAWA Hirofumi @ 2012-10-14 16:20 UTC (permalink / raw)
  To: Namjae Jeon; +Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat

Namjae Jeon <linkinjeon@gmail.com> writes:

> Implement preallocation via the fallocate syscall on VFAT partitions.
> This patch is based on an earlier patch of the same name which had some
> issues detailed below and did not get accepted. Refer
> https://lkml.org/lkml/2007/12/22/130.
>
> a)The preallocated space was not persistent across remounts when the
> FALLOC_FL_KEEP_SIZE flag was set. Also, writes to the file allocated new
> clusters instead of using the preallocated area.
>
> Consider the scenario:
> mount-->preallocate space for a file --> unmount.
> In the old patch,the preallocated space was not reflected for that
> file (verified using the 'du' command).
>
> This is now fixed with modifications to fat_fill_inode().

What is real usage pattern of persistent across remounts on FAT? If once
device was unmounted, we can't know the state of FS anymore, there are
many implementations of FAT. And preallocation is not in the spec.

I worry to break something. And I guess the freeing preallocation on
last close may fix the issue for usage.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-14 16:20 ` OGAWA Hirofumi
@ 2012-10-16  4:12   ` Namjae Jeon
  2012-10-16 10:14     ` OGAWA Hirofumi
  0 siblings, 1 reply; 20+ messages in thread
From: Namjae Jeon @ 2012-10-16  4:12 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat

2012/10/15 OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>:
> Namjae Jeon <linkinjeon@gmail.com> writes:
>
>> Implement preallocation via the fallocate syscall on VFAT partitions.
>> This patch is based on an earlier patch of the same name which had some
>> issues detailed below and did not get accepted. Refer
>> https://lkml.org/lkml/2007/12/22/130.
>>
>> a)The preallocated space was not persistent across remounts when the
>> FALLOC_FL_KEEP_SIZE flag was set. Also, writes to the file allocated new
>> clusters instead of using the preallocated area.
>>
>> Consider the scenario:
>> mount-->preallocate space for a file --> unmount.
>> In the old patch,the preallocated space was not reflected for that
>> file (verified using the 'du' command).
>>
>> This is now fixed with modifications to fat_fill_inode().
>
When we consider other filesystems like XFS and ext4,  the space which
is preallocated is reserved for the life-time of that file which is
persistent across(mount/umount).
So, we tried to make this as similar to the existent solution - as
that would keep the meaning of FALLOCATE - WITH_KEEP_SIZE as same
across all filesystems.

> What is real usage pattern of persistent across remounts on FAT?
Yes,  like a TORRENT FILE -> it reserves space in advance
even though the system can be rebooted/disk unmounted and remount
but the space still remains there - as long as the torrent exists
Or if Torrent case does not matches currently
Then, Consider a case for a TV series to be recorded
Since – we want all the parts to be recorded on the same file (i.e.,
APPEND write) – and in such cases there are chances of TV shutdown,
device unmount-mount again. So, we need to have the space to be remain
available in such cases.

> If once device was unmounted, we can't know the state of FS anymore, there are
> many implementations of FAT. And preallocation is not in the spec.
I agree, As you said before, we can make fat fallocate feature as
configurable – so this is entirely in the hands of USER.
>
> I worry to break something. And I guess the freeing preallocation on
> last close may fix the issue for usage.
Okay, we can avoid most of your concerns except suddenly unplugging usb device.
But fallocate behavior will be different with other filesystem.

How about to make fat fallocate with configuration to be used by users
is having needs?

Let me know your opinion :)

Thanks.
Thanks.> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-16  4:12   ` Namjae Jeon
@ 2012-10-16 10:14     ` OGAWA Hirofumi
  2012-10-17 10:57       ` Namjae Jeon
  0 siblings, 1 reply; 20+ messages in thread
From: OGAWA Hirofumi @ 2012-10-16 10:14 UTC (permalink / raw)
  To: Namjae Jeon; +Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat

Namjae Jeon <linkinjeon@gmail.com> writes:

>> What is real usage pattern of persistent across remounts on FAT?
> Yes,  like a TORRENT FILE -> it reserves space in advance
> even though the system can be rebooted/disk unmounted and remount
> but the space still remains there - as long as the torrent exists
> Or if Torrent case does not matches currently
> Then, Consider a case for a TV series to be recorded
> Since – we want all the parts to be recorded on the same file (i.e.,
> APPEND write) – and in such cases there are chances of TV shutdown,
> device unmount-mount again. So, we need to have the space to be remain
> available in such cases.

The expectation of fallocate() is just for space reservation? If it was
just for space reservation, I'm not sure, why TV applications can't
reserve in userland without any kernel help (I wonder who interrupts TV
application). I feel a bit, it may be more lightweight than fallocate(),
and more reliable than out of spec fallocate().

I'm still not sure why apps really want fallocate() on FAT.

>> If once device was unmounted, we can't know the state of FS anymore, there are
>> many implementations of FAT. And preallocation is not in the spec.
> I agree, As you said before, we can make fat fallocate feature as
> configurable – so this is entirely in the hands of USER.
>>
>> I worry to break something. And I guess the freeing preallocation on
>> last close may fix the issue for usage.
> Okay, we can avoid most of your concerns except suddenly unplugging usb device.
> But fallocate behavior will be different with other filesystem.
>
> How about to make fat fallocate with configuration to be used by users
> is having needs?

Hmm... I'm not still convinced to add makes really apps happy. Maybe,
I'm sill not understanding your usage. I think the out of spec feature
wouldn't be added if it was just a "better than nothing".

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-16 10:14     ` OGAWA Hirofumi
@ 2012-10-17 10:57       ` Namjae Jeon
  2012-10-21 23:54         ` OGAWA Hirofumi
  0 siblings, 1 reply; 20+ messages in thread
From: Namjae Jeon @ 2012-10-17 10:57 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat, linux-fsdevel

2012/10/16, OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>:
> Namjae Jeon <linkinjeon@gmail.com> writes:
>
>>> What is real usage pattern of persistent across remounts on FAT?
>> Yes,  like a TORRENT FILE -> it reserves space in advance
>> even though the system can be rebooted/disk unmounted and remount
>> but the space still remains there - as long as the torrent exists
>> Or if Torrent case does not matches currently
>> Then, Consider a case for a TV series to be recorded
>> Since – we want all the parts to be recorded on the same file (i.e.,
>> APPEND write) – and in such cases there are chances of TV shutdown,
>> device unmount-mount again. So, we need to have the space to be remain
>> available in such cases.
>
> The expectation of fallocate() is just for space reservation? If it was
> just for space reservation, I'm not sure, why TV applications can't
> reserve in userland without any kernel help (I wonder who interrupts TV
> application). I feel a bit, it may be more lightweight than fallocate(),
> and more reliable than out of spec fallocate().
>
> I'm still not sure why apps really want fallocate() on FAT.
Yes, it is for user space reservation.

>From the application perspective it is needed to realize in advance
how much space is needed for that file write – so the requirement is
precisely that the space reserved is entirely for me and no other I/O
operation in that time should consume the space.

Of course, as you said, space can be pre-allocated from user space by
doing expanding truncate.
Main drawbacks for reserving through USER space:
1) If we need to allocate 1GB space -> seek (1GB) and write -> it will
ZEROUT the 1GB area (which is very time consuming) just for reserving
space.
2) The Application must always be aware of the SEEK OFFSET - otherwise
the APPEND WRITE will never occur and file is closed/opened (Append
mode) again for writing – it will instead start writing from the end
of file which is past the reserved space. So, this will also result in
losing space in such case – if application is not keeping track of
OFFSET
3) If suppose from user space we are doing expanding truncate of 1GB
and suppose it fails after 256MB of allocation - in that case it did
allocation of 256MB blocks - did ZEROUT for all these blocks and then
returned failure - which is not optimal for just allocation of space.

While if we make use of FALLOCATE
1)	It allows reserving the space in advance without any delay.
2)	Since, the space is reserved in advance. So, if suppose space is
reserved for 1hour TV recording than any other application in the
background cannot cause recording to fail in case of “no free space”
left as it already pre-allocated space. Only other Applications will
close.
3)	It allows for APPEND write to continue smoothly without actually
keeping track of the file state, offset.
4)	Initially when the disk is not fragmented. It allows the
possibility to get contiguous blocks and thus reducing fragmentation
for that file.

Thanks.
>
>>> If once device was unmounted, we can't know the state of FS anymore,
>>> there are
>>> many implementations of FAT. And preallocation is not in the spec.
>> I agree, As you said before, we can make fat fallocate feature as
>> configurable – so this is entirely in the hands of USER.
>>>
>>> I worry to break something. And I guess the freeing preallocation on
>>> last close may fix the issue for usage.
>> Okay, we can avoid most of your concerns except suddenly unplugging usb
>> device.
>> But fallocate behavior will be different with other filesystem.
>>
>> How about to make fat fallocate with configuration to be used by users
>> is having needs?
>
> Hmm... I'm not still convinced to add makes really apps happy. Maybe,
> I'm sill not understanding your usage. I think the out of spec feature
> wouldn't be added if it was just a "better than nothing".
>
> Thanks.
> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-17 10:57       ` Namjae Jeon
@ 2012-10-21 23:54         ` OGAWA Hirofumi
  2012-10-22 15:10           ` Namjae Jeon
  0 siblings, 1 reply; 20+ messages in thread
From: OGAWA Hirofumi @ 2012-10-21 23:54 UTC (permalink / raw)
  To: Namjae Jeon
  Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat, linux-fsdevel

Namjae Jeon <linkinjeon@gmail.com> writes:

>> The expectation of fallocate() is just for space reservation? If it was
>> just for space reservation, I'm not sure, why TV applications can't
>> reserve in userland without any kernel help (I wonder who interrupts TV
>> application). I feel a bit, it may be more lightweight than fallocate(),
>> and more reliable than out of spec fallocate().
>>
>> I'm still not sure why apps really want fallocate() on FAT.
> Yes, it is for user space reservation.
>
>>From the application perspective it is needed to realize in advance
> how much space is needed for that file write – so the requirement is
> precisely that the space reserved is entirely for me and no other I/O
> operation in that time should consume the space.
>
> Of course, as you said, space can be pre-allocated from user space by
> doing expanding truncate.
> Main drawbacks for reserving through USER space:
> 1) If we need to allocate 1GB space -> seek (1GB) and write -> it will
> ZEROUT the 1GB area (which is very time consuming) just for reserving
> space.
> 2) The Application must always be aware of the SEEK OFFSET - otherwise
> the APPEND WRITE will never occur and file is closed/opened (Append
> mode) again for writing – it will instead start writing from the end
> of file which is past the reserved space. So, this will also result in
> losing space in such case – if application is not keeping track of
> OFFSET
> 3) If suppose from user space we are doing expanding truncate of 1GB
> and suppose it fails after 256MB of allocation - in that case it did
> allocation of 256MB blocks - did ZEROUT for all these blocks and then
> returned failure - which is not optimal for just allocation of space.
>
> While if we make use of FALLOCATE
> 1)	It allows reserving the space in advance without any delay.
> 2)	Since, the space is reserved in advance. So, if suppose space is
> reserved for 1hour TV recording than any other application in the
> background cannot cause recording to fail in case of “no free space”
> left as it already pre-allocated space. Only other Applications will
> close.
> 3)	It allows for APPEND write to continue smoothly without actually
> keeping track of the file state, offset.
> 4)	Initially when the disk is not fragmented. It allows the
> possibility to get contiguous blocks and thus reducing fragmentation
> for that file.

OK.

Should TV recorder make sure it is reserving space with fallocate() for
each open() (or first open() after mount())? What fsck is going to do?
Or how to know fallocated space or corrupted space?

Does this break the linux fat driver doesn't know about this
fallocate()?  If so, it sounds like to be easy to break existent
drivers.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-21 23:54         ` OGAWA Hirofumi
@ 2012-10-22 15:10           ` Namjae Jeon
  2012-10-23  7:19             ` OGAWA Hirofumi
  0 siblings, 1 reply; 20+ messages in thread
From: Namjae Jeon @ 2012-10-22 15:10 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat, linux-fsdevel

2012/10/22, OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>:
> Namjae Jeon <linkinjeon@gmail.com> writes:
>
>>> The expectation of fallocate() is just for space reservation? If it was
>>> just for space reservation, I'm not sure, why TV applications can't
>>> reserve in userland without any kernel help (I wonder who interrupts TV
>>> application). I feel a bit, it may be more lightweight than fallocate(),
>>> and more reliable than out of spec fallocate().
>>>
>>> I'm still not sure why apps really want fallocate() on FAT.
>> Yes, it is for user space reservation.
>>
>>>From the application perspective it is needed to realize in advance
>> how much space is needed for that file write – so the requirement is
>> precisely that the space reserved is entirely for me and no other I/O
>> operation in that time should consume the space.
>>
>> Of course, as you said, space can be pre-allocated from user space by
>> doing expanding truncate.
>> Main drawbacks for reserving through USER space:
>> 1) If we need to allocate 1GB space -> seek (1GB) and write -> it will
>> ZEROUT the 1GB area (which is very time consuming) just for reserving
>> space.
>> 2) The Application must always be aware of the SEEK OFFSET - otherwise
>> the APPEND WRITE will never occur and file is closed/opened (Append
>> mode) again for writing – it will instead start writing from the end
>> of file which is past the reserved space. So, this will also result in
>> losing space in such case – if application is not keeping track of
>> OFFSET
>> 3) If suppose from user space we are doing expanding truncate of 1GB
>> and suppose it fails after 256MB of allocation - in that case it did
>> allocation of 256MB blocks - did ZEROUT for all these blocks and then
>> returned failure - which is not optimal for just allocation of space.
>>
>> While if we make use of FALLOCATE
>> 1)	It allows reserving the space in advance without any delay.
>> 2)	Since, the space is reserved in advance. So, if suppose space is
>> reserved for 1hour TV recording than any other application in the
>> background cannot cause recording to fail in case of “no free space”
>> left as it already pre-allocated space. Only other Applications will
>> close.
>> 3)	It allows for APPEND write to continue smoothly without actually
>> keeping track of the file state, offset.
>> 4)	Initially when the disk is not fragmented. It allows the
>> possibility to get contiguous blocks and thus reducing fragmentation
>> for that file.
>
> OK.
>
> Should TV recorder make sure it is reserving space with fallocate() for
> each open() (or first open() after mount())?
It is only for first open.

>What fsck is going to do?
> Or how to know fallocated space or corrupted space?
fsck does not know about fallocated space and considers it  corrupted
space due to mismatch between file size and disk usage. So it will
free up the allocated clusters, just like windows driver.

Fsck output for a 100MB prealloacted file.
---------------------------------------------------------------------------------------------------
# fsck.vfat -aw /dev/sdb3
dosfsck 3.0.12, 29 Oct 2011, FAT32, LFN
/falloc_file
  Bad short file name ( \000\005\016\013\032\022\013./\000\000).
  Auto-renaming it.
  Renamed to FSCK0000.001
/falloc_file
  File size is 6 bytes, cluster chain length is > 4096 bytes.
  Truncating file to 6 bytes.
Free cluster summary wrong (2541163 vs. really 2566762)
  Auto-correcting.
Performing changes.
/dev/sdb3: 5 files, 50565/2617327 clusters
--------------------------------------------------------------------------------------------------------

>
> Does this break the linux fat driver doesn't know about this
> fallocate()?  If so, it sounds like to be easy to break existent
> drivers.
Yes, it will break linux drivers without fallocate support. When we
try to write to fallocated file using old drivers, it will cause write
error and make FS read-only.
When fallocate was implemented in other filesystem, maybe,, was there
similar issue and concern  ?

 Thanks OGAWA!
>
> Thanks.
> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-22 15:10           ` Namjae Jeon
@ 2012-10-23  7:19             ` OGAWA Hirofumi
  2012-10-23  7:24               ` Namjae Jeon
  0 siblings, 1 reply; 20+ messages in thread
From: OGAWA Hirofumi @ 2012-10-23  7:19 UTC (permalink / raw)
  To: Namjae Jeon
  Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat, linux-fsdevel

Namjae Jeon <linkinjeon@gmail.com> writes:

>> Does this break the linux fat driver doesn't know about this
>> fallocate()?  If so, it sounds like to be easy to break existent
>> drivers.
> Yes, it will break linux drivers without fallocate support. When we
> try to write to fallocated file using old drivers, it will cause write
> error and make FS read-only.
> When fallocate was implemented in other filesystem, maybe,, was there
> similar issue and concern  ?

I guess it is not similar. Because other FS can change the spec (e.g. it
can be possible to use feature compat flag for it).
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2] fat: editions to support fat_fallocate()
  2012-10-23  7:19             ` OGAWA Hirofumi
@ 2012-10-23  7:24               ` Namjae Jeon
  0 siblings, 0 replies; 20+ messages in thread
From: Namjae Jeon @ 2012-10-23  7:24 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: akpm, linux-kernel, Ravishankar N, Amit Sahrawat, linux-fsdevel

2012/10/23, OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>:
> Namjae Jeon <linkinjeon@gmail.com> writes:
>
>>> Does this break the linux fat driver doesn't know about this
>>> fallocate()?  If so, it sounds like to be easy to break existent
>>> drivers.
>> Yes, it will break linux drivers without fallocate support. When we
>> try to write to fallocated file using old drivers, it will cause write
>> error and make FS read-only.
>> When fallocate was implemented in other filesystem, maybe,, was there
>> similar issue and concern  ?
>
> I guess it is not similar. Because other FS can change the spec (e.g. it
> can be possible to use feature compat flag for it).
Hi. OGAWA.
Okay, I agree.
Thanks for your interest.

> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Read support for fat_fallocate()?  (was  [v2] fat: editions to support fat_fallocate())
  2012-10-13 14:31 [PATCH v2] fat: editions to support fat_fallocate() Namjae Jeon
  2012-10-14 16:20 ` OGAWA Hirofumi
@ 2013-02-14  2:40 ` Andrew Bartlett
  2013-02-14  2:48 ` Andrew Bartlett
  2 siblings, 0 replies; 20+ messages in thread
From: Andrew Bartlett @ 2013-02-14  2:40 UTC (permalink / raw)
  To: Namjae Jeon; +Cc: hirofumi, linux-kernel, Ravishankar N, Amit Sahrawat

G'day,

I've been looking into the patch "[v2] fat: editions to support
fat_fallocate()" and I wonder if there is a way we can split this issue
in two, so that we get at least some of the patch into the kernel.

https://lkml.org/lkml/2012/10/13/75
https://patchwork.kernel.org/patch/1589161/

What I'm wanting to discuss (and perhaps implement, with you if
possible) is splitting this patch into writing to existing pre-allocated
files, and creating a new pre-allocation.

If Windows does, as you claim, simply read preallocations as zero, and
writes to them normally and without error, then Linux should do the
same.  Here of course I'm assuming that Windows is not preallocating,
but instead simply trying to recover gracefully and safely from a simple
'file system corruption', where the sectors are allocated but not used. 

The bulk of this patch is implementing this transparent recovery, and it
seem relatively harmless to include this into the kernel.

Then vendors doing TV streaming, or in my case copies of large files
onto Samba-mounted USB FAT devices, can add only the smaller patch to
implement fallocate, at their own risk and fully knowing that it will be
regarded as corrupt on Linux. 

If accepted read support will, over a period of years, trickle down to
other Linux users, broadening the base that can still read these
'corrupt' drives, no matter the cause. 

I hope you agree that this is a practical way forward, and I look
forward to working with you on this.

Thanks,

Andrew Bartlett
-- 
Andrew Bartlett                                http://samba.org/~abartlet/
Authentication Developer, Samba Team           http://samba.org



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Read support for fat_fallocate()?  (was  [v2] fat: editions to support fat_fallocate())
  2012-10-13 14:31 [PATCH v2] fat: editions to support fat_fallocate() Namjae Jeon
  2012-10-14 16:20 ` OGAWA Hirofumi
  2013-02-14  2:40 ` Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate()) Andrew Bartlett
@ 2013-02-14  2:48 ` Andrew Bartlett
  2013-02-14  6:44   ` Namjae Jeon
  2 siblings, 1 reply; 20+ messages in thread
From: Andrew Bartlett @ 2013-02-14  2:48 UTC (permalink / raw)
  To: Namjae Jeon; +Cc: hirofumi, linux-kernel, Ravishankar N, Amit Sahrawat

(apologies for the duplicate mail, I typo-ed the maintainers address)

G'day,

I've been looking into the patch "[v2] fat: editions to support
fat_fallocate()" and I wonder if there is a way we can split this issue
in two, so that we get at least some of the patch into the kernel.

https://lkml.org/lkml/2012/10/13/75
https://patchwork.kernel.org/patch/1589161/

What I'm wanting to discuss (and perhaps implement, with you if
possible) is splitting this patch into writing to existing pre-allocated
files, and creating a new pre-allocation.

If Windows does, as you claim, simply read preallocations as zero, and
writes to them normally and without error, then Linux should do the
same.  Here of course I'm assuming that Windows is not preallocating,
but instead simply trying to recover gracefully and safely from a simple
'file system corruption', where the sectors are allocated but not used. 

The bulk of this patch is implementing this transparent recovery, and it
seem relatively harmless to include this into the kernel.

Then vendors doing TV streaming, or in my case copies of large files
onto Samba-mounted USB FAT devices, can add only the smaller patch to
implement fallocate, at their own risk and fully knowing that it will be
regarded as corrupt on Linux. 

If accepted read support will, over a period of years, trickle down to
other Linux users, broadening the base that can still read these
'corrupt' drives, no matter the cause. 

I hope you agree that this is a practical way forward, and I look
forward to working with you on this.

Thanks,

Andrew Bartlett
-- 
Andrew Bartlett                                http://samba.org/~abartlet/
Authentication Developer, Samba Team           http://samba.org




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-14  2:48 ` Andrew Bartlett
@ 2013-02-14  6:44   ` Namjae Jeon
  2013-02-14  7:07     ` Andrew Bartlett
  0 siblings, 1 reply; 20+ messages in thread
From: Namjae Jeon @ 2013-02-14  6:44 UTC (permalink / raw)
  To: Andrew Bartlett
  Cc: hirofumi, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

2013/2/14, Andrew Bartlett <abartlet@samba.org>:
> (apologies for the duplicate mail, I typo-ed the maintainers address)
>
> G'day,
>
> I've been looking into the patch "[v2] fat: editions to support
> fat_fallocate()" and I wonder if there is a way we can split this issue
> in two, so that we get at least some of the patch into the kernel.
>
> https://lkml.org/lkml/2012/10/13/75
> https://patchwork.kernel.org/patch/1589161/
>
> What I'm wanting to discuss (and perhaps implement, with you if
> possible) is splitting this patch into writing to existing pre-allocated
> files, and creating a new pre-allocation.
>
> If Windows does, as you claim, simply read preallocations as zero, and
> writes to them normally and without error, then Linux should do the
> same.  Here of course I'm assuming that Windows is not preallocating,
> but instead simply trying to recover gracefully and safely from a simple
> 'file system corruption', where the sectors are allocated but not used.
>
> The bulk of this patch is implementing this transparent recovery, and it
> seem relatively harmless to include this into the kernel.
>
> Then vendors doing TV streaming, or in my case copies of large files
> onto Samba-mounted USB FAT devices, can add only the smaller patch to
> implement fallocate, at their own risk and fully knowing that it will be
> regarded as corrupt on Linux.
>
> If accepted read support will, over a period of years, trickle down to
> other Linux users, broadening the base that can still read these
> 'corrupt' drives, no matter the cause.
>
> I hope you agree that this is a practical way forward, and I look
> forward to working with you on this.
>
> Thanks,
Hi Andrew.

First, Thanks for your interest !
A mismatch between inode size and reserved blocks can be either due to
pre-allocation (after our changes) or due to corruption (sudden unplug
of media etc).
We don’t think it is right to include only read only support (i.e.
without fallocate support) for such files because if such files are
encountered it only means that the file is corrupted, as there is no
current method to check if the issue is due to pre-allocation.
If it is to be included in the kernel, then the whole patch has to go
in. But then again, since the FAT specifications do not accommodate
for pre-allocation, then it is up to OGAWA to decide if this is
acceptable.
In any case, the patch will definitely break backward compatibility
(on an older fat driver without fallocate support) and also in case
for the two variants for the same kernel versions and only one has
FALLOCATE enabled, in such cases also, the behavior will assume
corruption in one case.

Thanks.

>
> Andrew Bartlett
> --
> Andrew Bartlett                                http://samba.org/~abartlet/
> Authentication Developer, Samba Team           http://samba.org
>
>
>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-14  6:44   ` Namjae Jeon
@ 2013-02-14  7:07     ` Andrew Bartlett
  2013-02-14  9:52       ` Namjae Jeon
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Bartlett @ 2013-02-14  7:07 UTC (permalink / raw)
  To: Namjae Jeon
  Cc: hirofumi, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

On Thu, 2013-02-14 at 15:44 +0900, Namjae Jeon wrote:
> 2013/2/14, Andrew Bartlett <abartlet@samba.org>:
> > (apologies for the duplicate mail, I typo-ed the maintainers address)
> >
> > G'day,
> >
> > I've been looking into the patch "[v2] fat: editions to support
> > fat_fallocate()" and I wonder if there is a way we can split this issue
> > in two, so that we get at least some of the patch into the kernel.
> >
> > https://lkml.org/lkml/2012/10/13/75
> > https://patchwork.kernel.org/patch/1589161/
> >
> > What I'm wanting to discuss (and perhaps implement, with you if
> > possible) is splitting this patch into writing to existing pre-allocated
> > files, and creating a new pre-allocation.
> >
> > If Windows does, as you claim, simply read preallocations as zero, and
> > writes to them normally and without error, then Linux should do the
> > same.  Here of course I'm assuming that Windows is not preallocating,
> > but instead simply trying to recover gracefully and safely from a simple
> > 'file system corruption', where the sectors are allocated but not used.
> >
> > The bulk of this patch is implementing this transparent recovery, and it
> > seem relatively harmless to include this into the kernel.
> >
> > Then vendors doing TV streaming, or in my case copies of large files
> > onto Samba-mounted USB FAT devices, can add only the smaller patch to
> > implement fallocate, at their own risk and fully knowing that it will be
> > regarded as corrupt on Linux.
> >
> > If accepted read support will, over a period of years, trickle down to
> > other Linux users, broadening the base that can still read these
> > 'corrupt' drives, no matter the cause.
> >
> > I hope you agree that this is a practical way forward, and I look
> > forward to working with you on this.
> >
> > Thanks,
> Hi Andrew.
> 
> First, Thanks for your interest !
> A mismatch between inode size and reserved blocks can be either due to
> pre-allocation (after our changes) or due to corruption (sudden unplug
> of media etc).
> We don’t think it is right to include only read only support (i.e.
> without fallocate support) for such files because if such files are
> encountered it only means that the file is corrupted, as there is no
> current method to check if the issue is due to pre-allocation.
> If it is to be included in the kernel, then the whole patch has to go
> in. 

I don't see why that is the case. 

> But then again, since the FAT specifications do not accommodate
> for pre-allocation, then it is up to OGAWA to decide if this is
> acceptable.
> In any case, the patch will definitely break backward compatibility
> (on an older fat driver without fallocate support) and also in case
> for the two variants for the same kernel versions and only one has
> FALLOCATE enabled, in such cases also, the behavior will assume
> corruption in one case.

I agree that the sudden unplug is a concern, but why not make the
filesystem more robust against that inevitable occurrence?  If the
blocks appear to be allocated to the file, why not use them?

That is, while it is hard to predict the many different ways a
filesystem can be corrupted, what would go wrong if we did use these
clusters?  Do you fear that they might also be allocated to someone
else? 

That would, if I understand correctly just mean that that more broken,
not quite valid USB thumb drives and other FAT filesystems work equally
well on Windows and Linux, without administrative privileges.  (Given
that running fsck requires root, and isn't trivially available to normal
users in Linux, and I presume is similarly privileged in windows). 

What I'm doing is suggesting re-purposing your patch, from preallocation
to robustness.  In this light, do you think this worth pushing forward?

We can later address if there is any safe way to preallocate files on
FAT as a different question, hoping that this means it will 'just work'
on a broader range of other Linux hosts, just as it is claimed to 'just
work' on Windows.

Thanks,

Andrew Bartlett

-- 
Andrew Bartlett                                http://samba.org/~abartlet/
Authentication Developer, Samba Team           http://samba.org



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-14  7:07     ` Andrew Bartlett
@ 2013-02-14  9:52       ` Namjae Jeon
  2013-02-15  3:49         ` Andrew Bartlett
  0 siblings, 1 reply; 20+ messages in thread
From: Namjae Jeon @ 2013-02-14  9:52 UTC (permalink / raw)
  To: Andrew Bartlett
  Cc: hirofumi, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

[snip]
>> >
>> > Thanks,
>> Hi Andrew.
>>
>> First, Thanks for your interest !
>> A mismatch between inode size and reserved blocks can be either due to
>> pre-allocation (after our changes) or due to corruption (sudden unplug
>> of media etc).
>> We don’t think it is right to include only read only support (i.e.
>> without fallocate support) for such files because if such files are
>> encountered it only means that the file is corrupted, as there is no
>> current method to check if the issue is due to pre-allocation.
>> If it is to be included in the kernel, then the whole patch has to go
>> in.
>
> I don't see why that is the case.
If we consider that there is no FALLOCATE support, then the condition
of file size and blocks not matching can be only possible in case of
corruption, right?

>
>> But then again, since the FAT specifications do not accommodate
>> for pre-allocation, then it is up to OGAWA to decide if this is
>> acceptable.
>> In any case, the patch will definitely break backward compatibility
>> (on an older fat driver without fallocate support) and also in case
>> for the two variants for the same kernel versions and only one has
>> FALLOCATE enabled, in such cases also, the behavior will assume
>> corruption in one case.
>
> I agree that the sudden unplug is a concern, but why not make the
> filesystem more robust against that inevitable occurrence?  If the
> blocks appear to be allocated to the file, why not use them?
We also agree that there should be pre-allocation feature on FAT, and
we had shared the scenarios where this could be required for a TV/
recorder.
But there are certain drawbacks which were raised by OGAWA with
respect to compatibility and we also tend to agree on them.
There could possibly be an issue where we are unable to distinguish
between pre-allocation and corruption. Perhaps we could set a status
bit on the file to indicate whether the file has pre-allocated blocks.
This will make it clear whether the allocation is genuine through the
FAT Fallocate request or is a result of corruption. Depending on the
status of the flag - the decision can be made regard to reading
blocks.
But, the main issue in this will be storing this bit in on-disk
directory entry for that file. From the feature and usability point of
view, we should have fallocate on FAT too.

But it needs initial ACK from OGAWA to continue to work on this so
that we can figure out the proper solution to move forward.
>
> That is, while it is hard to predict the many different ways a
> filesystem can be corrupted, what would go wrong if we did use these
> clusters?  Do you fear that they might also be allocated to someone
> else?
>
> That would, if I understand correctly just mean that that more broken,
> not quite valid USB thumb drives and other FAT filesystems work equally
> well on Windows and Linux, without administrative privileges.  (Given
> that running fsck requires root, and isn't trivially available to normal
> users in Linux, and I presume is similarly privileged in windows).
>
> What I'm doing is suggesting re-purposing your patch, from preallocation
> to robustness.  In this light, do you think this worth pushing forward?
The patch’s main aim was to reserve space. If the work that you
propose only aims to enable reads in case of corrupt files using size
mismatch as a criteria, then we think it would not be a good idea.

Thanks :-)
>
> We can later address if there is any safe way to preallocate files on
> FAT as a different question, hoping that this means it will 'just work'
> on a broader range of other Linux hosts, just as it is claimed to 'just
> work' on Windows.
>
> Thanks,
>
> Andrew Bartlett
>
> --
> Andrew Bartlett                                http://samba.org/~abartlet/
> Authentication Developer, Samba Team           http://samba.org
>
>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-14  9:52       ` Namjae Jeon
@ 2013-02-15  3:49         ` Andrew Bartlett
  2013-02-18 11:36           ` OGAWA Hirofumi
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Bartlett @ 2013-02-15  3:49 UTC (permalink / raw)
  To: Namjae Jeon
  Cc: hirofumi, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

On Thu, 2013-02-14 at 18:52 +0900, Namjae Jeon wrote:
> [snip]
> >> >
> >> > Thanks,
> >> Hi Andrew.
> >>
> >> First, Thanks for your interest !
> >> A mismatch between inode size and reserved blocks can be either due to
> >> pre-allocation (after our changes) or due to corruption (sudden unplug
> >> of media etc).
> >> We don’t think it is right to include only read only support (i.e.
> >> without fallocate support) for such files because if such files are
> >> encountered it only means that the file is corrupted, as there is no
> >> current method to check if the issue is due to pre-allocation.
> >> If it is to be included in the kernel, then the whole patch has to go
> >> in.
> >
> > I don't see why that is the case.
> If we consider that there is no FALLOCATE support, then the condition
> of file size and blocks not matching can be only possible in case of
> corruption, right?

Sure.  I was just suggesting we transparently recover from that, by
using the blocks.  Think of it more as an online fsck not about
fallocate. 

Anyway, if you don't think it's reasonable to use those blocks, or to
'just fix it', then we just have to continue to do as we currently do.
That is on first sign of FS corruption just stop doing writes, and await
an FSCK.  

> >> But then again, since the FAT specifications do not accommodate
> >> for pre-allocation, then it is up to OGAWA to decide if this is
> >> acceptable.
> >> In any case, the patch will definitely break backward compatibility
> >> (on an older fat driver without fallocate support) and also in case
> >> for the two variants for the same kernel versions and only one has
> >> FALLOCATE enabled, in such cases also, the behavior will assume
> >> corruption in one case.
> >
> > I agree that the sudden unplug is a concern, but why not make the
> > filesystem more robust against that inevitable occurrence?  If the
> > blocks appear to be allocated to the file, why not use them?
> We also agree that there should be pre-allocation feature on FAT, and
> we had shared the scenarios where this could be required for a TV/
> recorder.
> But there are certain drawbacks which were raised by OGAWA with
> respect to compatibility and we also tend to agree on them.
> There could possibly be an issue where we are unable to distinguish
> between pre-allocation and corruption. Perhaps we could set a status
> bit on the file to indicate whether the file has pre-allocated blocks.
> This will make it clear whether the allocation is genuine through the
> FAT Fallocate request or is a result of corruption. Depending on the
> status of the flag - the decision can be made regard to reading
> blocks.
> But, the main issue in this will be storing this bit in on-disk
> directory entry for that file. From the feature and usability point of
> view, we should have fallocate on FAT too.
> 
> But it needs initial ACK from OGAWA to continue to work on this so
> that we can figure out the proper solution to move forward.

OK.  Given the need to find other approaches, I wanted to suggest some
ideas - some of which you may have already considered:

What about having a shadow FAT in a file, say called 'allocated space',
that would contain inode -> cluster list pairs, and where that file
would itself contain the free space the 'belongs' to other files?

As new clusters become needed in a file, they would simply be removed
from the 'allocated space' file, and assigned to the file they really
belong to.  That way, another OS just sees a large file, nothing more. 

Or, if we cannot make any changes to the on-disk format, what about
keeping such a database in memory, allocating some of the existing free
list to files that have had fallocate() called on them?  (Naturally,
this makes it non-persistent, and instead more of a 'hint', but could at
least solve our mutual performance issues). 

Or, could we leave allocated but unused clusters in the free cluster
list, but maintain a file with a hint that a particular file should use
a particular free cluster next, if available?  That list of 'allocated
free' clusters could be honoured by fallocate-aware OSs to reduce df and
increase du, but be ignored by other OSs, ensuring you could not run out
of space expanding a file in another OS.  

If a cluster was observed no longer to be in the real free list, it
would be ignored in the 'allocated free' list, to avoid corruption. 

In short, I see the restriction on not breaking existing implementations
as a difficult, but certainty not impossible problem. 

Thanks,

Andrew Bartlett

-- 
Andrew Bartlett                                http://samba.org/~abartlet/
Authentication Developer, Samba Team           http://samba.org



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-15  3:49         ` Andrew Bartlett
@ 2013-02-18 11:36           ` OGAWA Hirofumi
  2013-02-18 13:11             ` Andrew Bartlett
  2013-02-18 14:25             ` Namjae Jeon
  0 siblings, 2 replies; 20+ messages in thread
From: OGAWA Hirofumi @ 2013-02-18 11:36 UTC (permalink / raw)
  To: Andrew Bartlett
  Cc: Namjae Jeon, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

Andrew Bartlett <abartlet@samba.org> writes:

>> >> First, Thanks for your interest !
>> >> A mismatch between inode size and reserved blocks can be either due to
>> >> pre-allocation (after our changes) or due to corruption (sudden unplug
>> >> of media etc).
>> >> We don’t think it is right to include only read only support (i.e.
>> >> without fallocate support) for such files because if such files are
>> >> encountered it only means that the file is corrupted, as there is no
>> >> current method to check if the issue is due to pre-allocation.
>> >> If it is to be included in the kernel, then the whole patch has to go
>> >> in.
>> >
>> > I don't see why that is the case.
>> If we consider that there is no FALLOCATE support, then the condition
>> of file size and blocks not matching can be only possible in case of
>> corruption, right?
>
> Sure.  I was just suggesting we transparently recover from that, by
> using the blocks.  Think of it more as an online fsck not about
> fallocate. 
>
> Anyway, if you don't think it's reasonable to use those blocks, or to
> 'just fix it', then we just have to continue to do as we currently do.
> That is on first sign of FS corruption just stop doing writes, and await
> an FSCK.  

I'm not sure what is suggesting actually though. We have to consider
about synchronous runtime fsck makes normal path enough slower.

E.g. probably, in this case, all first open(2) of the inode will have to
walk cluster chain until end of cluster mark, to verify cluster chain.

>> >> But then again, since the FAT specifications do not accommodate
>> >> for pre-allocation, then it is up to OGAWA to decide if this is
>> >> acceptable.
>> >> In any case, the patch will definitely break backward compatibility
>> >> (on an older fat driver without fallocate support) and also in case
>> >> for the two variants for the same kernel versions and only one has
>> >> FALLOCATE enabled, in such cases also, the behavior will assume
>> >> corruption in one case.
>> >
>> > I agree that the sudden unplug is a concern, but why not make the
>> > filesystem more robust against that inevitable occurrence?  If the
>> > blocks appear to be allocated to the file, why not use them?
>> We also agree that there should be pre-allocation feature on FAT, and
>> we had shared the scenarios where this could be required for a TV/
>> recorder.
>> But there are certain drawbacks which were raised by OGAWA with
>> respect to compatibility and we also tend to agree on them.
>> There could possibly be an issue where we are unable to distinguish
>> between pre-allocation and corruption. Perhaps we could set a status
>> bit on the file to indicate whether the file has pre-allocated blocks.
>> This will make it clear whether the allocation is genuine through the
>> FAT Fallocate request or is a result of corruption. Depending on the
>> status of the flag - the decision can be made regard to reading
>> blocks.
>> But, the main issue in this will be storing this bit in on-disk
>> directory entry for that file. From the feature and usability point of
>> view, we should have fallocate on FAT too.
>> 
>> But it needs initial ACK from OGAWA to continue to work on this so
>> that we can figure out the proper solution to move forward.
>
> OK.  Given the need to find other approaches, I wanted to suggest some
> ideas - some of which you may have already considered:
>
> What about having a shadow FAT in a file, say called 'allocated space',
> that would contain inode -> cluster list pairs, and where that file
> would itself contain the free space the 'belongs' to other files?
>
> As new clusters become needed in a file, they would simply be removed
> from the 'allocated space' file, and assigned to the file they really
> belong to.  That way, another OS just sees a large file, nothing more. 
>
> Or, if we cannot make any changes to the on-disk format, what about
> keeping such a database in memory, allocating some of the existing free
> list to files that have had fallocate() called on them?  (Naturally,
> this makes it non-persistent, and instead more of a 'hint', but could at
> least solve our mutual performance issues).

[...]

Hm. My concerns are compatibility and reliability. Although We can
change on-disk format if need, but I don't think it can be compatible
and reliable. If so, who wants to use it? I feel there is no reason to
use FAT if there is no compatible.

Well, anyway, possible solution would be, we can pre-allocate physical
blocks via fallocate(2) or something, but discard pre-allocated blocks
at ->release() (or before unmount at least). This way would have
compatibility (no on-disk change over unmount) and possible breakage
would be same with normal extend write patterns on kernel crash
(i.e. Windows or fsck will truncate after i_size).

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-18 11:36           ` OGAWA Hirofumi
@ 2013-02-18 13:11             ` Andrew Bartlett
  2013-02-18 14:25             ` Namjae Jeon
  1 sibling, 0 replies; 20+ messages in thread
From: Andrew Bartlett @ 2013-02-18 13:11 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Namjae Jeon, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

On Mon, 2013-02-18 at 20:36 +0900, OGAWA Hirofumi wrote:
> Andrew Bartlett <abartlet@samba.org> writes:

> > Or, if we cannot make any changes to the on-disk format, what about
> > keeping such a database in memory, allocating some of the existing free
> > list to files that have had fallocate() called on them?  (Naturally,
> > this makes it non-persistent, and instead more of a 'hint', but could at
> > least solve our mutual performance issues).
> 
> [...]
> 
> Hm. My concerns are compatibility and reliability. Although We can
> change on-disk format if need, but I don't think it can be compatible
> and reliable. If so, who wants to use it? I feel there is no reason to
> use FAT if there is no compatible.
> 
> Well, anyway, possible solution would be, we can pre-allocate physical
> blocks via fallocate(2) or something, but discard pre-allocated blocks
> at ->release() (or before unmount at least). This way would have
> compatibility (no on-disk change over unmount) and possible breakage
> would be same with normal extend write patterns on kernel crash
> (i.e. Windows or fsck will truncate after i_size).

That would certainly give me what the Samba NAS with USB FAT disk use
case needs.

Thanks,

Andrew Bartlett

-- 
Andrew Bartlett                                http://samba.org/~abartlet/
Authentication Developer, Samba Team           http://samba.org



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-18 11:36           ` OGAWA Hirofumi
  2013-02-18 13:11             ` Andrew Bartlett
@ 2013-02-18 14:25             ` Namjae Jeon
  2013-02-18 14:59               ` OGAWA Hirofumi
  1 sibling, 1 reply; 20+ messages in thread
From: Namjae Jeon @ 2013-02-18 14:25 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Andrew Bartlett, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

2013/2/18 OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>:
> Andrew Bartlett <abartlet@samba.org> writes:
>
>>> >> First, Thanks for your interest !
>>> >> A mismatch between inode size and reserved blocks can be either due to
>>> >> pre-allocation (after our changes) or due to corruption (sudden unplug
>>> >> of media etc).
>>> >> We don’t think it is right to include only read only support (i.e.
>>> >> without fallocate support) for such files because if such files are
>>> >> encountered it only means that the file is corrupted, as there is no
>>> >> current method to check if the issue is due to pre-allocation.
>>> >> If it is to be included in the kernel, then the whole patch has to go
>>> >> in.
>>> >
>>> > I don't see why that is the case.
>>> If we consider that there is no FALLOCATE support, then the condition
>>> of file size and blocks not matching can be only possible in case of
>>> corruption, right?
>>
>> Sure.  I was just suggesting we transparently recover from that, by
>> using the blocks.  Think of it more as an online fsck not about
>> fallocate.
>>
>> Anyway, if you don't think it's reasonable to use those blocks, or to
>> 'just fix it', then we just have to continue to do as we currently do.
>> That is on first sign of FS corruption just stop doing writes, and await
>> an FSCK.
>
> I'm not sure what is suggesting actually though. We have to consider
> about synchronous runtime fsck makes normal path enough slower.
>
> E.g. probably, in this case, all first open(2) of the inode will have to
> walk cluster chain until end of cluster mark, to verify cluster chain.
>
>>> >> But then again, since the FAT specifications do not accommodate
>>> >> for pre-allocation, then it is up to OGAWA to decide if this is
>>> >> acceptable.
>>> >> In any case, the patch will definitely break backward compatibility
>>> >> (on an older fat driver without fallocate support) and also in case
>>> >> for the two variants for the same kernel versions and only one has
>>> >> FALLOCATE enabled, in such cases also, the behavior will assume
>>> >> corruption in one case.
>>> >
>>> > I agree that the sudden unplug is a concern, but why not make the
>>> > filesystem more robust against that inevitable occurrence?  If the
>>> > blocks appear to be allocated to the file, why not use them?
>>> We also agree that there should be pre-allocation feature on FAT, and
>>> we had shared the scenarios where this could be required for a TV/
>>> recorder.
>>> But there are certain drawbacks which were raised by OGAWA with
>>> respect to compatibility and we also tend to agree on them.
>>> There could possibly be an issue where we are unable to distinguish
>>> between pre-allocation and corruption. Perhaps we could set a status
>>> bit on the file to indicate whether the file has pre-allocated blocks.
>>> This will make it clear whether the allocation is genuine through the
>>> FAT Fallocate request or is a result of corruption. Depending on the
>>> status of the flag - the decision can be made regard to reading
>>> blocks.
>>> But, the main issue in this will be storing this bit in on-disk
>>> directory entry for that file. From the feature and usability point of
>>> view, we should have fallocate on FAT too.
>>>
>>> But it needs initial ACK from OGAWA to continue to work on this so
>>> that we can figure out the proper solution to move forward.
>>
>> OK.  Given the need to find other approaches, I wanted to suggest some
>> ideas - some of which you may have already considered:
>>
>> What about having a shadow FAT in a file, say called 'allocated space',
>> that would contain inode -> cluster list pairs, and where that file
>> would itself contain the free space the 'belongs' to other files?
>>
>> As new clusters become needed in a file, they would simply be removed
>> from the 'allocated space' file, and assigned to the file they really
>> belong to.  That way, another OS just sees a large file, nothing more.
>>
>> Or, if we cannot make any changes to the on-disk format, what about
>> keeping such a database in memory, allocating some of the existing free
>> list to files that have had fallocate() called on them?  (Naturally,
>> this makes it non-persistent, and instead more of a 'hint', but could at
>> least solve our mutual performance issues).
>
> [...]
>
> Hm. My concerns are compatibility and reliability. Although We can
> change on-disk format if need, but I don't think it can be compatible
> and reliable. If so, who wants to use it? I feel there is no reason to
> use FAT if there is no compatible.
>
> Well, anyway, possible solution would be, we can pre-allocate physical
> blocks via fallocate(2) or something, but discard pre-allocated blocks
> at ->release() (or before unmount at least). This way would have
> compatibility (no on-disk change over unmount) and possible breakage
> would be same with normal extend write patterns on kernel crash
> (i.e. Windows or fsck will truncate after i_size).
Hi OGAWA.
We don't need to consider device unplugging case ?
If yes, I can rework fat fallocate patch as your suggestion.
Thanks.
>
> Thanks.
> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-18 14:25             ` Namjae Jeon
@ 2013-02-18 14:59               ` OGAWA Hirofumi
  2013-02-18 15:15                 ` Namjae Jeon
  0 siblings, 1 reply; 20+ messages in thread
From: OGAWA Hirofumi @ 2013-02-18 14:59 UTC (permalink / raw)
  To: Namjae Jeon
  Cc: Andrew Bartlett, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

Namjae Jeon <linkinjeon@gmail.com> writes:

>> Hm. My concerns are compatibility and reliability. Although We can
>> change on-disk format if need, but I don't think it can be compatible
>> and reliable. If so, who wants to use it? I feel there is no reason to
>> use FAT if there is no compatible.
>>
>> Well, anyway, possible solution would be, we can pre-allocate physical
>> blocks via fallocate(2) or something, but discard pre-allocated blocks
>> at ->release() (or before unmount at least). This way would have
>> compatibility (no on-disk change over unmount) and possible breakage
>> would be same with normal extend write patterns on kernel crash
>> (i.e. Windows or fsck will truncate after i_size).
> Hi OGAWA.
> We don't need to consider device unplugging case ?
> If yes, I can rework fat fallocate patch as your suggestion.

In my suggestion, I think, kernel crash or something like unplugging
cases handles has no change from current way.

Any pre-allocated blocks are truncated by fsck as inconsistency state,
like crash before updating i_size for normal extend write.  I.e. across
unmount, nobody care whether pre-allocated or not. IOW, if there is
inconsistent between i_size and cluster chain (includes via
fallocate(2)) across unmount, it should be handled as broken state.

In short, the lifetime of pre-allocated blocks are from fallocate(2) to
->release() only.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate())
  2013-02-18 14:59               ` OGAWA Hirofumi
@ 2013-02-18 15:15                 ` Namjae Jeon
  0 siblings, 0 replies; 20+ messages in thread
From: Namjae Jeon @ 2013-02-18 15:15 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Andrew Bartlett, linux-kernel, Ravishankar N, Amit Sahrawat,
	Nam-Jae Jeon, Ravishankar N, Amit Sahrawat

2013/2/18 OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>:
> Namjae Jeon <linkinjeon@gmail.com> writes:
>
>>> Hm. My concerns are compatibility and reliability. Although We can
>>> change on-disk format if need, but I don't think it can be compatible
>>> and reliable. If so, who wants to use it? I feel there is no reason to
>>> use FAT if there is no compatible.
>>>
>>> Well, anyway, possible solution would be, we can pre-allocate physical
>>> blocks via fallocate(2) or something, but discard pre-allocated blocks
>>> at ->release() (or before unmount at least). This way would have
>>> compatibility (no on-disk change over unmount) and possible breakage
>>> would be same with normal extend write patterns on kernel crash
>>> (i.e. Windows or fsck will truncate after i_size).
>> Hi OGAWA.
>> We don't need to consider device unplugging case ?
>> If yes, I can rework fat fallocate patch as your suggestion.
>
> In my suggestion, I think, kernel crash or something like unplugging
> cases handles has no change from current way.
>
> Any pre-allocated blocks are truncated by fsck as inconsistency state,
> like crash before updating i_size for normal extend write.  I.e. across
> unmount, nobody care whether pre-allocated or not. IOW, if there is
> inconsistent between i_size and cluster chain (includes via
> fallocate(2)) across unmount, it should be handled as broken state.
>
> In short, the lifetime of pre-allocated blocks are from fallocate(2) to
> ->release() only.
Okay, I will post updated fat fallocate patch after looking into more.
Thanks.
>
> Thanks.
> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2013-02-18 15:15 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-10-13 14:31 [PATCH v2] fat: editions to support fat_fallocate() Namjae Jeon
2012-10-14 16:20 ` OGAWA Hirofumi
2012-10-16  4:12   ` Namjae Jeon
2012-10-16 10:14     ` OGAWA Hirofumi
2012-10-17 10:57       ` Namjae Jeon
2012-10-21 23:54         ` OGAWA Hirofumi
2012-10-22 15:10           ` Namjae Jeon
2012-10-23  7:19             ` OGAWA Hirofumi
2012-10-23  7:24               ` Namjae Jeon
2013-02-14  2:40 ` Read support for fat_fallocate()? (was [v2] fat: editions to support fat_fallocate()) Andrew Bartlett
2013-02-14  2:48 ` Andrew Bartlett
2013-02-14  6:44   ` Namjae Jeon
2013-02-14  7:07     ` Andrew Bartlett
2013-02-14  9:52       ` Namjae Jeon
2013-02-15  3:49         ` Andrew Bartlett
2013-02-18 11:36           ` OGAWA Hirofumi
2013-02-18 13:11             ` Andrew Bartlett
2013-02-18 14:25             ` Namjae Jeon
2013-02-18 14:59               ` OGAWA Hirofumi
2013-02-18 15:15                 ` Namjae Jeon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).