* [RFC][PATCH] first cut 64 bit block support
@ 2001-07-01  4:53 Ben LaHaise
  2001-07-03  4:53 ` Ragnar Kjørstad
  2001-07-04 10:16 ` [RFC][PATCH] first cut 64 bit block support Chris Wedgwood
  0 siblings, 2 replies; 71+ messages in thread
From: Ben LaHaise @ 2001-07-01  4:53 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel

Hey folks,

Below is the first cut at making the block device size limit configurable
to 64 bits on x86, as well as always 64 bits on 64-bit machines.  The
audit isn't complete yet, but a good chunk of it is done.
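
For reference, everything below keys off a new blkoff_t type and a
BLKOFF_FMT printk format selected by CONFIG_BLKOFF_LONGLONG.  The
include/linux/types.h change isn't shown in the quoted diff, so the
following is only a sketch of what the definitions presumably look like,
not the actual hunk:

/* hypothetical sketch -- the real types.h hunk may differ */
#ifdef CONFIG_BLKOFF_LONGLONG
typedef unsigned long long blkoff_t;	/* 64-bit block/sector numbers */
#define BLKOFF_FMT "Lu"			/* used as printk("%" BLKOFF_FMT ...) */
#else
typedef unsigned long blkoff_t;		/* old behaviour: 32 bits on x86 */
#define BLKOFF_FMT "lu"
#endif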

Filesystem           1k-blocks      Used Available Use% Mounted on
/dev/md1             7508125768        20 7476280496   1% /mnt/3

This is a 7TB ext2 filesystem with 4KB blocks.  The 7TB /dev/md1 consists
of 7x 1TB sparse files on loop devices raid0'd together.  The current
patch does not yet include the fixes for the SCSI layer or the IDE driver;
expect the SCSI fixes in the next version, although I'll need a tester.
The following should be 64-bit clean now: nbd, loop, raid0, raid1, raid5.
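
For anyone who wants to sanity check the 64-bit size reporting from
userspace, a quick hack along these lines should work against the md
driver.  This is a hypothetical test program, not part of the patch:
BLKGETSIZE64 is defined by hand to match the _IO(0x12,109) value the
patch adds to fs.h, and the md ioctl copies out nr_sects, so the value
comes back in 512-byte sectors:

/* hypothetical userspace check of the BLKGETSIZE64 ioctl added above */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

#ifndef BLKGETSIZE64
#define BLKGETSIZE64 _IO(0x12,109)	/* same value the patch puts in fs.h */
#endif

int main(int argc, char **argv)
{
	long long sectors;		/* md copies out nr_sects as long long */
	int fd = open(argc > 1 ? argv[1] : "/dev/md1", O_RDONLY);

	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &sectors) < 0) {
		perror("BLKGETSIZE64");
		return 1;
	}
	printf("%lld sectors, %lld MB\n", sectors, (sectors * 512) >> 20);
	close(fd);
	return 0;
}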

Ugly bits: I had to add libgcc.a to satisfy the need for 64-bit division.
Yeah, it sucks, but RAID needs some more massaging before I can remove the
64-bit division completely.  This will be fixed.
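
The usual kernel-side alternative, at least on i386, is do_div() from
<asm/div64.h>, which divides a 64-bit value in place by a 32-bit divisor
and returns the remainder, so nothing has to be pulled in from libgcc.
A minimal sketch of what a massaged raid call site could look like (the
names here are made up for illustration, not taken from the patch):

#include <linux/types.h>
#include <asm/div64.h>

/*
 * Sketch only: a 64-bit '/' and '%' pair expressed with do_div(), which
 * modifies its first argument in place and returns the remainder, so no
 * __udivdi3/__umoddi3 calls are emitted on x86.
 */
static inline u64 sector_to_stripe(u64 sector, unsigned int sectors_per_chunk,
				   unsigned int *chunk_offset)
{
	u64 stripe = sector;

	*chunk_offset = do_div(stripe, sectors_per_chunk);	/* remainder */
	return stripe;						/* quotient  */
}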

Copies of this and later versions of the patch are available at
http://people.redhat.com/bcrl/lb/ and http://www.kvack.org/~blah/lb/ .
Please forward any bug fixes or comments to me.  Cheers,

		-ben

::::v2.4.6-pre8-largeblock4.diff::::
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/Makefile lb-2.4.6-pre8/arch/i386/Makefile
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/Makefile	Thu May  3 11:22:07 2001
+++ lb-2.4.6-pre8/arch/i386/Makefile	Sun Jul  1 00:35:25 2001
@@ -92,6 +92,7 @@

 CORE_FILES := arch/i386/kernel/kernel.o arch/i386/mm/mm.o $(CORE_FILES)
 LIBS := $(TOPDIR)/arch/i386/lib/lib.a $(LIBS) $(TOPDIR)/arch/i386/lib/lib.a
+LIBS += $(shell gcc -print-libgcc-file-name)

 ifdef CONFIG_MATH_EMULATION
 SUBDIRS += arch/i386/math-emu
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/config.in lb-2.4.6-pre8/arch/i386/config.in
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/config.in	Sat Jun 30 14:04:26 2001
+++ lb-2.4.6-pre8/arch/i386/config.in	Sat Jun 30 15:37:37 2001
@@ -185,6 +185,7 @@
 mainmenu_option next_comment
 comment 'General setup'

+bool '64 bit block device support' CONFIG_BLKOFF_LONGLONG
 bool 'Networking support' CONFIG_NET
 bool 'SGI Visual Workstation support' CONFIG_VISWS
 if [ "$CONFIG_VISWS" = "y" ]; then
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/block/floppy.c lb-2.4.6-pre8/drivers/block/floppy.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/block/floppy.c	Mon Feb 26 10:20:05 2001
+++ lb-2.4.6-pre8/drivers/block/floppy.c	Sat Jun 30 16:23:07 2001
@@ -468,7 +468,7 @@
  */
 static struct floppy_struct user_params[N_DRIVE];

-static int floppy_sizes[256];
+static blkoff_t floppy_sizes[256];
 static int floppy_blocksizes[256];

 /*
@@ -2640,8 +2640,8 @@

 	max_sector = _floppy->sect * _floppy->head;

-	TRACK = CURRENT->sector / max_sector;
-	sector_t = CURRENT->sector % max_sector;
+	TRACK = (int)(CURRENT->sector) / max_sector;
+	sector_t = (int)(CURRENT->sector) % max_sector;
 	if (_floppy->track && TRACK >= _floppy->track) {
 		if (CURRENT->current_nr_sectors & 1) {
 			current_count_sectors = 1;
@@ -2982,7 +2982,7 @@

 	if (usage_count == 0) {
 		printk("warning: usage count=0, CURRENT=%p exiting\n", CURRENT);
-		printk("sect=%ld cmd=%d\n", CURRENT->sector, CURRENT->cmd);
+		printk("sect=%" BLKOFF_FMT " cmd=%d\n", CURRENT->sector, CURRENT->cmd);
 		return;
 	}
 	if (fdc_busy){
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/block/ll_rw_blk.c lb-2.4.6-pre8/drivers/block/ll_rw_blk.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/block/ll_rw_blk.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/block/ll_rw_blk.c	Sat Jun 30 15:38:40 2001
@@ -82,7 +82,7 @@
  *
  * if (!blk_size[MAJOR]) then no minor size checking is done.
  */
-int * blk_size[MAX_BLKDEV];
+blkoff_t *blk_size[MAX_BLKDEV];

 /*
  * blksize_size contains the size of all block-devices:
@@ -667,7 +667,8 @@
 static int __make_request(request_queue_t * q, int rw,
 				  struct buffer_head * bh)
 {
-	unsigned int sector, count;
+	blkoff_t sector;
+	unsigned count;
 	int max_segments = MAX_SEGMENTS;
 	struct request * req, *freereq = NULL;
 	int rw_ahead, max_sectors, el_ret;
@@ -859,7 +860,7 @@
 void generic_make_request (int rw, struct buffer_head * bh)
 {
 	int major = MAJOR(bh->b_rdev);
-	int minorsize = 0;
+	blkoff_t minorsize = 0;
 	request_queue_t *q;

 	if (!bh->b_end_io)
@@ -869,8 +870,8 @@
 	if (blk_size[major])
 		minorsize = blk_size[major][MINOR(bh->b_rdev)];
 	if (minorsize) {
-		unsigned long maxsector = (minorsize << 1) + 1;
-		unsigned long sector = bh->b_rsector;
+		blkoff_t maxsector = (minorsize << 1) + 1;
+		blkoff_t sector = bh->b_rsector;
 		unsigned int count = bh->b_size >> 9;

 		if (maxsector < count || maxsector - count < sector) {
@@ -881,8 +882,9 @@
 			   without checking the size of the device, e.g.,
 			   when mounting a device. */
 			printk(KERN_INFO
-			       "attempt to access beyond end of device\n");
-			printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
+			       "attempt to access beyond end of device\n"
+			       KERN_INFO "%s: rw=%d, want=%" BLKOFF_FMT
+			       ", limit=%" BLKOFF_FMT "\n",
 			       kdevname(bh->b_rdev), rw,
 			       (sector + count)>>1, minorsize);

@@ -905,7 +907,7 @@
 		if (!q) {
 			printk(KERN_ERR
 			       "generic_make_request: Trying to access "
-			       "nonexistent block-device %s (%ld)\n",
+			       "nonexistent block-device %s (%" BLKOFF_FMT ")\n",
 			       kdevname(bh->b_rdev), bh->b_rsector);
 			buffer_IO_error(bh);
 			break;
@@ -1114,7 +1116,7 @@

 	req->errors = 0;
 	if (!uptodate)
-		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
+		printk("end_request: I/O error, dev %s (%s), sector %" BLKOFF_FMT "\n",
 			kdevname(req->rq_dev), name, req->sector);

 	if ((bh = req->bh) != NULL) {
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/block/loop.c lb-2.4.6-pre8/drivers/block/loop.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/block/loop.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/block/loop.c	Sat Jun 30 23:41:37 2001
@@ -76,7 +76,7 @@

 static int max_loop = 8;
 static struct loop_device *loop_dev;
-static int *loop_sizes;
+static blkoff_t *loop_sizes;
 static int *loop_blksizes;
 static devfs_handle_t devfs_handle;      /*  For the directory */

@@ -84,7 +84,7 @@
  * Transfer functions
  */
 static int transfer_none(struct loop_device *lo, int cmd, char *raw_buf,
-			 char *loop_buf, int size, int real_block)
+			 char *loop_buf, int size, blkoff_t real_block)
 {
 	if (cmd == READ)
 		memcpy(loop_buf, raw_buf, size);
@@ -95,7 +95,7 @@
 }

 static int transfer_xor(struct loop_device *lo, int cmd, char *raw_buf,
-			char *loop_buf, int size, int real_block)
+			char *loop_buf, int size, blkoff_t real_block)
 {
 	char	*in, *out, *key;
 	int	i, keysize;
@@ -147,7 +147,7 @@

 #define MAX_DISK_SIZE 1024*1024*1024

-static int compute_loop_size(struct loop_device *lo, struct dentry * lo_dentry, kdev_t lodev)
+static blkoff_t compute_loop_size(struct loop_device *lo, struct dentry * lo_dentry, kdev_t lodev)
 {
 	if (S_ISREG(lo_dentry->d_inode->i_mode))
 		return (lo_dentry->d_inode->i_size - lo->lo_offset) >> BLOCK_SIZE_BITS;
@@ -172,7 +172,7 @@
 	struct address_space_operations *aops = mapping->a_ops;
 	struct page *page;
 	char *kaddr, *data;
-	unsigned long index;
+	blkoff_t index;
 	unsigned size, offset;
 	int len;

@@ -181,7 +181,7 @@
 	len = bh->b_size;
 	data = bh->b_data;
 	while (len > 0) {
-		int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize;
+		blkoff_t IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize;
 		size = PAGE_CACHE_SIZE - offset;
 		if (size > len)
 			size = len;
@@ -209,7 +209,7 @@
 	return 0;

 write_fail:
-	printk(KERN_ERR "loop: transfer error block %ld\n", index);
+	printk(KERN_ERR "loop: transfer error block %"BLKOFF_FMT"\n", index);
 	ClearPageUptodate(page);
 	kunmap(page);
 unlock:
@@ -232,7 +232,7 @@
 	unsigned long count = desc->count;
 	struct lo_read_data *p = (struct lo_read_data*)desc->buf;
 	struct loop_device *lo = p->lo;
-	int IV = page->index * (PAGE_CACHE_SIZE/p->bsize) + offset/p->bsize;
+	blkoff_t IV = (blkoff_t)page->index * (PAGE_CACHE_SIZE/p->bsize) + offset/p->bsize;

 	if (size > count)
 		size = count;
@@ -283,15 +283,27 @@

 	return bs;
 }
+static inline int loop_get_shift(struct loop_device *lo)
+{
+	int size = loop_get_bs(lo);
+	int i = 0;
+
+	while (size) {
+		i++;
+		size >>= 1;
+	}
+	return i;
+}

 static inline unsigned long loop_get_iv(struct loop_device *lo,
-					unsigned long sector)
+					blkoff_t sector)
 {
 	int bs = loop_get_bs(lo);
+	int shift = loop_get_shift(lo);
 	unsigned long offset, IV;

-	IV = sector / (bs >> 9) + lo->lo_offset / bs;
-	offset = ((sector % (bs >> 9)) << 9) + lo->lo_offset % bs;
+	IV = (sector >> (bs - 9)) + (lo->lo_offset >> shift);
+	offset = (sector & (bs - 1) & ~511) + (lo->lo_offset & (bs - 1));
 	if (offset >= bs)
 		IV++;

@@ -983,7 +995,7 @@
 	if (!loop_dev)
 		return -ENOMEM;

-	loop_sizes = kmalloc(max_loop * sizeof(int), GFP_KERNEL);
+	loop_sizes = kmalloc(max_loop * sizeof(blkoff_t), GFP_KERNEL);
 	if (!loop_sizes)
 		goto out_sizes;

@@ -1003,7 +1015,7 @@
 		spin_lock_init(&lo->lo_lock);
 	}

-	memset(loop_sizes, 0, max_loop * sizeof(int));
+	memset(loop_sizes, 0, max_loop * sizeof(blkoff_t));
 	memset(loop_blksizes, 0, max_loop * sizeof(int));
 	blk_size[MAJOR_NR] = loop_sizes;
 	blksize_size[MAJOR_NR] = loop_blksizes;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/block/nbd.c lb-2.4.6-pre8/drivers/block/nbd.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/block/nbd.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/block/nbd.c	Sat Jun 30 14:22:13 2001
@@ -56,7 +56,7 @@

 static int nbd_blksizes[MAX_NBD];
 static int nbd_blksize_bits[MAX_NBD];
-static int nbd_sizes[MAX_NBD];
+static blkoff_t nbd_sizes[MAX_NBD];
 static u64 nbd_bytesizes[MAX_NBD];

 static struct nbd_device nbd_dev[MAX_NBD];
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/ide/ide-cd.c lb-2.4.6-pre8/drivers/ide/ide-cd.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/ide/ide-cd.c	Fri May 25 22:48:09 2001
+++ lb-2.4.6-pre8/drivers/ide/ide-cd.c	Sat Jun 30 15:40:14 2001
@@ -1060,7 +1060,7 @@
 	   paranoid and check. */
 	if (rq->current_nr_sectors < (rq->bh->b_size >> SECTOR_BITS) &&
 	    (rq->sector % SECTORS_PER_FRAME) != 0) {
-		printk ("%s: cdrom_read_from_buffer: buffer botch (%ld)\n",
+		printk ("%s: cdrom_read_from_buffer: buffer botch (%" BLKOFF_FMT ")\n",
 			drive->name, rq->sector);
 		cdrom_end_request (0, drive);
 		return -1;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/ide/ide-probe.c lb-2.4.6-pre8/drivers/ide/ide-probe.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/ide/ide-probe.c	Thu Apr  5 11:53:40 2001
+++ lb-2.4.6-pre8/drivers/ide/ide-probe.c	Sat Jun 30 20:42:28 2001
@@ -759,7 +759,7 @@
 	}
 	minors    = units * (1<<PARTN_BITS);
 	gd        = kmalloc (sizeof(struct gendisk), GFP_KERNEL);
-	gd->sizes = kmalloc (minors * sizeof(int), GFP_KERNEL);
+	gd->sizes = kmalloc (minors * sizeof(blkoff_t), GFP_KERNEL);
 	gd->part  = kmalloc (minors * sizeof(struct hd_struct), GFP_KERNEL);
 	bs        = kmalloc (minors*sizeof(int), GFP_KERNEL);
 	max_sect  = kmalloc (minors*sizeof(int), GFP_KERNEL);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/ide/ide.c lb-2.4.6-pre8/drivers/ide/ide.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/ide/ide.c	Fri May 25 22:48:09 2001
+++ lb-2.4.6-pre8/drivers/ide/ide.c	Sat Jun 30 15:39:44 2001
@@ -881,7 +881,7 @@
 					  IN_BYTE(IDE_SECTOR_REG));
 				}
 				if (HWGROUP(drive)->rq)
-					printk(", sector=%ld", HWGROUP(drive)->rq->sector);
+					printk(", sector=%" BLKOFF_FMT, HWGROUP(drive)->rq->sector);
 			}
 		}
 #endif	/* FANCY_STATUS_DUMPS */
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/md/linear.c lb-2.4.6-pre8/drivers/md/linear.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/md/linear.c	Mon Feb 26 10:20:07 2001
+++ lb-2.4.6-pre8/drivers/md/linear.c	Sat Jun 30 16:26:55 2001
@@ -125,15 +125,14 @@
         linear_conf_t *conf = mddev_to_conf(mddev);
         struct linear_hash *hash;
         dev_info_t *tmp_dev;
-        long block;
+        blkoff_t block;

 	block = bh->b_rsector >> 1;
 	hash = conf->hash_table + (block / conf->smallest->size);

 	if (block >= (hash->dev0->size + hash->dev0->offset)) {
 		if (!hash->dev1) {
-			printk ("linear_make_request : hash->dev1==NULL for block %ld\n",
-						block);
+			printk ("linear_make_request : hash->dev1==NULL for block %"BLKOFF_FMT"\n", block);
 			buffer_IO_error(bh);
 			return 0;
 		}
@@ -143,7 +142,7 @@

 	if (block >= (tmp_dev->size + tmp_dev->offset)
 				|| block < tmp_dev->offset) {
-		printk ("linear_make_request: Block %ld out of bounds on dev %s size %ld offset %ld\n", block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
+		printk ("linear_make_request: Block %" BLKOFF_FMT " out of bounds on dev %s size %"BLKOFF_FMT" offset %"BLKOFF_FMT"\n", block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
 		buffer_IO_error(bh);
 		return 0;
 	}
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/md/lvm.c lb-2.4.6-pre8/drivers/md/lvm.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/md/lvm.c	Thu May  3 11:22:10 2001
+++ lb-2.4.6-pre8/drivers/md/lvm.c	Sat Jun 30 14:56:20 2001
@@ -1068,7 +1068,7 @@
 static int lvm_user_bmap(struct inode *inode, struct lv_bmap *user_result)
 {
 	struct buffer_head bh;
-	unsigned long block;
+	blkoff_t block;
 	int err;

 	if (get_user(block, &user_result->lv_block))
@@ -1481,8 +1481,8 @@
 	ulong index;
 	ulong pe_start;
 	ulong size = bh->b_size >> 9;
-	ulong rsector_tmp = bh->b_rsector;
-	ulong rsector_sav;
+	blkoff_t rsector_tmp = bh->b_rsector;
+	blkoff_t rsector_sav;
 	kdev_t rdev_tmp = bh->b_rdev;
 	kdev_t rdev_sav;
 	vg_t *vg_this = vg[VG_BLK(minor)];
@@ -1504,8 +1504,8 @@
 		return -1;
 	}

-	P_MAP("%s - lvm_map minor:%d  *rdev: %02d:%02d  *rsector: %lu  "
-	      "size:%lu\n",
+	P_MAP("%s - lvm_map minor:%d  *rdev: %02d:%02d  *rsector: %" BLKOFF_FMT
+	      "   size:%lu\n",
 	      lvm_name, minor,
 	      MAJOR(rdev_tmp),
 	      MINOR(rdev_tmp),
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/md/md.c lb-2.4.6-pre8/drivers/md/md.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/md/md.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/md/md.c	Sat Jun 30 21:34:30 2001
@@ -112,7 +112,7 @@
 static int md_maxreadahead[MAX_MD_DEVS];
 static mdk_thread_t *md_recovery_thread;

-int md_size[MAX_MD_DEVS];
+blkoff_t md_size[MAX_MD_DEVS];

 extern struct block_device_operations md_fops;
 static devfs_handle_t devfs_handle;
@@ -803,7 +803,7 @@

 static void print_rdev(mdk_rdev_t *rdev)
 {
-	printk("md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+	printk("md: rdev %s: O:%s, SZ:%08"BLKOFF_FMT" F:%d DN:%d ",
 		partition_name(rdev->dev), partition_name(rdev->old_dev),
 		rdev->size, rdev->faulty, rdev->desc_nr);
 	if (rdev->sb) {
@@ -912,7 +912,7 @@
 {
 	struct buffer_head *bh;
 	kdev_t dev;
-	unsigned long sb_offset, size;
+	blkoff_t sb_offset, size;
 	mdp_super_t *sb;

 	if (!rdev->sb) {
@@ -931,7 +931,7 @@
 	dev = rdev->dev;
 	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
 	if (rdev->sb_offset != sb_offset) {
-		printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
+		printk("%s's sb offset has changed from %"BLKOFF_FMT" to %"BLKOFF_FMT", skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
 		goto skip;
 	}
 	/*
@@ -941,11 +941,11 @@
 	 */
 	size = calc_dev_size(dev, rdev->mddev, 1);
 	if (size != rdev->size) {
-		printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size);
+		printk("%s's size has changed from %"BLKOFF_FMT" to %"BLKOFF_FMT" since import, skipping\n", partition_name(dev), rdev->size, size);
 		goto skip;
 	}

-	printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+	printk("(write) %s's sb offset: %"BLKOFF_FMT"\n", partition_name(dev), sb_offset);
 	fsync_dev(dev);
 	set_blocksize(dev, MD_SB_BYTES);
 	bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
@@ -1485,7 +1485,7 @@
 		rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
 		if (rdev->size < sb->chunk_size / 1024) {
 			printk (KERN_WARNING
-				"md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+				"md: Dev %s smaller than chunk_size: %"BLKOFF_FMT"k < %dk\n",
 				partition_name(rdev->dev),
 				rdev->size, sb->chunk_size / 1024);
 			return -EINVAL;
@@ -2508,10 +2508,27 @@
 				err = -EINVAL;
 				goto abort;
 			}
+			if ((long)md_hd_struct[minor].nr_sects !=
+			    md_hd_struct[minor].nr_sects) {
+				err = -EOVERFLOW;
+				goto abort;
+			}
 			err = md_put_user(md_hd_struct[minor].nr_sects,
 						(long *) arg);
 			goto done;

+		case BLKGETSIZE64:   /* Return device size */
+		{
+			long long val = md_hd_struct[minor].nr_sects;
+			if (!arg) {
+				err = -EINVAL;
+				goto abort;
+			}
+			err = 0;
+			if (copy_to_user((void *)arg, &val, sizeof val))
+				err = -EFAULT;
+			goto done;
+		}
 		case BLKFLSBUF:
 			fsync_dev(dev);
 			invalidate_buffers(dev);
@@ -3051,7 +3068,8 @@
 static int md_status_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 {
-	int sz = 0, j, size;
+	blkoff_t size;
+	int sz = 0, j;
 	struct md_list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev;
 	mddev_t *mddev;
@@ -3092,10 +3110,10 @@

 		if (mddev->nb_dev) {
 			if (mddev->pers)
-				sz += sprintf(page + sz, "\n      %d blocks",
+				sz += sprintf(page + sz, "\n      %" BLKOFF_FMT " blocks",
 						 md_size[mdidx(mddev)]);
 			else
-				sz += sprintf(page + sz, "\n      %d blocks", size);
+				sz += sprintf(page + sz, "\n      %" BLKOFF_FMT " blocks", size);
 		}

 		if (!mddev->pers) {
@@ -3226,8 +3244,9 @@
 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
 {
 	mddev_t *mddev2;
-	unsigned int max_sectors, currspeed,
-		j, window, err, serialize;
+	blkoff_t max_sectors, j;
+	unsigned int currspeed,
+		window, err, serialize;
 	kdev_t read_disk = mddev_to_kdev(mddev);
 	unsigned long mark[SYNC_MARKS];
 	unsigned long mark_cnt[SYNC_MARKS];
@@ -3288,7 +3307,7 @@
 	 * Tune reconstruction:
 	 */
 	window = MAX_READAHEAD*(PAGE_SIZE/512);
-	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window/2,max_sectors/2);
+	printk(KERN_INFO "md: using %dk window, over a total of %" BLKOFF_FMT " blocks.\n",window/2,max_sectors/2);

 	atomic_set(&mddev->recovery_active, 0);
 	init_waitqueue_head(&mddev->recovery_wait);
@@ -3306,6 +3325,7 @@
 		j += sectors;
 		mddev->curr_resync = j;

+		/* README: Uhhh, is this right?  last_check is always 0 here */
 		if (last_check + window > j)
 			continue;

diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/md/raid0.c lb-2.4.6-pre8/drivers/md/raid0.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/md/raid0.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/md/raid0.c	Sat Jun 30 23:47:30 2001
@@ -41,7 +41,7 @@
 		printk("raid0: looking at %s\n", partition_name(rdev1->dev));
 		c = 0;
 		ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
-			printk("raid0:   comparing %s(%ld) with %s(%ld)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
+			printk("raid0:   comparing %s(%"BLKOFF_FMT") with %s(%"BLKOFF_FMT")\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
 			if (rdev2 == rdev1) {
 				printk("raid0:   END\n");
 				break;
@@ -95,7 +95,7 @@
 				c++;
 				if (!smallest || (rdev->size <smallest->size)) {
 					smallest = rdev;
-					printk("  (%ld) is smallest!.\n", rdev->size);
+					printk("  (%"BLKOFF_FMT") is smallest!.\n", rdev->size);
 				}
 			} else
 				printk(" nope.\n");
@@ -103,7 +103,7 @@

 		zone->nb_dev = c;
 		zone->size = (smallest->size - current_offset) * c;
-		printk("raid0: zone->nb_dev: %d, size: %ld\n",zone->nb_dev,zone->size);
+		printk("raid0: zone->nb_dev: %d, size: %"BLKOFF_FMT"\n",zone->nb_dev,zone->size);

 		if (!conf->smallest || (zone->size < conf->smallest->size))
 			conf->smallest = zone;
@@ -138,8 +138,8 @@
 	if (create_strip_zones (mddev))
 		goto out_free_conf;

-	printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
-	printk("raid0 : conf->smallest->size is %ld blocks.\n", conf->smallest->size);
+	printk("raid0 : md_size is %" BLKOFF_FMT " blocks.\n", md_size[mdidx(mddev)]);
+	printk("raid0 : conf->smallest->size is %" BLKOFF_FMT " blocks.\n", conf->smallest->size);
 	nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
 			(md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
 	printk("raid0 : nb_zone is %ld.\n", nb_zone);
@@ -231,7 +231,8 @@
 	struct raid0_hash *hash;
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
-	unsigned long chunk, block, rsect;
+	blkoff_t chunk;
+	blkoff_t block, rsect;

 	chunk_size = mddev->param.chunk_size >> 10;
 	chunksize_bits = ffz(~chunk_size);
@@ -239,7 +240,7 @@
 	hash = conf->hash_table + block / conf->smallest->size;

 	/* Sanity check */
-	if (chunk_size < (block % chunk_size) + (bh->b_size >> 10))
+	if (chunk_size < (block & (chunk_size-1)) + (bh->b_size >> 10))
 		goto bad_map;

 	if (!hash)
@@ -274,16 +275,16 @@
 	return 1;

 bad_map:
-	printk ("raid0_make_request bug: can't convert block across chunks or bigger than %dk %ld %d\n", chunk_size, bh->b_rsector, bh->b_size >> 10);
+	printk ("raid0_make_request bug: can't convert block across chunks or bigger than %dk %"BLKOFF_FMT" %d\n", chunk_size, bh->b_rsector, bh->b_size >> 10);
 	goto outerr;
 bad_hash:
-	printk("raid0_make_request bug: hash==NULL for block %ld\n", block);
+	printk("raid0_make_request bug: hash==NULL for block %"BLKOFF_FMT"\n", block);
 	goto outerr;
 bad_zone0:
-	printk ("raid0_make_request bug: hash->zone0==NULL for block %ld\n", block);
+	printk ("raid0_make_request bug: hash->zone0==NULL for block %"BLKOFF_FMT"\n", block);
 	goto outerr;
 bad_zone1:
-	printk ("raid0_make_request bug: hash->zone1==NULL for block %ld\n", block);
+	printk ("raid0_make_request bug: hash->zone1==NULL for block %"BLKOFF_FMT"\n", block);
  outerr:
 	buffer_IO_error(bh);
 	return 0;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/md/raid1.c lb-2.4.6-pre8/drivers/md/raid1.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/md/raid1.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/md/raid1.c	Sat Jun 30 23:48:47 2001
@@ -335,7 +335,7 @@
 }


-static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
+static void inline io_request_done(blkoff_t sector, raid1_conf_t *conf, int phase)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&conf->segment_lock, flags);
@@ -417,7 +417,7 @@
 		/*
 		 * oops, read error:
 		 */
-		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
+		printk(KERN_ERR "raid1: %s: rescheduling block %"BLKOFF_FMT"\n",
 			 partition_name(bh->b_dev), bh->b_blocknr);
 		raid1_reschedule_retry(r1_bh);
 		return;
@@ -450,10 +450,10 @@
 {
 	int new_disk = conf->last_used;
 	const int sectors = bh->b_size >> 9;
-	const unsigned long this_sector = bh->b_rsector;
+	const blkoff_t this_sector = bh->b_rsector;
 	int disk = new_disk;
-	unsigned long new_distance;
-	unsigned long current_distance;
+	blkoff_t new_distance;
+	blkoff_t current_distance;

 	/*
 	 * Check if it is sane at all to balance
@@ -510,9 +510,9 @@

 		goto rb_out;
 	}
-
-	current_distance = abs(this_sector -
-				conf->mirrors[disk].head_position);
+	current_distance = (this_sector > conf->mirrors[disk].head_position) ?
+			this_sector - conf->mirrors[disk].head_position :
+			conf->mirrors[disk].head_position - this_sector;

 	/* Find the disk which is closest */

@@ -525,8 +525,9 @@
 				(!conf->mirrors[disk].operational))
 			continue;

-		new_distance = abs(this_sector -
-					conf->mirrors[disk].head_position);
+		new_distance = (this_sector > conf->mirrors[disk].head_position) ?
+			this_sector - conf->mirrors[disk].head_position :
+			conf->mirrors[disk].head_position - this_sector;

 		if (new_distance < current_distance) {
 			conf->sect_count = 0;
@@ -1088,10 +1089,10 @@


 #define IO_ERROR KERN_ALERT \
-"raid1: %s: unrecoverable I/O read error for block %lu\n"
+"raid1: %s: unrecoverable I/O read error for block %"BLKOFF_FMT"\n"

 #define REDIRECT_SECTOR KERN_ERR \
-"raid1: %s: redirecting sector %lu to another mirror\n"
+"raid1: %s: redirecting sector %"BLKOFF_FMT" to another mirror\n"

 /*
  * This is a kernel thread which:
@@ -1304,7 +1305,7 @@
  * issue suitable write requests
  */

-static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
+static int raid1_sync_request (mddev_t *mddev, blkoff_t sector_nr)
 {
 	raid1_conf_t *conf = mddev_to_conf(mddev);
 	struct mirror_info *mirror;
@@ -1312,7 +1313,7 @@
 	struct buffer_head *bh;
 	int bsize;
 	int disk;
-	int block_nr;
+	blkoff_t block_nr;

 	spin_lock_irq(&conf->segment_lock);
 	if (!sector_nr) {
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/md/raid5.c lb-2.4.6-pre8/drivers/md/raid5.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/md/raid5.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/md/raid5.c	Sat Jun 30 16:04:05 2001
@@ -204,7 +204,7 @@
 	for (i=disks; i--; ) {
 		if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
 		    buffer_locked(sh->bh_cache[i])) {
-			printk("sector=%lx i=%d %p %p %p %d\n",
+			printk("sector=%"BLKOFF_FMT" i=%d %p %p %p %d\n",
 			       sh->sector, i, sh->bh_read[i],
 			       sh->bh_write[i], sh->bh_written[i],
 			       buffer_locked(sh->bh_cache[i]));
@@ -464,7 +464,7 @@
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	struct buffer_head *bh = sh->bh_cache[i];
-	unsigned long block = sh->sector / (sh->size >> 9);
+	blkoff_t block = sh->sector / (sh->size >> 9);

 	init_buffer(bh, raid5_end_read_request, sh);
 	bh->b_dev       = conf->disks[i].dev;
@@ -539,7 +539,7 @@
  * Input: a 'big' sector number,
  * Output: index of the data and parity disk, and the sector # in them.
  */
-static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
+static unsigned long raid5_compute_sector(blkoff_t r_sector, unsigned int raid_disks,
 			unsigned int data_disks, unsigned int * dd_idx,
 			unsigned int * pd_idx, raid5_conf_t *conf)
 {
@@ -607,12 +607,12 @@
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
-	unsigned long new_sector = sh->sector, check;
+	blkoff_t new_sector = sh->sector, check;
 	int sectors_per_chunk = conf->chunk_size >> 9;
-	unsigned long stripe = new_sector / sectors_per_chunk;
+	blkoff_t stripe = new_sector / sectors_per_chunk;
 	int chunk_offset = new_sector % sectors_per_chunk;
 	int chunk_number, dummy1, dummy2, dd_idx = i;
-	unsigned long r_sector, blocknr;
+	blkoff_t r_sector, blocknr;

 	switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
@@ -670,7 +670,7 @@
 		if (buffer_uptodate(bh))
 			bh_ptr[count++] = bh;
 		else
-			printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
+			printk("compute_block() %d, stripe %"BLKOFF_FMT", %d not present\n", dd_idx, sh->sector, i);

 		check_xor();
 	}
@@ -781,7 +781,7 @@
 	else
 		bhp = &sh->bh_write[dd_idx];
 	while (*bhp) {
-		printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
+		printk(KERN_NOTICE "raid5: multiple %d requests for sector %"BLKOFF_FMT"\n", rw, sh->sector);
 		bhp = & (*bhp)->b_reqnext;
 	}
 	*bhp = bh;
@@ -1236,18 +1236,18 @@
 	return correct_size;
 }

-static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
+static int raid5_sync_request (mddev_t *mddev, blkoff_t sector_nr)
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
 	int sectors_per_chunk = conf->chunk_size >> 9;
-	unsigned long stripe = sector_nr/sectors_per_chunk;
+	blkoff_t stripe = sector_nr/sectors_per_chunk;
 	int chunk_offset = sector_nr % sectors_per_chunk;
 	int dd_idx, pd_idx;
-	unsigned long first_sector;
+	blkoff_t first_sector;
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
-	int redone = 0;
+	blkoff_t redone = 0;
 	int bufsize;

 	sh = get_active_stripe(conf, sector_nr, 0, 0);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/scsi/scsi_lib.c lb-2.4.6-pre8/drivers/scsi/scsi_lib.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/scsi/scsi_lib.c	Fri May 25 22:48:09 2001
+++ lb-2.4.6-pre8/drivers/scsi/scsi_lib.c	Sat Jun 30 16:07:29 2001
@@ -369,7 +369,7 @@
 	req = &SCpnt->request;
 	req->errors = 0;
 	if (!uptodate) {
-		printk(" I/O error: dev %s, sector %lu\n",
+		printk(" I/O error: dev %s, sector %"BLKOFF_FMT"\n",
 		       kdevname(req->rq_dev), req->sector);
 	}
 	do {
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/scsi/sd.c lb-2.4.6-pre8/drivers/scsi/sd.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/scsi/sd.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/drivers/scsi/sd.c	Sat Jun 30 14:22:13 2001
@@ -81,7 +81,7 @@
 struct hd_struct *sd;

 static Scsi_Disk *rscsi_disks;
-static int *sd_sizes;
+static blkoff_t *sd_sizes;
 static int *sd_blocksizes;
 static int *sd_hardsizes;	/* Hardware sector size */

@@ -1050,10 +1050,11 @@
 	memset(rscsi_disks, 0, sd_template.dev_max * sizeof(Scsi_Disk));

 	/* for every (necessary) major: */
-	sd_sizes = kmalloc((sd_template.dev_max << 4) * sizeof(int), GFP_ATOMIC);
+	/* FIXME: GFP_ATOMIC???  Someone please pass the pipe... */
+	sd_sizes = kmalloc((sd_template.dev_max << 4) * sizeof(blkoff_t), GFP_ATOMIC);
 	if (!sd_sizes)
 		goto cleanup_disks;
-	memset(sd_sizes, 0, (sd_template.dev_max << 4) * sizeof(int));
+	memset(sd_sizes, 0, (sd_template.dev_max << 4) * sizeof(blkoff_t));

 	sd_blocksizes = kmalloc((sd_template.dev_max << 4) * sizeof(int), GFP_ATOMIC);
 	if (!sd_blocksizes)
diff -ur /md0/kernels/2.4/v2.4.6-pre8/drivers/scsi/sr.c lb-2.4.6-pre8/drivers/scsi/sr.c
--- /md0/kernels/2.4/v2.4.6-pre8/drivers/scsi/sr.c	Fri May 25 22:48:09 2001
+++ lb-2.4.6-pre8/drivers/scsi/sr.c	Sat Jun 30 22:00:58 2001
@@ -85,7 +85,7 @@
 };

 Scsi_CD *scsi_CDs;
-static int *sr_sizes;
+static blkoff_t *sr_sizes;

 static int *sr_blocksizes;

@@ -766,10 +766,10 @@
 		goto cleanup_devfs;
 	memset(scsi_CDs, 0, sr_template.dev_max * sizeof(Scsi_CD));

-	sr_sizes = kmalloc(sr_template.dev_max * sizeof(int), GFP_ATOMIC);
+	sr_sizes = kmalloc(sr_template.dev_max * sizeof(blkoff_t), GFP_ATOMIC);
 	if (!sr_sizes)
 		goto cleanup_cds;
-	memset(sr_sizes, 0, sr_template.dev_max * sizeof(int));
+	memset(sr_sizes, 0, sr_template.dev_max * sizeof(blkoff_t));

 	sr_blocksizes = kmalloc(sr_template.dev_max * sizeof(int), GFP_ATOMIC);
 	if (!sr_blocksizes)
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/adfs/adfs.h lb-2.4.6-pre8/fs/adfs/adfs.h
--- /md0/kernels/2.4/v2.4.6-pre8/fs/adfs/adfs.h	Mon Sep 18 18:14:06 2000
+++ lb-2.4.6-pre8/fs/adfs/adfs.h	Sat Jun 30 15:27:18 2001
@@ -66,7 +66,7 @@

 /* Inode stuff */
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0)
-int adfs_get_block(struct inode *inode, long block,
+int adfs_get_block(struct inode *inode, blkoff_t block,
 		   struct buffer_head *bh, int create);
 #else
 int adfs_bmap(struct inode *inode, int block);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/adfs/inode.c lb-2.4.6-pre8/fs/adfs/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/adfs/inode.c	Fri Dec 29 17:07:57 2000
+++ lb-2.4.6-pre8/fs/adfs/inode.c	Sat Jun 30 15:25:32 2001
@@ -25,7 +25,7 @@
  * not support creation of new blocks, so we return -EIO for this case.
  */
 int
-adfs_get_block(struct inode *inode, long block, struct buffer_head *bh, int create)
+adfs_get_block(struct inode *inode, blkoff_t block, struct buffer_head *bh, int create)
 {
 	if (block < 0)
 		goto abort_negative;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/affs/file.c lb-2.4.6-pre8/fs/affs/file.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/affs/file.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/affs/file.c	Sat Jun 30 15:28:09 2001
@@ -38,7 +38,6 @@
 static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext);
 static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext);
 static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
-static int affs_get_block(struct inode *inode, long block, struct buffer_head *bh_result, int create);

 static ssize_t affs_file_write(struct file *filp, const char *buf, size_t count, loff_t *ppos);
 static int affs_file_open(struct inode *inode, struct file *filp);
@@ -331,7 +330,7 @@
 }

 static int
-affs_get_block(struct inode *inode, long block, struct buffer_head *bh_result, int create)
+affs_get_block(struct inode *inode, blkoff_t block, struct buffer_head *bh_result, int create)
 {
 	struct super_block	*sb = inode->i_sb;
 	struct buffer_head	*ext_bh;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/affs/super.c lb-2.4.6-pre8/fs/affs/super.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/affs/super.c	Thu May  3 11:22:16 2001
+++ lb-2.4.6-pre8/fs/affs/super.c	Sat Jun 30 14:22:13 2001
@@ -29,7 +29,6 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>

-extern int *blk_size[];
 extern struct timezone sys_tz;

 static int affs_statfs(struct super_block *sb, struct statfs *buf);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/bfs/file.c lb-2.4.6-pre8/fs/bfs/file.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/bfs/file.c	Mon Dec  4 22:02:45 2000
+++ lb-2.4.6-pre8/fs/bfs/file.c	Sat Jun 30 15:26:50 2001
@@ -53,7 +53,7 @@
 	return 0;
 }

-static int bfs_get_block(struct inode * inode, long block,
+static int bfs_get_block(struct inode * inode, blkoff_t block,
 	struct buffer_head * bh_result, int create)
 {
 	long phys;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/block_dev.c lb-2.4.6-pre8/fs/block_dev.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/block_dev.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/block_dev.c	Sat Jun 30 15:01:36 2001
@@ -14,12 +14,10 @@
 #include <linux/major.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/smp_lock.h>
+#include <linux/blkdev.h>

 #include <asm/uaccess.h>

-extern int *blk_size[];
-extern int *blksize_size[];
-
 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
 #define NBUF 64

@@ -28,15 +26,15 @@
 {
 	struct inode * inode = filp->f_dentry->d_inode;
 	ssize_t blocksize, blocksize_bits, i, buffercount, write_error;
-	ssize_t block, blocks;
+	ssize_t blocks;
 	loff_t offset;
 	ssize_t chars;
 	ssize_t written, retval;
 	struct buffer_head * bhlist[NBUF];
-	size_t size;
 	kdev_t dev = inode->i_rdev;
 	struct buffer_head * bh, *bufferlist[NBUF];
 	register char * p;
+	blkoff_t block, size;

 	if (is_read_only(dev))
 		return -EPERM;
@@ -57,7 +55,7 @@
 	offset = *ppos & (blocksize-1);

 	if (blk_size[MAJOR(dev)])
-		size = ((loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS) >> blocksize_bits;
+		size = ((unsigned long long) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS) >> blocksize_bits;
 	else
 		size = INT_MAX;
 	while (count>0) {
@@ -177,7 +175,6 @@
 ssize_t block_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
 {
 	struct inode * inode = filp->f_dentry->d_inode;
-	size_t block;
 	loff_t offset;
 	ssize_t blocksize;
 	ssize_t blocksize_bits, i;
@@ -190,6 +187,7 @@
 	loff_t size;
 	kdev_t dev;
 	ssize_t read;
+	blkoff_t block;

 	dev = inode->i_rdev;
 	blocksize = BLOCK_SIZE;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/buffer.c lb-2.4.6-pre8/fs/buffer.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/buffer.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/buffer.c	Sat Jun 30 14:22:13 2001
@@ -531,7 +531,7 @@
  * will force it bad). This shouldn't really happen currently, but
  * the code is ready.
  */
-static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
+static inline struct buffer_head * __get_hash_table(kdev_t dev, blkoff_t block, int size)
 {
 	struct buffer_head *bh = hash(dev, block);

@@ -546,7 +546,7 @@
 	return bh;
 }

-struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
+struct buffer_head * get_hash_table(kdev_t dev, blkoff_t block, int size)
 {
 	struct buffer_head *bh;

@@ -665,7 +665,6 @@

 void set_blocksize(kdev_t dev, int size)
 {
-	extern int *blksize_size[];
 	int i, nlist, slept;
 	struct buffer_head * bh, * bh_next;

@@ -712,7 +711,7 @@
 			if (!atomic_read(&bh->b_count)) {
 				if (buffer_dirty(bh))
 					printk(KERN_WARNING
-					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
+					       "set_blocksize: dev %s buffer_dirty %" BLKOFF_FMT " size %hu\n",
 					       kdevname(dev), bh->b_blocknr, bh->b_size);
 				remove_inode_queue(bh);
 				__remove_from_queues(bh);
@@ -723,7 +722,7 @@
 				clear_bit(BH_Uptodate, &bh->b_state);
 				printk(KERN_WARNING
 				       "set_blocksize: "
-				       "b_count %d, dev %s, block %lu, from %p\n",
+				       "b_count %d, dev %s, block %" BLKOFF_FMT ", from %p\n",
 				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
 				       bh->b_blocknr, __builtin_return_address(0));
 			}
@@ -970,7 +969,7 @@
  * 14.02.92: changed it to sync dirty buffers a bit: better performance
  * when the filesystem starts to get full of dirty blocks (I hope).
  */
-struct buffer_head * getblk(kdev_t dev, int block, int size)
+struct buffer_head * getblk(kdev_t dev, blkoff_t block, int size)
 {
 	struct buffer_head * bh;
 	int isize;
@@ -1155,7 +1154,7 @@
  * bread() reads a specified block and returns the buffer that contains
  * it. It returns NULL if the block was unreadable.
  */
-struct buffer_head * bread(kdev_t dev, int block, int size)
+struct buffer_head * bread(kdev_t dev, blkoff_t block, int size)
 {
 	struct buffer_head * bh;

@@ -1659,7 +1658,7 @@
 int block_read_full_page(struct page *page, get_block_t *get_block)
 {
 	struct inode *inode = page->mapping->host;
-	unsigned long iblock, lblock;
+	blkoff_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 	unsigned int blocksize, blocks;
 	int nr, i;
@@ -1672,7 +1671,7 @@
 	head = page->buffers;

 	blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
-	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+	iblock = (blkoff_t)page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 	lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
 	bh = head;
 	nr = 0;
@@ -1949,7 +1948,7 @@
 	goto done;
 }

-int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
+blkoff_t generic_block_bmap(struct address_space *mapping, blkoff_t block, get_block_t *get_block)
 {
 	struct buffer_head tmp;
 	struct inode *inode = mapping->host;
@@ -2033,7 +2032,7 @@
 	int		pageind;
 	int		bhind;
 	int		offset;
-	unsigned long	blocknr;
+	blkoff_t	blocknr;
 	struct kiobuf *	iobuf = NULL;
 	struct page *	map;
 	struct buffer_head *tmp, **bhs = NULL;
@@ -2147,7 +2146,7 @@
  * FIXME: we need a swapper_inode->get_block function to remove
  *        some of the bmap kludges and interface ugliness here.
  */
-int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
+int brw_page(int rw, struct page *page, kdev_t dev, blkoff_t b[], int size)
 {
 	struct buffer_head *head, *bh;

diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/efs/file.c lb-2.4.6-pre8/fs/efs/file.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/efs/file.c	Sat Feb 26 23:33:05 2000
+++ lb-2.4.6-pre8/fs/efs/file.c	Sat Jun 30 14:22:13 2001
@@ -8,7 +8,7 @@

 #include <linux/efs_fs.h>

-int efs_get_block(struct inode *inode, long iblock,
+int efs_get_block(struct inode *inode, blkoff_t iblock,
 		  struct buffer_head *bh_result, int create)
 {
 	int error = -EROFS;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/ext2/inode.c lb-2.4.6-pre8/fs/ext2/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/ext2/inode.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/ext2/inode.c	Sat Jun 30 14:22:13 2001
@@ -503,7 +503,7 @@
  * reachable from inode.
  */

-static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+static int ext2_get_block(struct inode *inode, blkoff_t iblock, struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
 	int offsets[4];
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/fat/file.c lb-2.4.6-pre8/fs/fat/file.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/fat/file.c	Fri May 25 22:48:09 2001
+++ lb-2.4.6-pre8/fs/fat/file.c	Sat Jun 30 14:22:13 2001
@@ -54,7 +54,7 @@
 }


-int fat_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+int fat_get_block(struct inode *inode, blkoff_t iblock, struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	unsigned long phys;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/freevxfs/vxfs_subr.c lb-2.4.6-pre8/fs/freevxfs/vxfs_subr.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/freevxfs/vxfs_subr.c	Fri May 25 22:48:09 2001
+++ lb-2.4.6-pre8/fs/freevxfs/vxfs_subr.c	Sat Jun 30 14:22:13 2001
@@ -134,7 +134,7 @@
  *   Zero on success, else a negativ error code (-EIO).
  */
 static int
-vxfs_getblk(struct inode *ip, long iblock,
+vxfs_getblk(struct inode *ip, blkoff_t iblock,
 	    struct buffer_head *bp, int create)
 {
 	daddr_t			pblock;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/hfs/file.c lb-2.4.6-pre8/fs/hfs/file.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/hfs/file.c	Mon Feb 26 10:20:13 2001
+++ lb-2.4.6-pre8/fs/hfs/file.c	Sat Jun 30 14:22:13 2001
@@ -106,7 +106,7 @@
  * block number.  This function just calls hfs_extent_map() to do the
  * real work and then stuffs the appropriate info into the buffer_head.
  */
-int hfs_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+int hfs_get_block(struct inode *inode, blkoff_t iblock, struct buffer_head *bh_result, int create)
 {
 	unsigned long phys;

diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/hfs/hfs.h lb-2.4.6-pre8/fs/hfs/hfs.h
--- /md0/kernels/2.4/v2.4.6-pre8/fs/hfs/hfs.h	Sat Jun 30 15:20:48 2001
+++ lb-2.4.6-pre8/fs/hfs/hfs.h	Sat Jun 30 18:03:31 2001
@@ -495,7 +495,7 @@
 extern void hfs_extent_free(struct hfs_fork *);

 /* file.c */
-extern int hfs_get_block(struct inode *, long, struct buffer_head *, int);
+extern int hfs_get_block(struct inode *, blkoff_t, struct buffer_head *, int);

 /* mdb.c */
 extern struct hfs_mdb *hfs_mdb_get(hfs_sysmdb, int, hfs_s32);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/hpfs/file.c lb-2.4.6-pre8/fs/hpfs/file.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/hpfs/file.c	Fri Dec 29 17:07:57 2000
+++ lb-2.4.6-pre8/fs/hpfs/file.c	Sat Jun 30 14:22:13 2001
@@ -66,7 +66,7 @@
 	hpfs_write_inode(i);
 }

-int hpfs_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+int hpfs_get_block(struct inode *inode, blkoff_t iblock, struct buffer_head *bh_result, int create)
 {
 	secno s;
 	s = hpfs_bmap(inode, iblock);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/isofs/inode.c lb-2.4.6-pre8/fs/isofs/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/isofs/inode.c	Thu May  3 11:22:16 2001
+++ lb-2.4.6-pre8/fs/isofs/inode.c	Sat Jun 30 16:09:03 2001
@@ -876,7 +876,7 @@
 /* Life is simpler than for other filesystem since we never
  * have to create a new block, only find an existing one.
  */
-static int isofs_get_block(struct inode *inode, long iblock,
+static int isofs_get_block(struct inode *inode, blkoff_t iblock,
 		    struct buffer_head *bh_result, int create)
 {
 	unsigned long b_off;
@@ -951,18 +951,18 @@
 	goto abort;

 abort_beyond_end:
-	printk("isofs_get_block: block >= EOF (%ld, %ld)\n",
+	printk("isofs_get_block: block >= EOF (%"BLKOFF_FMT", %ld)\n",
 	       iblock, (unsigned long) inode->i_size);
 	goto abort;

 abort_too_many_sections:
 	printk("isofs_get_block: More than 100 file sections ?!?, aborting...\n");
-	printk("isofs_get_block: ino=%lu block=%ld firstext=%u sect_size=%u nextino=%lu\n",
+	printk("isofs_get_block: ino=%lu block=%" BLKOFF_FMT " firstext=%u sect_size=%u nextino=%lu\n",
 	       inode->i_ino, iblock, firstext, (unsigned) sect_size, nextino);
 	goto abort;
 }

-static int isofs_bmap(struct inode *inode, int block)
+static blkoff_t isofs_bmap(struct inode *inode, blkoff_t block)
 {
 	struct buffer_head dummy;
 	int error;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/minix/inode.c lb-2.4.6-pre8/fs/minix/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/minix/inode.c	Thu May  3 11:22:16 2001
+++ lb-2.4.6-pre8/fs/minix/inode.c	Sat Jun 30 15:24:05 2001
@@ -350,7 +350,7 @@
 	return 0;
 }

-static int minix_get_block(struct inode *inode, long block,
+static int minix_get_block(struct inode *inode, blkoff_t block,
 		    struct buffer_head *bh_result, int create)
 {
 	if (INODE_VERSION(inode) == MINIX_V1)
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/minix/itree_common.c lb-2.4.6-pre8/fs/minix/itree_common.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/minix/itree_common.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/minix/itree_common.c	Sat Jun 30 14:22:13 2001
@@ -140,7 +140,7 @@
 	return -EAGAIN;
 }

-static inline int get_block(struct inode * inode, long block,
+static inline int get_block(struct inode * inode, blkoff_t block,
 			struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/partitions/check.c lb-2.4.6-pre8/fs/partitions/check.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/partitions/check.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/partitions/check.c	Sat Jun 30 14:22:13 2001
@@ -33,8 +33,6 @@
 #include "ibm.h"
 #include "ultrix.h"

-extern int *blk_size[];
-
 struct gendisk *gendisk_head;
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/

@@ -250,7 +248,7 @@
 				char buf[64];

 				len += sprintf(page + len,
-					       "%4d  %4d %10d %s\n",
+					       "%4d  %4d %10" BLKOFF_FMT " %s\n",
 					       dsk->major, n, dsk->sizes[n],
 					       disk_name(dsk, n, buf));
 				if (len < offset)
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/qnx4/inode.c lb-2.4.6-pre8/fs/qnx4/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/qnx4/inode.c	Thu May  3 11:22:17 2001
+++ lb-2.4.6-pre8/fs/qnx4/inode.c	Sat Jun 30 14:22:13 2001
@@ -204,7 +204,7 @@
 	return NULL;
 }

-int qnx4_get_block( struct inode *inode, long iblock, struct buffer_head *bh, int create )
+int qnx4_get_block( struct inode *inode, blkoff_t iblock, struct buffer_head *bh, int create )
 {
 	unsigned long phys;

diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/reiserfs/inode.c lb-2.4.6-pre8/fs/reiserfs/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/reiserfs/inode.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/reiserfs/inode.c	Sat Jun 30 15:32:52 2001
@@ -438,7 +438,7 @@

 // this is called to create file map. So, _get_block_create_0 will not
 // read direct item
-int reiserfs_bmap (struct inode * inode, long block,
+int reiserfs_bmap (struct inode * inode, blkoff_t block,
 		   struct buffer_head * bh_result, int create)
 {
     if (!file_capable (inode, block))
@@ -468,7 +468,7 @@
 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 ** don't use this function.
 */
-static int reiserfs_get_block_create_0 (struct inode * inode, long block,
+static int reiserfs_get_block_create_0 (struct inode * inode, blkoff_t block,
 			struct buffer_head * bh_result, int create) {
     return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
 }
@@ -559,7 +559,7 @@
 // determine which parts are derivative, if any, understanding that
 // there are only so many ways to code to a given interface.
 //
-int reiserfs_get_block (struct inode * inode, long block,
+int reiserfs_get_block (struct inode * inode, blkoff_t block,
 			struct buffer_head * bh_result, int create)
 {
     int repeat, retval;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/sysv/inode.c lb-2.4.6-pre8/fs/sysv/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/sysv/inode.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/sysv/inode.c	Sat Jun 30 14:22:13 2001
@@ -787,7 +787,7 @@
 	return result;
 }

-static int sysv_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
+static int sysv_get_block(struct inode *inode, blkoff_t iblock, struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb;
 	int ret, err, new;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/udf/inode.c lb-2.4.6-pre8/fs/udf/inode.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/udf/inode.c	Sat Jun 30 14:04:27 2001
+++ lb-2.4.6-pre8/fs/udf/inode.c	Sat Jun 30 14:22:13 2001
@@ -56,7 +56,7 @@
 static void udf_update_extents(struct inode *,
 	long_ad [EXTENT_MERGE_SIZE], int, int,
 	lb_addr, Uint32, struct buffer_head **);
-static int udf_get_block(struct inode *, long, struct buffer_head *, int);
+static int udf_get_block(struct inode *, blkoff_t, struct buffer_head *, int);

 /*
  * udf_put_inode
@@ -311,7 +311,7 @@
 	return dbh;
 }

-static int udf_get_block(struct inode *inode, long block, struct buffer_head *bh_result, int create)
+static int udf_get_block(struct inode *inode, blkoff_t block, struct buffer_head *bh_result, int create)
 {
 	int err, new;
 	struct buffer_head *bh;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/blkdev.h lb-2.4.6-pre8/include/linux/blkdev.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/blkdev.h	Mon Jun 18 22:03:03 2001
+++ lb-2.4.6-pre8/include/linux/blkdev.h	Sun Jul  1 00:35:58 2001
@@ -1,6 +1,9 @@
 #ifndef _LINUX_BLKDEV_H
 #define _LINUX_BLKDEV_H

+#ifndef _LINUX_TYPES_H
+#include <linux/types.h>
+#endif
 #include <linux/major.h>
 #include <linux/sched.h>
 #include <linux/genhd.h>
@@ -33,9 +36,10 @@
 	kdev_t rq_dev;
 	int cmd;		/* READ or WRITE */
 	int errors;
-	unsigned long sector;
+	blkoff_t sector;
+	blkoff_t hard_sector;
 	unsigned long nr_sectors;
-	unsigned long hard_sector, hard_nr_sectors;
+	unsigned long hard_nr_sectors;
 	unsigned int nr_segments;
 	unsigned int nr_hw_segments;
 	unsigned long current_nr_sectors;
@@ -164,7 +168,7 @@
 extern void blk_queue_make_request(request_queue_t *, make_request_fn *);
 extern void generic_unplug_device(void *);

-extern int * blk_size[MAX_BLKDEV];
+extern blkoff_t * blk_size[MAX_BLKDEV];

 extern int * blksize_size[MAX_BLKDEV];

diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/fs.h lb-2.4.6-pre8/include/linux/fs.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/fs.h	Sat Jun 30 14:04:28 2001
+++ lb-2.4.6-pre8/include/linux/fs.h	Sun Jul  1 00:35:46 2001
@@ -188,6 +188,7 @@
 /* This was here just to show that the number is taken -
    probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
 #endif
+#define BLKGETSIZE64 _IO(0x12,109)	/* return device size */


 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
@@ -235,7 +236,7 @@
 struct buffer_head {
 	/* First cache line: */
 	struct buffer_head *b_next;	/* Hash queue list */
-	unsigned long b_blocknr;	/* block number */
+	blkoff_t b_blocknr;		/* block number */
 	unsigned short b_size;		/* block size */
 	unsigned short b_list;		/* List that this buffer appears */
 	kdev_t b_dev;			/* device (B_FREE = free) */
@@ -256,7 +257,7 @@
 	void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
  	void *b_private;		/* reserved for b_end_io */

-	unsigned long b_rsector;	/* Real buffer location on disk */
+	blkoff_t b_rsector;		/* Real buffer location on disk */
 	wait_queue_head_t b_wait;

 	struct inode *	     b_inode;
@@ -1283,8 +1284,8 @@
 extern struct file * get_empty_filp(void);
 extern void file_move(struct file *f, struct list_head *list);
 extern void file_moveto(struct file *new, struct file *old);
-extern struct buffer_head * get_hash_table(kdev_t, int, int);
-extern struct buffer_head * getblk(kdev_t, int, int);
+extern struct buffer_head * get_hash_table(kdev_t, blkoff_t, int);
+extern struct buffer_head * getblk(kdev_t, blkoff_t, int);
 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
 extern void submit_bh(int, struct buffer_head *);
 extern int is_read_only(kdev_t);
@@ -1301,12 +1302,12 @@
 		__bforget(buf);
 }
 extern void set_blocksize(kdev_t, int);
-extern struct buffer_head * bread(kdev_t, int, int);
+extern struct buffer_head * bread(kdev_t, blkoff_t, int);
 extern void wakeup_bdflush(int wait);

-extern int brw_page(int, struct page *, kdev_t, int [], int);
+extern int brw_page(int, struct page *, kdev_t, blkoff_t [], int);

-typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
+typedef int (get_block_t)(struct inode*,blkoff_t,struct buffer_head*,int);

 /* Generic buffer handling for block filesystems.. */
 extern int block_flushpage(struct page *, unsigned long);
@@ -1318,7 +1319,7 @@
 				unsigned long *);
 extern int block_sync_page(struct page *);

-int generic_block_bmap(struct address_space *, long, get_block_t *);
+blkoff_t generic_block_bmap(struct address_space *, blkoff_t, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);

diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/genhd.h lb-2.4.6-pre8/include/linux/genhd.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/genhd.h	Mon Jun 18 22:03:03 2001
+++ lb-2.4.6-pre8/include/linux/genhd.h	Sun Jul  1 00:35:58 2001
@@ -48,8 +48,8 @@
 #  include <linux/devfs_fs_kernel.h>

 struct hd_struct {
-	long start_sect;
-	long nr_sects;
+	blkoff_t start_sect;
+	blkoff_t nr_sects;
 	devfs_handle_t de;              /* primary (master) devfs entry  */
 };

@@ -63,7 +63,7 @@
 	int max_p;			/* maximum partitions per device */

 	struct hd_struct *part;		/* [indexed by minor] */
-	int *sizes;			/* [idem], device size in blocks */
+	blkoff_t *sizes;		/* [idem], device size in blocks */
 	int nr_real;			/* number of real devices */

 	void *real_devices;		/* internal use */
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/loop.h lb-2.4.6-pre8/include/linux/loop.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/loop.h	Thu Apr  5 11:53:45 2001
+++ lb-2.4.6-pre8/include/linux/loop.h	Sat Jun 30 23:40:35 2001
@@ -28,13 +28,13 @@
 	int		lo_number;
 	int		lo_refcnt;
 	kdev_t		lo_device;
-	int		lo_offset;
+	loff_t		lo_offset;
 	int		lo_encrypt_type;
 	int		lo_encrypt_key_size;
 	int		lo_flags;
 	int		(*transfer)(struct loop_device *, int cmd,
 				    char *raw_buf, char *loop_buf, int size,
-				    int real_block);
+				    blkoff_t real_block);
 	char		lo_name[LO_NAME_SIZE];
 	char		lo_encrypt_key[LO_KEY_SIZE];
 	__u32           lo_init[2];
@@ -98,7 +98,7 @@
 	dev_t		lo_device; 	/* ioctl r/o */
 	unsigned long	lo_inode; 	/* ioctl r/o */
 	dev_t		lo_rdevice; 	/* ioctl r/o */
-	int		lo_offset;
+	loff_t		lo_offset;
 	int		lo_encrypt_type;
 	int		lo_encrypt_key_size; 	/* ioctl w/o */
 	int		lo_flags;	/* ioctl r/o */
@@ -128,7 +128,7 @@
 struct loop_func_table {
 	int number; 	/* filter type */
 	int (*transfer)(struct loop_device *lo, int cmd, char *raw_buf,
-			char *loop_buf, int size, int real_block);
+			char *loop_buf, int size, blkoff_t real_block);
 	int (*init)(struct loop_device *, struct loop_info *);
 	/* release is called from loop_unregister_transfer or clr_fd */
 	int (*release)(struct loop_device *);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/msdos_fs.h lb-2.4.6-pre8/include/linux/msdos_fs.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/msdos_fs.h	Sat Jun 30 15:18:45 2001
+++ lb-2.4.6-pre8/include/linux/msdos_fs.h	Sat Jun 30 18:02:53 2001
@@ -241,7 +241,7 @@
 /* inode.c */
 extern void fat_hash_init(void);
 extern int fat_bmap(struct inode *inode,int block);
-extern int fat_get_block(struct inode *, long, struct buffer_head *, int);
+extern int fat_get_block(struct inode *, blkoff_t, struct buffer_head *, int);
 extern int fat_notify_change(struct dentry *, struct iattr *);
 extern void fat_clear_inode(struct inode *inode);
 extern void fat_delete_inode(struct inode *inode);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/qnx4_fs.h lb-2.4.6-pre8/include/linux/qnx4_fs.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/qnx4_fs.h	Thu Jun 29 18:53:42 2000
+++ lb-2.4.6-pre8/include/linux/qnx4_fs.h	Sat Jun 30 15:24:47 2001
@@ -118,7 +118,7 @@
 extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
 extern int qnx4_sync_file(struct file *file, struct dentry *dentry, int);
 extern int qnx4_sync_inode(struct inode *inode);
-extern int qnx4_get_block(struct inode *inode, long iblock, struct buffer_head *bh, int create);
+extern int qnx4_get_block(struct inode *inode, blkoff_t iblock, struct buffer_head *bh, int create);

 #endif				/* __KERNEL__ */

diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/linear.h lb-2.4.6-pre8/include/linux/raid/linear.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/linear.h	Tue Jun 19 13:32:19 2001
+++ lb-2.4.6-pre8/include/linux/raid/linear.h	Sun Jul  1 00:36:41 2001
@@ -5,8 +5,8 @@

 struct dev_info {
 	kdev_t		dev;
-	unsigned long	size;
-	unsigned long	offset;
+	blkoff_t	size;
+	blkoff_t	offset;
 };

 typedef struct dev_info dev_info_t;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/md.h lb-2.4.6-pre8/include/linux/raid/md.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/md.h	Sat Jun 30 02:27:33 2001
+++ lb-2.4.6-pre8/include/linux/raid/md.h	Sun Jul  1 00:35:59 2001
@@ -58,7 +58,7 @@
 #define MD_MINOR_VERSION                90
 #define MD_PATCHLEVEL_VERSION           0

-extern int md_size[MAX_MD_DEVS];
+extern blkoff_t md_size[MAX_MD_DEVS];
 extern struct hd_struct md_hd_struct[MAX_MD_DEVS];

 extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/md_k.h lb-2.4.6-pre8/include/linux/raid/md_k.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/md_k.h	Fri May 25 22:48:10 2001
+++ lb-2.4.6-pre8/include/linux/raid/md_k.h	Sat Jun 30 14:22:13 2001
@@ -162,14 +162,14 @@

 	kdev_t dev;			/* Device number */
 	kdev_t old_dev;			/*  "" when it was last imported */
-	unsigned long size;		/* Device size (in blocks) */
+	blkoff_t size;		/* Device size (in blocks) */
 	mddev_t *mddev;			/* RAID array if running */
 	unsigned long last_events;	/* IO event timestamp */

 	struct block_device *bdev;	/* block device handle */

 	mdp_super_t *sb;
-	unsigned long sb_offset;
+	blkoff_t sb_offset;

 	int faulty;			/* if faulty do not issue IO requests */
 	int desc_nr;			/* descriptor index in the superblock */
@@ -237,7 +237,7 @@

 	int (*stop_resync)(mddev_t *mddev);
 	int (*restart_resync)(mddev_t *mddev);
-	int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
+	int (*sync_request)(mddev_t *mddev, blkoff_t block_nr);
 };


diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/raid0.h lb-2.4.6-pre8/include/linux/raid/raid0.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/raid0.h	Tue Jun 19 13:32:19 2001
+++ lb-2.4.6-pre8/include/linux/raid/raid0.h	Sun Jul  1 00:36:41 2001
@@ -5,9 +5,9 @@

 struct strip_zone
 {
-	unsigned long zone_offset;	/* Zone offset in md_dev */
-	unsigned long dev_offset;	/* Zone offset in real dev */
-	unsigned long size;		/* Zone size */
+	blkoff_t zone_offset;	/* Zone offset in md_dev */
+	blkoff_t dev_offset;	/* Zone offset in real dev */
+	blkoff_t size;		/* Zone size */
 	int nb_dev;			/* # of devices attached to the zone */
 	mdk_rdev_t *dev[MD_SB_DISKS]; /* Devices attached to the zone */
 };
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/raid1.h lb-2.4.6-pre8/include/linux/raid/raid1.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/raid1.h	Tue Jun 19 13:32:19 2001
+++ lb-2.4.6-pre8/include/linux/raid/raid1.h	Sun Jul  1 00:36:41 2001
@@ -7,8 +7,8 @@
 	int		number;
 	int		raid_disk;
 	kdev_t		dev;
-	int		sect_limit;
-	int		head_position;
+	blkoff_t	sect_limit;
+	blkoff_t	head_position;

 	/*
 	 * State bits:
@@ -27,7 +27,7 @@
 	int			raid_disks;
 	int			working_disks;
 	int			last_used;
-	unsigned long		next_sect;
+	blkoff_t		next_sect;
 	int			sect_count;
 	mdk_thread_t		*thread, *resync_thread;
 	int			resync_mirrors;
@@ -47,7 +47,7 @@
 	md_wait_queue_head_t	wait_buffer;

 	/* for use when syncing mirrors: */
-	unsigned long	start_active, start_ready,
+	blkoff_t	start_active, start_ready,
 		start_pending, start_future;
 	int	cnt_done, cnt_active, cnt_ready,
 		cnt_pending, cnt_future;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/raid5.h lb-2.4.6-pre8/include/linux/raid/raid5.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/raid/raid5.h	Sat Jun 30 14:04:28 2001
+++ lb-2.4.6-pre8/include/linux/raid/raid5.h	Sun Jul  1 00:36:44 2001
@@ -133,7 +133,7 @@
 	struct buffer_head	*bh_write[MD_SB_DISKS];	/* write request buffers of the MD device */
 	struct buffer_head	*bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
 	struct page		*bh_page[MD_SB_DISKS];	/* saved bh_cache[n]->b_page when reading around the cache */
-	unsigned long		sector;			/* sector of this row */
+	blkoff_t		sector;			/* sector of this row */
 	int			size;			/* buffers size */
 	int			pd_idx;			/* parity disk index */
 	unsigned long		state;			/* state flags */
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/reiserfs_fs.h lb-2.4.6-pre8/include/linux/reiserfs_fs.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/reiserfs_fs.h	Sat Jun 30 15:28:41 2001
+++ lb-2.4.6-pre8/include/linux/reiserfs_fs.h	Sat Jun 30 18:03:54 2001
@@ -1797,7 +1797,7 @@
 			       loff_t offset, int type, int length, int entry_count);
 /*void store_key (struct key * key);
 void forget_key (struct key * key);*/
-int reiserfs_get_block (struct inode * inode, long block,
+int reiserfs_get_block (struct inode * inode, blkoff_t block,
 			struct buffer_head * bh_result, int create);
 struct inode * reiserfs_iget (struct super_block * s, struct cpu_key * key);
 void reiserfs_read_inode (struct inode * inode) ;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/types.h lb-2.4.6-pre8/include/linux/types.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/types.h	Tue Jun 19 00:54:47 2001
+++ lb-2.4.6-pre8/include/linux/types.h	Sat Jun 30 15:37:56 2001
@@ -3,6 +3,14 @@

 #ifdef	__KERNEL__
 #include <linux/config.h>
+
+#if defined(CONFIG_BLKOFF_LONGLONG)
+#define BLKOFF_FMT	"Lu"
+typedef unsigned long long	blkoff_t;
+#else
+#define BLKOFF_FMT	"lu"
+typedef unsigned long		blkoff_t;
+#endif
 #endif

 #include <linux/posix_types.h>
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/page_io.c lb-2.4.6-pre8/mm/page_io.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/page_io.c	Thu May  3 11:22:20 2001
+++ lb-2.4.6-pre8/mm/page_io.c	Sat Jun 30 14:22:13 2001
@@ -36,7 +36,7 @@
 static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page)
 {
 	unsigned long offset;
-	int zones[PAGE_SIZE/512];
+	blkoff_t zones[PAGE_SIZE/512];
 	int zones_used;
 	kdev_t dev = 0;
 	int block_size;



* Re: [RFC][PATCH] first cut 64 bit block support
  2001-07-01  4:53 [RFC][PATCH] first cut 64 bit block support Ben LaHaise
@ 2001-07-03  4:53 ` Ragnar Kjørstad
  2001-07-04  2:19   ` [PATCH] 64 bit scsi read/write Ben LaHaise
  2001-07-04 10:16 ` [RFC][PATCH] first cut 64 bit block support Chris Wedgwood
  1 sibling, 1 reply; 71+ messages in thread
From: Ragnar Kjørstad @ 2001-07-03  4:53 UTC (permalink / raw)
  To: Ben LaHaise; +Cc: linux-fsdevel, linux-kernel, mike, kevin

On Sun, Jul 01, 2001 at 12:53:25AM -0400, Ben LaHaise wrote:
> Hey folks,
> 
> Below is the first cut at making the block size limit configurable to 64
> bits on x86, as well as always 64 bits on 64 bit machines.  The audit
> isn't complete yet, but a good chunk of it is done.

Great!

> Filesystem           1k-blocks      Used Available Use% Mounted on
> /dev/md1             7508125768        20 7476280496   1% /mnt/3
> 
> This is a 7TB ext2 filesystem on 4KB blocks.  The 7TB /dev/md1 consists of
> 7x 1TB sparse files on loop devices raid0'd together.  The current patch
> does not have the fixes in the SCSI layer or IDE driver yet; expect the
> SCSI fixes in the next version, although I'll need a tester.  The
> following should be 64 bit clean now: nbd, loop, raid0, raid1, raid5.

What about LVM?

We'll see what we can do to test the SCSI code. Please send it to us
when you have code. I guess there are fixes for both the generic SCSI code
and for each controller, right? What controllers are you planning on
fixing first?
What tests do you recommend?
mkfs on a big device, and then putting >2TB of data on it?



-- 
Ragnar Kjorstad
Big Storage


* [PATCH] 64 bit scsi read/write
  2001-07-03  4:53 ` Ragnar Kjørstad
@ 2001-07-04  2:19   ` Ben LaHaise
  2001-07-04  7:11     ` Alan Cox
                       ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: Ben LaHaise @ 2001-07-04  2:19 UTC (permalink / raw)
  To: Ragnar Kjørstad; +Cc: linux-fsdevel, linux-kernel, mike, kevin

On Tue, 3 Jul 2001, Ragnar Kjørstad wrote:

> What about LVM?

Errr, I'll refrain from talking about LVM.

> We'll see what we can do to test the scsi-code. Please send it to us
> when you have code. I guess there are fixes for both generic-scsi code
> and for each controller, right? What controllers are you planning on
> fixing first?
> What tests do you recommend?
> mkfs on a big device, and then putting >2TB data on it?

Here's the [completely untested] generic scsi fixup, but I'm told that
some controllers will break with it.  Give it a whirl and let me know how
many pieces you're left holding. =)  Please note that msdos partitions do
*not* work on devices larger than 2TB, so you'll have to use the scsi disk
directly.  This patch applies on top of v2.4.6-pre8-largeblock4.diff.

Testing-wise, I'm looking for tests on ext2, the block device and raw
devices that write out enough data to fill the device and then read the
data back looking for any corruption.  There are a few test programs I've
got to this end, but I need to clean them up before releasing them.  If
anyone wants to help sort out issues on other filesystems, I'll certainly
track patches and feedback.  Cheers,

		-ben

.... ~/patches/v2.4.6-pre8-lb-scsi.diff ....
diff -ur lb-2.4.6-pre8/drivers/scsi/scsi.h lb-2.4.6-pre8.scsi/drivers/scsi/scsi.h
--- lb-2.4.6-pre8/drivers/scsi/scsi.h	Tue Jul  3 01:31:47 2001
+++ lb-2.4.6-pre8.scsi/drivers/scsi/scsi.h	Tue Jul  3 22:03:16 2001
@@ -351,7 +351,7 @@
 #define DRIVER_MASK         0x0f
 #define SUGGEST_MASK        0xf0

-#define MAX_COMMAND_SIZE    12
+#define MAX_COMMAND_SIZE    16
 #define SCSI_SENSE_BUFFERSIZE   64

 /*
@@ -613,6 +613,7 @@
 	unsigned expecting_cc_ua:1;	/* Expecting a CHECK_CONDITION/UNIT_ATTN
 					 * because we did a bus reset. */
 	unsigned device_blocked:1;	/* Device returned QUEUE_FULL. */
+	unsigned sixteen:1;		/* use 16 byte read / write */
 	unsigned ten:1;		/* support ten byte read / write */
 	unsigned remap:1;	/* support remapping  */
 	unsigned starved:1;	/* unable to process commands because
diff -ur lb-2.4.6-pre8/drivers/scsi/sd.c lb-2.4.6-pre8.scsi/drivers/scsi/sd.c
--- lb-2.4.6-pre8/drivers/scsi/sd.c	Tue Jul  3 22:08:28 2001
+++ lb-2.4.6-pre8.scsi/drivers/scsi/sd.c	Tue Jul  3 22:05:46 2001
@@ -277,11 +277,12 @@

 static int sd_init_command(Scsi_Cmnd * SCpnt)
 {
-	int dev, devm, block, this_count;
+	int dev, devm, this_count;
 	Scsi_Disk *dpnt;
 #if CONFIG_SCSI_LOGGING
 	char nbuff[6];
 #endif
+	blkoff_t block;

 	devm = SD_PARTITION(SCpnt->request.rq_dev);
 	dev = DEVICE_NR(SCpnt->request.rq_dev);
@@ -289,7 +290,7 @@
 	block = SCpnt->request.sector;
 	this_count = SCpnt->request_bufflen >> 9;

-	SCSI_LOG_HLQUEUE(1, printk("Doing sd request, dev = %d, block = %d\n", devm, block));
+	SCSI_LOG_HLQUEUE(1, printk("Doing sd request, dev = %d, block = %"BLKOFF_FMT"\n", devm, block));

 	dpnt = &rscsi_disks[dev];
 	if (devm >= (sd_template.dev_max << 4) ||
@@ -374,7 +375,21 @@

 	SCpnt->cmnd[1] = (SCpnt->lun << 5) & 0xe0;

-	if (((this_count > 0xff) || (block > 0x1fffff)) || SCpnt->device->ten) {
+	if (SCpnt->device->sixteen) {
+		SCpnt->cmnd[0] += READ_16 - READ_6;
+		SCpnt->cmnd[2] = (unsigned char) (block >> 56) & 0xff;
+		SCpnt->cmnd[3] = (unsigned char) (block >> 48) & 0xff;
+		SCpnt->cmnd[4] = (unsigned char) (block >> 40) & 0xff;
+		SCpnt->cmnd[5] = (unsigned char) (block >> 32) & 0xff;
+		SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff;
+		SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff;
+		SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff;
+		SCpnt->cmnd[9] = (unsigned char) block & 0xff;
+		SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff;
+		SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff;
+		SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff;
+		SCpnt->cmnd[13] = (unsigned char) this_count & 0xff;
+	} else if (SCpnt->device->ten || (this_count > 0xff) || (block > 0x1fffff)) {
 		if (this_count > 0xffff)
 			this_count = 0xffff;

@@ -882,14 +897,61 @@
 		 */
 		rscsi_disks[i].ready = 1;

-		rscsi_disks[i].capacity = 1 + ((buffer[0] << 24) |
-					       (buffer[1] << 16) |
-					       (buffer[2] << 8) |
-					       buffer[3]);
+		rscsi_disks[i].capacity = buffer[0];
+		rscsi_disks[i].capacity <<= 8;
+		rscsi_disks[i].capacity |= buffer[1];
+		rscsi_disks[i].capacity <<= 8;
+		rscsi_disks[i].capacity |= buffer[2];
+		rscsi_disks[i].capacity <<= 8;
+		rscsi_disks[i].capacity |= buffer[3];
+		rscsi_disks[i].capacity += 1;

 		sector_size = (buffer[4] << 24) |
 		    (buffer[5] << 16) | (buffer[6] << 8) | buffer[7];

+
+		/* Is this disk larger than 32 bits? */
+		if (rscsi_disks[i].capacity == 0x100000000) {
+			cmd[0] = READ_CAPACITY;
+			cmd[1] = (rscsi_disks[i].device->lun << 5) & 0xe0;
+			cmd[1] |= 0x2;	/* Longlba */
+			memset((void *) &cmd[2], 0, 8);
+			memset((void *) buffer, 0, 8);
+			SRpnt->sr_cmd_len = 0;
+			SRpnt->sr_sense_buffer[0] = 0;
+			SRpnt->sr_sense_buffer[2] = 0;
+
+			SRpnt->sr_data_direction = SCSI_DATA_READ;
+			scsi_wait_req(SRpnt, (void *) cmd, (void *) buffer,
+				    8, SD_TIMEOUT, MAX_RETRIES);
+
+			/* cool!  64 bit goodness... */
+			if (!SRpnt->sr_result) {
+				rscsi_disks[i].capacity = buffer[0];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[1];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[2];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[3];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[4];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[5];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[6];
+				rscsi_disks[i].capacity <<= 8;
+				rscsi_disks[i].capacity |= buffer[7];
+				rscsi_disks[i].capacity += 1;
+
+				sector_size = (buffer[8] << 24) |
+				    (buffer[9] << 16) | (buffer[10] << 8) |
+				     buffer[11];
+
+				SRpnt->sr_device->sixteen = 1;
+			}
+		}
+
 		if (sector_size == 0) {
 			sector_size = 512;
 			printk("%s : sector size 0 reported, assuming 512.\n",
@@ -930,7 +992,7 @@
 			 */
 			int m;
 			int hard_sector = sector_size;
-			int sz = rscsi_disks[i].capacity * (hard_sector/256);
+			blkoff_t sz = rscsi_disks[i].capacity * (hard_sector/256);

 			/* There are 16 minors allocated for each major device */
 			for (m = i << 4; m < ((i + 1) << 4); m++) {
@@ -938,7 +1000,7 @@
 			}

 			printk("SCSI device %s: "
-			       "%d %d-byte hdwr sectors (%d MB)\n",
+			       "%"BLKOFF_FMT" %d-byte hdwr sectors (%"BLKOFF_FMT" MB)\n",
 			       nbuff, rscsi_disks[i].capacity,
 			       hard_sector, (sz/2 - sz/1250 + 974)/1950);
 		}
diff -ur lb-2.4.6-pre8/drivers/scsi/sd.h lb-2.4.6-pre8.scsi/drivers/scsi/sd.h
--- lb-2.4.6-pre8/drivers/scsi/sd.h	Tue Jul  3 01:31:47 2001
+++ lb-2.4.6-pre8.scsi/drivers/scsi/sd.h	Tue Jul  3 22:03:16 2001
@@ -26,7 +26,7 @@
 extern struct hd_struct *sd;

 typedef struct scsi_disk {
-	unsigned capacity;	/* size in blocks */
+	u64 capacity;	/* size in blocks */
 	Scsi_Device *device;
 	unsigned char ready;	/* flag ready for FLOPTICAL */
 	unsigned char write_prot;	/* flag write_protect for rmvable dev */
diff -ur lb-2.4.6-pre8/include/scsi/scsi.h lb-2.4.6-pre8.scsi/include/scsi/scsi.h
--- lb-2.4.6-pre8/include/scsi/scsi.h	Thu May  3 11:22:20 2001
+++ lb-2.4.6-pre8.scsi/include/scsi/scsi.h	Tue Jul  3 18:06:43 2001
@@ -78,6 +78,9 @@
 #define MODE_SENSE_10         0x5a
 #define PERSISTENT_RESERVE_IN 0x5e
 #define PERSISTENT_RESERVE_OUT 0x5f
+#define READ_16               0x88
+#define WRITE_16              0x8a
+#define WRITE_VERIFY_16       0x8e
 #define MOVE_MEDIUM           0xa5
 #define READ_12               0xa8
 #define WRITE_12              0xaa



* Re: [PATCH] 64 bit scsi read/write
  2001-07-04  2:19   ` [PATCH] 64 bit scsi read/write Ben LaHaise
@ 2001-07-04  7:11     ` Alan Cox
  2001-07-05  6:34     ` Ragnar Kjørstad
  2001-07-26  2:18     ` Ragnar Kjørstad
  2 siblings, 0 replies; 71+ messages in thread
From: Alan Cox @ 2001-07-04  7:11 UTC (permalink / raw)
  To: Ben LaHaise
  Cc: Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike, kevin

> --- lb-2.4.6-pre8/drivers/scsi/scsi.h	Tue Jul  3 01:31:47 2001
> +++ lb-2.4.6-pre8.scsi/drivers/scsi/scsi.h	Tue Jul  3 22:03:16 2001
> @@ -351,7 +351,7 @@
>  #define DRIVER_MASK         0x0f
>  #define SUGGEST_MASK        0xf0
> 
> -#define MAX_COMMAND_SIZE    12
> +#define MAX_COMMAND_SIZE    16

Please talk to Khalid at HP, who has already submitted patches to handle
16 byte command blocks on some controllers cleanly.  I think you need to
combine both patches to get the right result.

> +	if (SCpnt->device->sixteen) {

[and controller]

Alan



* Re: [RFC][PATCH] first cut 64 bit block support
  2001-07-01  4:53 [RFC][PATCH] first cut 64 bit block support Ben LaHaise
  2001-07-03  4:53 ` Ragnar Kjørstad
@ 2001-07-04 10:16 ` Chris Wedgwood
  2001-07-04 16:59   ` Ben LaHaise
  1 sibling, 1 reply; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-04 10:16 UTC (permalink / raw)
  To: Ben LaHaise; +Cc: linux-fsdevel, linux-kernel

On Sun, Jul 01, 2001 at 12:53:25AM -0400, Ben LaHaise wrote:

> Ugly bits: I had to add libgcc.a to satisfy the need for 64 bit
> division.  Yeah, it sucks, but RAID needs some more massaging before
> I can remove the 64 bit division completely.  This will be fixed.

I would rather see this code removed from libgcc and put into a
function (optionally inline) such that code like:

__u64 foo(__u64 a, __u64 b)
{
        __u64 t;


        t = a * SOME_CONST + b;

        return t / BLEM;
}

would really look like:

__u64 foo(__u64 a, __u64 b)
{
        __u64 t;

        t = 64b_mul(a, SOME_CONST) + b;

        return 64b_udiv(t, BLEM);
}


such that for people to use 64-bit operations in the kernel, they have
to explicitly code them in, not just accidentally change a variable
type and have gcc/libgcc hide this fact from them.

Note, I use __u64 rather than "long long" as I'm not 100% sure "long long"
will mean 64 bits on all future architectures (it would be cool, for
example, if it were 128 bits on some!).


What do you think? Would you accept patches for either of these?
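
For what it's worth, a rough userspace sketch of what such explicit helpers
could look like; the names u64_mul and u64_udiv are made up for this example,
and this is not code from the patch:

#include <stdint.h>

/*
 * Hypothetical explicit 64-bit helpers (names invented for this example).
 * Callers have to ask for 64-bit arithmetic by name, so accidentally
 * changing a variable's type no longer pulls in libgcc's __udivdi3
 * behind the programmer's back.
 */
static inline uint64_t u64_mul(uint64_t a, uint64_t b)
{
        return a * b;
}

/* divisor must be non-zero and fit in 32 bits */
static inline uint64_t u64_udiv(uint64_t dividend, uint32_t divisor)
{
        uint64_t quotient = 0, remainder = 0;
        int i;

        /* plain restoring division: one quotient bit per iteration */
        for (i = 63; i >= 0; i--) {
                remainder = (remainder << 1) | ((dividend >> i) & 1);
                if (remainder >= divisor) {
                        remainder -= divisor;
                        quotient |= (uint64_t)1 << i;
                }
        }
        return quotient;
}

The win would purely be that grep can then find every 64-bit division in
the tree.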




  --cw


* Re: [RFC][PATCH] first cut 64 bit block support
  2001-07-04 10:16 ` [RFC][PATCH] first cut 64 bit block support Chris Wedgwood
@ 2001-07-04 16:59   ` Ben LaHaise
  0 siblings, 0 replies; 71+ messages in thread
From: Ben LaHaise @ 2001-07-04 16:59 UTC (permalink / raw)
  To: Chris Wedgwood; +Cc: linux-fsdevel, linux-kernel

On Wed, 4 Jul 2001, Chris Wedgwood wrote:

> On Sun, Jul 01, 2001 at 12:53:25AM -0400, Ben LaHaise wrote:
>
> > Ugly bits: I had to add libgcc.a to satisfy the need for 64 bit
> > division.  Yeah, it sucks, but RAID needs some more massaging before
> > I can remove the 64 bit division completely.  This will be fixed.
>
> I would rather see this code removed from libgcc and put into a
> function (optionally inline) such that code like:

I'm getting rid of the need for libgcc entirely.  That's what "This will
be fixed" means.  If you want to expedite the process, send a patch.
Until then, this is Good Enough for testing purposes.

		-ben



* Re: [PATCH] 64 bit scsi read/write
  2001-07-04  2:19   ` [PATCH] 64 bit scsi read/write Ben LaHaise
  2001-07-04  7:11     ` Alan Cox
@ 2001-07-05  6:34     ` Ragnar Kjørstad
  2001-07-05  7:35       ` Ben LaHaise
  2001-07-26  2:18     ` Ragnar Kjørstad
  2 siblings, 1 reply; 71+ messages in thread
From: Ragnar Kjørstad @ 2001-07-05  6:34 UTC (permalink / raw)
  To: Ben LaHaise; +Cc: linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

On Tue, Jul 03, 2001 at 10:19:36PM -0400, Ben LaHaise wrote:
> > > [ patch to make md and nbd work for >2TB devices ]
> > What about LVM?
> 
> Errr, I'll refrain from talking about LVM.

What do you mean?
Is it not feasible to fix this in LVM as well, or do you just not know
what needs to be done to LVM?


-- 
Ragnar Kjorstad
Big Storage


* Re: [PATCH] 64 bit scsi read/write
  2001-07-05  6:34     ` Ragnar Kjørstad
@ 2001-07-05  7:35       ` Ben LaHaise
  2001-07-13 18:20         ` Albert D. Cahalan
  0 siblings, 1 reply; 71+ messages in thread
From: Ben LaHaise @ 2001-07-05  7:35 UTC (permalink / raw)
  To: Ragnar Kjørstad; +Cc: linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

On Thu, 5 Jul 2001, Ragnar Kjørstad wrote:

> What do you mean?
> Is it not feasible to fix this in LVM as well, or do you just not know
> what needs to be done to LVM?

Fixing LVM is not on the radar of my priorities.  The code is sorely in
need of a rewrite and violates several of the basic planning tenets that
any good code in the block layer should follow.  Namely, it should have 1)
planned on supporting 64 bit offsets, 2) never used multiplication,
division or modulus on block numbers, and 3) never allocated memory
structures that are indexed by block numbers.  LVM failed on all three of
these -- and this is just what I noticed in a quick 5 minute glance
through the code.  Sorry, but LVM is obsolete by design.  It will continue
to work on 32 bit block devices, but if you try to use it beyond that, it
will fail.  That said, we'll have to make sure these failures are graceful
and occur before the user has a chance of losing any data.
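
To illustrate point 2 (a sketch only, not code from any of the patches):
with power-of-two chunk and disk counts, the remapping a striping driver
has to do reduces to shifts and masks, so no 64 bit division is ever
needed on a block number:

#include <stdint.h>

typedef uint64_t blkoff_t;      /* the CONFIG_BLKOFF_LONGLONG case from the patch */

/*
 * Map a logical block onto (disk, block-on-that-disk) for a simple
 * stripe set.  chunk_shift = log2(blocks per chunk), ndisks_shift =
 * log2(number of disks), both assumed powers of two here.
 */
static void stripe_map(blkoff_t block, unsigned int chunk_shift,
                       unsigned int ndisks_shift,
                       unsigned int *disk, blkoff_t *dev_block)
{
        blkoff_t chunk  = block >> chunk_shift;                         /* block / chunk_size */
        blkoff_t offset = block & (((blkoff_t)1 << chunk_shift) - 1);   /* block % chunk_size */

        *disk      = chunk & ((1u << ndisks_shift) - 1);                /* chunk % ndisks */
        *dev_block = ((chunk >> ndisks_shift) << chunk_shift) + offset; /* offset on that disk */
}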

Now, thankfully there are alternatives like ELVM, which are working on
getting the details right from the lessons learned.  Given that, I think
we'll be in good shape during the 2.5 cycle.

		-ben



* Re: [PATCH] 64 bit scsi read/write
  2001-07-05  7:35       ` Ben LaHaise
@ 2001-07-13 18:20         ` Albert D. Cahalan
  2001-07-13 20:41           ` Andreas Dilger
  0 siblings, 1 reply; 71+ messages in thread
From: Albert D. Cahalan @ 2001-07-13 18:20 UTC (permalink / raw)
  To: Ben LaHaise
  Cc: Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

Ben LaHaise writes:
> On Thu, 5 Jul 2001, Ragnar Kjørstad wrote:

>> What do you mean?
>> Is it not feasible to fix this in LVM as well, or do you just not know
>> what needs to be done to LVM?
>
> Fixing LVM is not on the radar of my priorities.  The code is sorely in
> need of a rewrite and violates several of the basic planning tenents that
> any good code in the block layer should follow.  Namely, it should have 1)
> planned on supporting 64 bit offsets, 2) never used multiplication,
> division or modulus on block numbers, and 3) don't allocate memory
> structures that are indexed by block numbers.  LVM failed on all three of
> these -- and this si just what I noticed in a quick 5 minute glance
> through the code.  Sorry, but LVM is obsolete by design.  It will continue
> to work on 32 bit block devices, but if you try to use it beyond that, it
> will fail.  That said, we'll have to make sure these failures are graceful
> and occur prior to the user having a chance at loosing any data.
> 
> Now, thankfully there are alternatives like ELVM, which are working on
> getting the details right from the lessons learned.  Given that, I think
> we'll be in good shape during the 2.5 cycle.

How can any of this even work?

Say I have N disks, mirrored, or maybe with parity. I'm trying
to have a reliable system. I change a file. The write goes out
to my disks, and power is lost. Some number M, such that 0<M<N,
of the disks are written before the power loss. The rest of the
disks don't complete the write. Maybe worse, this is more than
one sector, and some disks have partial writes.

Doesn't RAID need a journal or the phase-tree algorithm?
How does one tell what data is old and what data is new?



* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 18:20         ` Albert D. Cahalan
@ 2001-07-13 20:41           ` Andreas Dilger
  2001-07-13 21:07             ` Chris Wedgwood
  2001-07-13 21:14             ` Alan Cox
  0 siblings, 2 replies; 71+ messages in thread
From: Andreas Dilger @ 2001-07-13 20:41 UTC (permalink / raw)
  To: Albert D. Cahalan
  Cc: Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

Albert writes:
> How does can any of this even work?
> 
> Say I have N disks, mirrored, or maybe with parity. I'm trying
> to have a reliable system. I change a file. The write goes out
> to my disks, and power is lost. Some number M, such that 0<M<N,
> of the disks are written before the power loss. The rest of the
> disks don't complete the write. Maybe worse, this is more than
> one sector, and some disks have partial writes.
> 
> Doesn't RAID need a journal or the phase-tree algorithm?
> How does one tell what data is old and what data is new?

Yes, RAID should have a journal or other ordering enforcement, but
it really isn't any worse in this regard than a single disk.  Even
on a single disk you don't have any guarantees of data ordering, so
if you change the file and the power is lost, some of the sectors
will make it to disk and some will not => fsck, with possible data
corruption or loss.

That's why journaled filesystems have a multi-stage commit of I/O,
first to the journal and then to the disk, so there is no chance of
corrupting the metadata; and if you journal data also, the data cannot
be corrupted (but some may be lost).

RAID 5 throws a wrench into this by not guaranteeing that all of the
blocks in a stripe are consistent (you don't know which blocks and/or
parity were written and which not).  Ideally, you want a multi-stage
commit for RAID as well, so that you write the data first, and the
parity afterwards (so on reboot you trust the data first, and not the
parity).  You have a problem if there is a bad disk and you crash.

With a data-journaled fs you don't care what RAID does because the fs
journal knows which transactions were in progress.  If an I/O was being
written into the journal and did not complete, it is discarded.  If it
was written into the journal and did not finish the write into the fs,
it will re-write it on recovery.  In both cases you don't care if the
RAID finished the write or not.
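
As a rough sketch of that recovery rule (the structures below are invented
and bear no resemblance to ext3's real on-disk format):

#include <stdint.h>

/* Invented, simplified journal record -- for illustration only. */
struct jrec {
        uint64_t blocknr;       /* final location of the block in the fs */
        int      committed;     /* did the commit record reach the journal? */
        const void *data;       /* journalled copy of the block contents */
};

/*
 * Replay per the rule above: a transaction without a commit record is
 * discarded; a committed one is (re)written to its final location, so
 * it does not matter whether the earlier in-place write ever finished.
 */
static void journal_replay(struct jrec *recs, int n,
                           void (*write_block)(uint64_t blocknr, const void *data))
{
        int i;

        for (i = 0; i < n; i++) {
                if (!recs[i].committed)
                        continue;       /* incomplete transaction: drop it */
                write_block(recs[i].blocknr, recs[i].data);
        }
}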

Note that LVM (the original topic) does NOT do any RAID stuff at all;
it is just a virtually contiguous disk, made up of one or more real
disks (or stacked on top of RAID).

Cheers, Andreas
-- 
Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
                 \  would they cancel out, leaving him still hungry?"
http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert


* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 20:41           ` Andreas Dilger
@ 2001-07-13 21:07             ` Chris Wedgwood
  2001-07-13 22:04               ` Andreas Dilger
  2001-07-13 21:14             ` Alan Cox
  1 sibling, 1 reply; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-13 21:07 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

On Fri, Jul 13, 2001 at 02:41:52PM -0600, Andreas Dilger wrote:

    Yes, RAID should have a journal or other ordering enforcement, but
    it really isn't any worse in this regard than a single disk.  Even
    on a single disk you don't have any guarantees of data ordering,
    so if you change the file and the power is lost, some of the
    sectors will make it to disk and some will not => fsck, with
    possible data corrpution or loss.

How so? On a single disk you can either disable write-caching or for
SCSI disks you can use barriers of sorts.

At which time, you can either assume a sector is written or not.


   --cw


* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 20:41           ` Andreas Dilger
  2001-07-13 21:07             ` Chris Wedgwood
@ 2001-07-13 21:14             ` Alan Cox
  2001-07-14  3:23               ` Andrew Morton
  1 sibling, 1 reply; 71+ messages in thread
From: Alan Cox @ 2001-07-13 21:14 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

> RAID 5 throws a wrench into this by not guaranteeing that all of the
> blocks in a stripe are consistent (you don't know which blocks and/or
> parity were written and which not).  Ideally, you want a multi-stage
> commit for RAID as well, so that you write the data first, and the
> parity afterwards (so on reboot you trust the data first, and not the
> parity).  You have a problem if there is a bad disk and you crash.

Well, to be honest, so does most disk firmware, IDE especially.  For one thing
the logical sector size the drive writes need not match the illusions
provided upstream, and the write flush commands are frequently not implemented
because they damage benchmarketing numbers from folks like ZDNet.




* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 21:07             ` Chris Wedgwood
@ 2001-07-13 22:04               ` Andreas Dilger
  2001-07-14  0:49                 ` Jonathan Lundell
  2001-07-14 12:27                 ` Paul Jakma
  0 siblings, 2 replies; 71+ messages in thread
From: Andreas Dilger @ 2001-07-13 22:04 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

Chris writes:
> On Fri, Jul 13, 2001 at 02:41:52PM -0600, Andreas Dilger wrote:
> 
>     Yes, RAID should have a journal or other ordering enforcement, but
>     it really isn't any worse in this regard than a single disk.  Even
>     on a single disk you don't have any guarantees of data ordering,
>     so if you change the file and the power is lost, some of the
>     sectors will make it to disk and some will not => fsck, with
>     possible data corrpution or loss.
> 
> How so? On a single disk you can either disable write-caching or for
> SCSI disks you can use barriers of sorts.
> 
> At which time, you can either assume a sector is written or not.

Well, I _think_ your statement is only true if you are using rawio.
Otherwise, you have a minimum block size of 1kB (for filesystems at
least) so you can't write less than that, and you could potentially
write one sector and not another.

I'm not sure of the exact MD RAID implementation, but I suspect that
if you write a single sector*, it will be exactly the same situation.
However, it also has to write the parity to disk, so if you crash at
this point what you get back depends on the RAID implementation**.

As Alan said in another reply, with IDE disks, you have no guarantee
about write caching on the disk, even if you try to turn it off.

If you are doing synchronous I/O from your application, then I don't
think a RAID write will complete until all of the data+parity I/O
is complete, so you should again be as safe as with a single disk.

If you want safety, but async I/O, use ext3 with full data journaling
and a large journal.  Andrew Morton has just done some testing with
this and the performance is very good, as long as your journal is big
enough to hold your largest write bursts, and you have < 50% duty
cycle for disk I/O (i.e. you have to have enough spare I/O bandwidth
to write everything to disk twice, but it will go to the journal in a
single contiguous (synchronous) write and can go to the filesystem
asynchronously at a later time when there is no other I/O).  If you
put your journal on NVRAM, you will have blazing synchronous I/O.

Cheers, Andreas

*) You _may_ be limited to a larger minimum write, depending on the stripe
   size, I haven't looked closely at the code.  AFAIK, MD RAID does not
   let you stripe a single sector across multiple disks (nor would you
   want to), so all disk I/O would still be one or more single sector I/Os
   to one or more disks.  This means the sector I/O to each individual
   disk is still atomic, so it is not any worse than writes to a single
   disk (the parity is NOT atomic, but then you don't have parity at
   all on a single disk...).

**) As I said in my previous posting, it depends on if/how MD RAID does
   write ordering of I/O to the data sector and the parity sector.  If
   it holds back the parity write until the data I/O(s) are complete, and
   trusts the data over parity on recovery, you should be OK unless you
   have multiple failures (i.e. bad disk + crash).  If it doesn't do this
   ordering, or trusts parity over data, then you are F***ed (I doubt it
   would have this problem).
-- 
Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
                 \  would they cancel out, leaving him still hungry?"
http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert


* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 22:04               ` Andreas Dilger
@ 2001-07-14  0:49                 ` Jonathan Lundell
  2001-07-14 12:27                 ` Paul Jakma
  1 sibling, 0 replies; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-14  0:49 UTC (permalink / raw)
  To: Andreas Dilger, Chris Wedgwood
  Cc: Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

At 4:04 PM -0600 2001-07-13, Andreas Dilger wrote:
>**) As I said in my previous posting, it depends on if/how MD RAID does
>    write ordering of I/O to the data sector and the parity sector.  If
>    it holds back the parity write until the data I/O(s) are complete, and
>    trusts the data over parity on recovery, you should be OK unless you
>    have multiple failures (i.e. bad disk + crash).  If it doesn't do this
>    ordering, or trusts parity over data, then you are F***ed (I doubt it
>    would have this problem).

That wouldn't help, would it, if >1 data sector were being written?

The fault mode of a sector simply not being written seems like a real
weak point of both RAID-1 and RAID-5.  Not that RAID-5 parity ever
gets checked, I think, under normal circumstances, nor RAID-1 mirrors
compared, but if they were checked and there was a parity or
mirror-compare error and no other indication of a fault (e.g. CRC),
there would be no way to recover the correct data.
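
For what such a check would involve (a sketch only; the chunk size and the
function here are invented for illustration):

#include <stdint.h>
#include <string.h>

#define CHUNK 4096

/*
 * A scrub pass would XOR the data chunks of a stripe and compare the
 * result against the stored parity.  A mismatch can be detected, but
 * with no CRC or other hint there is no way to tell which member holds
 * the stale copy.
 */
static int stripe_parity_ok(const unsigned char *data[], int ndata,
                            const unsigned char *parity)
{
        unsigned char calc[CHUNK];
        int d;
        size_t i;

        memset(calc, 0, CHUNK);
        for (d = 0; d < ndata; d++)
                for (i = 0; i < CHUNK; i++)
                        calc[i] ^= data[d][i];

        return memcmp(calc, parity, CHUNK) == 0;
}
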
-- 
/Jonathan Lundell.


* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 21:14             ` Alan Cox
@ 2001-07-14  3:23               ` Andrew Morton
  2001-07-14  8:45                 ` Alan Cox
  0 siblings, 1 reply; 71+ messages in thread
From: Andrew Morton @ 2001-07-14  3:23 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

Alan Cox wrote:
> 
> > RAID 5 throws a wrench into this by not guaranteeing that all of the
> > blocks in a stripe are consistent (you don't know which blocks and/or
> > parity were written and which not).  Ideally, you want a multi-stage
> > commit for RAID as well, so that you write the data first, and the
> > parity afterwards (so on reboot you trust the data first, and not the
> > parity).  You have a problem if there is a bad disk and you crash.
> 
> Well to be honest so does most disk firmware. IDE especially. For one thing
> the logical sector size the drives writes need not match the illusions
> provided upstream, and the write flush commands are frequently not implemented
> because they damage benchmarketing numbers from folks like Zdnet..

If, after a power outage, the IDE disk can keep going for long enough
to write its write cache out to the reserved vendor area (which will
only take 20-30 milliseconds) then the data may be considered *safe*
as soon as it hits writecache.

In which case it is perfectly legitimate and sensible for the drive
to ignore flush commands, and to ack data as soon as it hits cache.

Yes?

If I'm right then the only open question is: which disks do and
do not do the right thing when the lights go out.

-


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14  3:23               ` Andrew Morton
@ 2001-07-14  8:45                 ` Alan Cox
  2001-07-14 14:50                   ` Chris Wedgwood
                                     ` (4 more replies)
  0 siblings, 5 replies; 71+ messages in thread
From: Alan Cox @ 2001-07-14  8:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alan Cox, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

> If, after a power outage, the IDE disk can keep going for long enough
> to write its write cache out to the reserved vendor area (which will
> only take 20-30 milliseconds) then the data may be considered *safe*
> as soon as it hits writecache.

Hohohoho.

> In which case it is perfectly legitimate and sensible for the drive
> to ignore flush commands, and to ack data as soon as it hits cache.

Since the flushing commands are 'optional' it can legitimately ignore them

> If I'm right then the only open question is: which disks do and
> do not do the right thing when the lights go out.

As far as I can tell none of them at least in the IDE world

Alan



* Re: [PATCH] 64 bit scsi read/write
  2001-07-13 22:04               ` Andreas Dilger
  2001-07-14  0:49                 ` Jonathan Lundell
@ 2001-07-14 12:27                 ` Paul Jakma
  2001-07-14 14:48                   ` Chris Wedgwood
  2001-07-16 18:53                   ` Andreas Dilger
  1 sibling, 2 replies; 71+ messages in thread
From: Paul Jakma @ 2001-07-14 12:27 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: linux-kernel

On Fri, 13 Jul 2001, Andreas Dilger wrote:

> put your journal on NVRAM, you will have blazing synchronous I/O.

So ext3 supports having the journal somewhere else, then.  Question: can
the journal be on tmpfs?

> Cheers, Andreas

--paulj



* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 12:27                 ` Paul Jakma
@ 2001-07-14 14:48                   ` Chris Wedgwood
  2001-07-14 15:42                     ` Paul Jakma
  2001-07-16 18:53                   ` Andreas Dilger
  1 sibling, 1 reply; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-14 14:48 UTC (permalink / raw)
  To: Paul Jakma; +Cc: Andreas Dilger, linux-kernel

On Sat, Jul 14, 2001 at 01:27:37PM +0100, Paul Jakma wrote:

    so ext3 supports having the journal somewhere else then. question: can
    the journal be on tmpfs?

*why* would you want to do this?


  --cw


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14  8:45                 ` Alan Cox
@ 2001-07-14 14:50                   ` Chris Wedgwood
  2001-07-14 20:11                     ` Daniel Phillips
  2001-07-14 15:41                   ` Jonathan Lundell
                                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-14 14:50 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:

    As far as I can tell none of them at least in the IDE world

SCSI disks must, or at least some... if not, how do people like NetApp
get these cool HA certifications?



  --cw


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14  8:45                 ` Alan Cox
  2001-07-14 14:50                   ` Chris Wedgwood
@ 2001-07-14 15:41                   ` Jonathan Lundell
  2001-07-14 17:00                     ` Chris Wedgwood
  2001-07-14 17:33                   ` Jonathan Lundell
                                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-14 15:41 UTC (permalink / raw)
  To: Chris Wedgwood, Alan Cox
  Cc: Andrew Morton, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

At 2:50 AM +1200 2001-07-15, Chris Wedgwood wrote:
>On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:
>
>     As far as I can tell none of them at least in the IDE world
>
>SCSI disk must, or at least some... if not, how to peopel like NetApp
>get these cool HA certifications?

NetApp uses a large system-local NVRAM buffer, do they not?
-- 
/Jonathan Lundell.


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 14:48                   ` Chris Wedgwood
@ 2001-07-14 15:42                     ` Paul Jakma
  2001-07-14 17:18                       ` Chris Wedgwood
  2001-07-20 17:03                       ` Stephen C. Tweedie
  0 siblings, 2 replies; 71+ messages in thread
From: Paul Jakma @ 2001-07-14 15:42 UTC (permalink / raw)
  To: Chris Wedgwood; +Cc: Andreas Dilger, linux-kernel

On Sun, 15 Jul 2001, Chris Wedgwood wrote:

> *why* would you want to to do this?

:)

to test the performance advantage of a journal in RAM before going to spend
money on NVRAM...

>   --cw

--paulj



* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 15:41                   ` Jonathan Lundell
@ 2001-07-14 17:00                     ` Chris Wedgwood
  0 siblings, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-14 17:00 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

On Sat, Jul 14, 2001 at 08:41:52AM -0700, Jonathan Lundell wrote:

    NetApp uses a large system-local NVRAM buffer, do they not?

Yes... and for clusters it's shared via some kind of NUMA interconnect.
Anyhow, that doesn't prevent disk/fs corruption on its own; I suspect it
might be one of the reasons they use RAID 4 and not RAID 5 (plus they
also get better LVM management).



  --cw



* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 15:42                     ` Paul Jakma
@ 2001-07-14 17:18                       ` Chris Wedgwood
  2001-07-20 17:03                       ` Stephen C. Tweedie
  1 sibling, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-14 17:18 UTC (permalink / raw)
  To: Paul Jakma; +Cc: Andreas Dilger, linux-kernel

tmpfs is going to be _much_ faster than any external bus-connected
NVRAM solution.

Create a RAM disk on a PCI-connected video card and journal to that to
compare if you like (PCI bulk writes suck for speed).




  --cw





On Sat, Jul 14, 2001 at 04:42:04PM +0100, Paul Jakma wrote:
    On Sun, 15 Jul 2001, Chris Wedgwood wrote:
    
    > *why* would you want to to do this?
    
    :)
    
    to test performance advantage of journal on RAM before going to spend
    money on NVRAM...
    
    >   --cw
    
    --paulj
    


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14  8:45                 ` Alan Cox
  2001-07-14 14:50                   ` Chris Wedgwood
  2001-07-14 15:41                   ` Jonathan Lundell
@ 2001-07-14 17:33                   ` Jonathan Lundell
  2001-07-15  4:02                     ` Chris Wedgwood
  2001-07-15  5:46                     ` Jonathan Lundell
  2001-07-15 17:10                   ` Chris Wedgwood
  2001-07-15 17:39                   ` Jonathan Lundell
  4 siblings, 2 replies; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-14 17:33 UTC (permalink / raw)
  To: Alan Cox, Andrew Morton
  Cc: Alan Cox, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

At 9:45 AM +0100 2001-07-14, Alan Cox wrote:
>  > If, after a power outage, the IDE disk can keep going for long enough
>>  to write its write cache out to the reserved vendor area (which will
>>  only take 20-30 milliseconds) then the data may be considered *safe*
>>  as soon as it hits writecache.
>
>Hohohoho.
>
>>  In which case it is perfectly legitimate and sensible for the drive
>>  to ignore flush commands, and to ack data as soon as it hits cache.
>
>Since the flushing commands are 'optional' it can legitimately ignore them
>
>>  If I'm right then the only open question is: which disks do and
>>  do not do the right thing when the lights go out.
>
>As far as I can tell none of them at least in the IDE world

It's not so great in the SCSI world either. Here's a bit from the 
Ultrastar 73LZX functional spec (this is the current-technology 
Ultra160 73GB family):

>5.0 Data integrity
>The drive retains recorded information under all non-write operations.
>No more than one sector will be lost by power down during write 
>operation while write cache is
>disabled.
>If power down occurs before completion of data transfer from write 
>cache to disk while write cache is
>enabled, the data remaining in write cache will be lost. To prevent 
>this data loss at power off, the
>following action is recommended:
>* Confirm successful completion of SYNCHRONIZE CACHE (35h) command.

What's worse, though the spec is not explicit on this point, it 
appears that the write cache is lost on a SCSI reset, which is 
typically used by drivers for last-resort error recovery. And of 
course a SCSI bus reset affects all the drives on the bus, not just 
the offending one.
-- 
/Jonathan Lundell.


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 14:50                   ` Chris Wedgwood
@ 2001-07-14 20:11                     ` Daniel Phillips
  2001-07-15  1:21                       ` Andrew Morton
  2001-07-15  3:36                       ` Chris Wedgwood
  0 siblings, 2 replies; 71+ messages in thread
From: Daniel Phillips @ 2001-07-14 20:11 UTC (permalink / raw)
  To: Chris Wedgwood, Alan Cox
  Cc: Andrew Morton, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

On Saturday 14 July 2001 16:50, Chris Wedgwood wrote:
> On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:
>
>     As far as I can tell none of them at least in the IDE world
>
> SCSI disk must, or at least some... if not, how to peopel like NetApp
> get these cool HA certifications?

Atomic commit.  The superblock, which references the updated version 
of the filesystem, carries a sequence number and a checksum.  It is 
written to one of two alternating locations.  On restart, both
locations are read and the highest numbered superblock with a correct
checksum is chosen as the new filesystem root.
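
A sketch of that selection logic (the structure layout here is invented,
purely for illustration):

#include <stddef.h>
#include <stdint.h>

/* Illustrative layout only -- not any real filesystem's superblock. */
struct super {
        uint64_t sequence;      /* bumped on every commit */
        uint32_t checksum;      /* covers the rest of the superblock */
        /* ... pointers to the filesystem root, free space info, ... */
};

/*
 * Pick the newest superblock whose checksum verifies.  Because the two
 * copies alternate, a crash can tear at most one of them, and the
 * survivor still points at a consistent tree.
 */
static struct super *pick_root(struct super *a, struct super *b,
                               int (*csum_ok)(const struct super *))
{
        int a_ok = csum_ok(a), b_ok = csum_ok(b);

        if (a_ok && b_ok)
                return a->sequence > b->sequence ? a : b;
        if (a_ok)
                return a;
        if (b_ok)
                return b;
        return NULL;            /* both torn: unrecoverable */
}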

--
Daniel


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 20:11                     ` Daniel Phillips
@ 2001-07-15  1:21                       ` Andrew Morton
  2001-07-15  1:53                         ` Daniel Phillips
  2001-07-15  3:36                       ` Chris Wedgwood
  1 sibling, 1 reply; 71+ messages in thread
From: Andrew Morton @ 2001-07-15  1:21 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: linux-fsdevel, linux-kernel, linux-lvm

Daniel Phillips wrote:
> 
> On Saturday 14 July 2001 16:50, Chris Wedgwood wrote:
> > On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:
> >
> >     As far as I can tell none of them at least in the IDE world
> >
> > SCSI disk must, or at least some... if not, how to peopel like NetApp
> > get these cool HA certifications?
> 
> Atomic commit.  The superblock, which references the updated version
> of the filesystem, carries a sequence number and a checksum.  It is
> written to one of two alternating locations.  On restart, both
> locations are read and the highest numbered superblock with a correct
> checksum is chosen as the new filesystem root.

But this assumes that it is the most-recently-written sector/block
which gets lost in a power failure.

The disk will be reordering writes - so when it fails it may have
written the commit block but *not* the data which that block is
committing.

You need a barrier or a full synchronous flush prior to writing
the commit block.  A `don't-reorder-past-me' barrier is very much
preferable, of course.
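
In userspace terms the required ordering is something like the sketch below,
with fsync() standing in for whatever flush or barrier the driver actually
provides (the function and its arguments are invented for illustration):

#include <unistd.h>
#include <sys/types.h>

/*
 * The commit record must not reach stable storage before the data it
 * commits; each fsync() here marks a point the device may not reorder
 * writes across.
 */
static int ordered_commit(int fd,
                          const void *data, size_t data_len, off_t data_off,
                          const void *rec, size_t rec_len, off_t rec_off)
{
        if (pwrite(fd, data, data_len, data_off) != (ssize_t)data_len)
                return -1;
        if (fsync(fd) != 0)             /* barrier: data is on the platter */
                return -1;
        if (pwrite(fd, rec, rec_len, rec_off) != (ssize_t)rec_len)
                return -1;
        return fsync(fd);               /* make the commit record durable too */
}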

-


* Re: [PATCH] 64 bit scsi read/write
  2001-07-15  1:21                       ` Andrew Morton
@ 2001-07-15  1:53                         ` Daniel Phillips
  0 siblings, 0 replies; 71+ messages in thread
From: Daniel Phillips @ 2001-07-15  1:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel, linux-kernel, linux-lvm

On Sunday 15 July 2001 03:21, Andrew Morton wrote:
> Daniel Phillips wrote:
> > On Saturday 14 July 2001 16:50, Chris Wedgwood wrote:
> > > On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:
> > >
> > >     As far as I can tell none of them at least in the IDE world
> > >
> > > SCSI disk must, or at least some... if not, how to peopel like
> > > NetApp get these cool HA certifications?
> >
> > Atomic commit.  The superblock, which references the updated
> > version of the filesystem, carries a sequence number and a
> > checksum.  It is written to one of two alternating locations.  On
> > restart, both locations are read and the highest numbered
> > superblock with a correct checksum is chosen as the new filesystem
> > root.
>
> But this assumes that it is the most-recently-written sector/block
> which gets lost in a power failure.
>
> The disk will be reordering writes - so when it fails it may have
> written the commit block but *not* the data which that block is
> committing.
>
> You need a barrier or a full synchronous flush prior to writing
> the commit block.  A `don't-reorder-past-me' barrier is very much
> preferable, of course.

Oh yes, absolutely, that's very much part of the puzzle.  Any disk
that doesn't support a real write barrier or write cache flush is
fundamentally broken as far as failsafe operation goes.  A disk that
claims to provide such support and doesn't is an even worse offender.
I find Alan's comment there worrisome.  We need to know which disks
deliver on this and which don't.

--
Daniel


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 20:11                     ` Daniel Phillips
  2001-07-15  1:21                       ` Andrew Morton
@ 2001-07-15  3:36                       ` Chris Wedgwood
  2001-07-15  6:05                         ` John Alvord
  2001-07-15 13:44                         ` Daniel Phillips
  1 sibling, 2 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15  3:36 UTC (permalink / raw)
  To: Daniel Phillips
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

On Sat, Jul 14, 2001 at 10:11:30PM +0200, Daniel Phillips wrote:

    Atomic commit.  The superblock, which references the updated
    version of the filesystem, carries a sequence number and a
    checksum.  It is written to one of two alternating locations.  On
    restart, both locations are read and the highest numbered
    superblock with a correct checksum is chosen as the new filesystem
    root.

Yes... and whichever part of the superblock contains the sequence
number must be written atomically.

The point is, you _NEED_ to be sure that data written before the
superblock (or indeed anywhere further up the tree; in theory you can
make changes which don't require super-block updates) is written
firmly to the platters before anything which refers to it is updated.

Alan was saying that with IDE you cannot reliably do this; my point was
that I assume you can with SCSI.



  --cw


* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 17:33                   ` Jonathan Lundell
@ 2001-07-15  4:02                     ` Chris Wedgwood
  2001-07-15  5:46                     ` Jonathan Lundell
  1 sibling, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15  4:02 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

On Sat, Jul 14, 2001 at 10:33:44AM -0700, Jonathan Lundell wrote:

    What's worse, though the spec is not explicit on this point, it
    appears that the write cache is lost on a SCSI reset, which is
    typically used by drivers for last-resort error recovery. And of
    course a SCSI bus reset affects all the drives on the bus, not
    just the offending one.

Doesn't SCSI have a notion of write barriers?

Even if this is required, the above still works because for anything
requiring a barrier, you wait for a positive SYNCHRONIZE CACHE.



  --cw




* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 17:33                   ` Jonathan Lundell
  2001-07-15  4:02                     ` Chris Wedgwood
@ 2001-07-15  5:46                     ` Jonathan Lundell
  1 sibling, 0 replies; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-15  5:46 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

At 4:02 PM +1200 2001-07-15, Chris Wedgwood wrote:
>On Sat, Jul 14, 2001 at 10:33:44AM -0700, Jonathan Lundell wrote:
>
>     What's worse, though the spec is not explicit on this point, it
>     appears that the write cache is lost on a SCSI reset, which is
>     typically used by drivers for last-resort error recovery. And of
>     course a SCSI bus reset affects all the drives on the bus, not
>     just the offending one.
>
>Doesn't SCSI have a notion of write barriers?
>
>Even if this is required, the above still works because for anything
>requiring a barrier, you wait for a positive SYNCHRONIZE CACHE

Sure, if you keep all your write buffers around until then, so you 
can re-write if the sync fails. And if you don't crash in the 
meantime.
-- 
/Jonathan Lundell.


* Re: [PATCH] 64 bit scsi read/write
  2001-07-15  3:36                       ` Chris Wedgwood
@ 2001-07-15  6:05                         ` John Alvord
  2001-07-15  6:07                           ` Chris Wedgwood
  2001-07-15 13:44                         ` Daniel Phillips
  1 sibling, 1 reply; 71+ messages in thread
From: John Alvord @ 2001-07-15  6:05 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: Daniel Phillips, Alan Cox, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm



On Sun, 15 Jul 2001, Chris Wedgwood wrote:

> On Sat, Jul 14, 2001 at 10:11:30PM +0200, Daniel Phillips wrote:
> 
>     Atomic commit.  The superblock, which references the updated
>     version of the filesystem, carries a sequence number and a
>     checksum.  It is written to one of two alternating locations.  On
>     restart, both locations are read and the highest numbered
>     superblock with a correct checksum is chosen as the new filesystem
>     root.
> 
> Yes... and which ever part of the superblock contains the sequence
> number must be written atomically.
> 
> The point is, you _NEED_ to be sure that data written before the
> superblock (or indeed anywhere further up the tree, you can make
> changes in theory which don't require super-block updates) are written
> firmly to the platters before any thing which refers to it is updated.
> 
> Alan was saying with IDE you cannot reliably do this, I assume you can
> with SCSI was my point.

In the IBM solution to this (1977-78, VM/CMS) the critical data was
written at the beginning and the end of the block. If the two data items
didn't match then the block was rejected.
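
Something along these lines (a sketch; the 4KB block layout is invented,
not IBM's):

#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE 4096         /* layout invented for illustration */

/*
 * The same marker (say, a write sequence number) is stored at both
 * ends of the block; if the two copies disagree, the block was torn by
 * a crash mid-write and is rejected.
 */
static int block_is_torn(const unsigned char *block)
{
        uint32_t head, tail;

        memcpy(&head, block, sizeof(head));
        memcpy(&tail, block + BLOCK_SIZE - sizeof(tail), sizeof(tail));
        return head != tail;
}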

john alvord



* Re: [PATCH] 64 bit scsi read/write
  2001-07-15  6:05                         ` John Alvord
@ 2001-07-15  6:07                           ` Chris Wedgwood
  2001-07-15 13:16                             ` Ken Hirsch
  2001-07-17  0:31                             ` Juan Quintela
  0 siblings, 2 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15  6:07 UTC (permalink / raw)
  To: John Alvord
  Cc: Daniel Phillips, Alan Cox, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjxrstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

On Sat, Jul 14, 2001 at 11:05:36PM -0700, John Alvord wrote:

    In the IBM solution to this (1977-78, VM/CMS) the critical data was
    written at the beginning and the end of the block. If the two data items
    didn't match then the block was rejected.

Neat.


Simple and effective.  Presumably you can also checksum the block, and
check that.



  --cw

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15  6:07                           ` Chris Wedgwood
@ 2001-07-15 13:16                             ` Ken Hirsch
  2001-07-15 14:50                               ` Chris Wedgwood
  2001-07-15 22:14                               ` Daniel Phillips
  2001-07-17  0:31                             ` Juan Quintela
  1 sibling, 2 replies; 71+ messages in thread
From: Ken Hirsch @ 2001-07-15 13:16 UTC (permalink / raw)
  To: Chris Wedgwood, John Alvord
  Cc: Daniel Phillips, Alan Cox, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

Chris Wedgwood <cw@f00f.org> wrote:
> On Sat, Jul 14, 2001 at 11:05:36PM -0700, John Alvord wrote:
>
>     In the IBM solution to this (1977-78, VM/CMS) the critical data was
>     written at the beginning and the end of the block. If the two data
>     items didn't match then the block was rejected.
>
> Neat.
>
>
> Simple and effective.  Presumably you can also checksum the block, and
> check that.

The first technique is not sufficient with modern disk controllers, which
may reorder sector writes within a block.  A checksum, especially a robust
CRC32, is sufficient, but rather expensive.
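
For reference, a plain bitwise CRC32 is only a few lines (an
illustrative sketch using the usual 0xEDB88320 reflected polynomial,
not code from any particular filesystem):

#include <stddef.h>

unsigned int crc32_block(const void *data, size_t len)
{
    const unsigned char *p = data;
    unsigned int crc = 0xFFFFFFFFu;
    int i;

    while (len--) {
        crc ^= *p++;                    /* fold in the next byte */
        for (i = 0; i < 8; i++)
            crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
    }
    return ~crc;
}

A table-driven version is faster still, and either way the cost is
small next to the disk write it protects.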

Mohan has a clever technique that is computationally trivial and only uses
one bit per sector: http://www.almaden.ibm.com/u/mohan/ICDE95.pdf

Unfortunately, it's also patented:
http://www.delphion.com/details?pn=US05418940__

Perhaps IBM will clarify their position with respect to free software and
patents in the upcoming conference.




^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15  3:36                       ` Chris Wedgwood
  2001-07-15  6:05                         ` John Alvord
@ 2001-07-15 13:44                         ` Daniel Phillips
  2001-07-15 14:39                           ` Chris Wedgwood
                                             ` (2 more replies)
  1 sibling, 3 replies; 71+ messages in thread
From: Daniel Phillips @ 2001-07-15 13:44 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

On Sunday 15 July 2001 05:36, Chris Wedgwood wrote:
> On Sat, Jul 14, 2001 at 10:11:30PM +0200, Daniel Phillips wrote:
>
>     Atomic commit.  The superblock, which references the updated
>     version of the filesystem, carries a sequence number and a
>     checksum.  It is written to one of two alternating locations.  On
>     restart, both locations are read and the highest numbered
>     superblock with a correct checksum is chosen as the new
> filesystem root.
>
> Yes... and whichever part of the superblock contains the sequence
> number must be written atomically.

The only requirement here is that the checksum be correct.  And sure,
that's not a hard guarantee because, on average, you will get a good
checksum for bad data once every 4 billion power events that mess up
the final superblock transfer.  Let me see, if that happens once a year,
your data should still be good when the warranty on the sun expires.
:-)
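
A minimal sketch of the restart-time selection between the two
locations (the struct fields, read_block() and super_csum() are
illustrative assumptions, not ext2/ext3 code):

struct alt_super {
    unsigned long long seq;   /* commit sequence number, always increasing */
    unsigned int csum;        /* checksum over the rest of the superblock  */
    /* ... pointers to the new filesystem root, etc. ... */
};

extern int read_block(long blocknr, void *buf, int len);
extern unsigned int super_csum(const struct alt_super *s);

/* Returns the index (0 or 1) of the newest valid copy, or -1 if
 * neither location holds a superblock with a correct checksum. */
int pick_superblock(const long loc[2], struct alt_super sb[2])
{
    int i, best = -1;

    for (i = 0; i < 2; i++) {
        if (read_block(loc[i], &sb[i], sizeof(sb[i])) < 0)
            continue;
        if (super_csum(&sb[i]) != sb[i].csum)
            continue;                 /* torn or corrupt copy, skip it */
        if (best < 0 || sb[i].seq > sb[best].seq)
            best = i;
    }
    return best;
}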

> The point is, you _NEED_ to be sure that data written before the
> superblock (or indeed anywhere further up the tree, you can make
> changes in theory which don't require super-block updates) are
> written firmly to the platters before anything which refers to it is
> updated.

Since the updated tree is created non-destructively with respect to
the original tree, the only priority relationship that matters is the
requirement that all blocks of the updated tree be securely committed
before the new superblock is written.

> Alan was saying that with IDE you cannot reliably do this; my point was
> that I assume you can with SCSI.

Surely it can't be that *all* IDE disks can fail in that way?  And it
seems the jury is still out on SCSI, I'm interested to see where that
discussion goes.

--
Daniel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 13:44                         ` Daniel Phillips
@ 2001-07-15 14:39                           ` Chris Wedgwood
  2001-07-15 15:32                             ` Alan Cox
  2001-07-15 15:06                           ` Jonathan Lundell
  2001-07-16  1:08                           ` Albert D. Cahalan
  2 siblings, 1 reply; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15 14:39 UTC (permalink / raw)
  To: Daniel Phillips
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

On Sun, Jul 15, 2001 at 03:44:14PM +0200, Daniel Phillips wrote:

    The only requirement here is that the checksum be correct.  And
    sure, that's not a hard guarantee because, on average, you will
    get a good checksum for bad data once every 4 billion power events
    that mess up the final superblock transfer.  Let me see, if that
    happens once a year, your data should still be good when the
    warranty on the sun expires.  :-)

the sun will probably last a tad longer than that even continuing to
burn hydrogen; if you allow for helium burning, you will probably get
errors sneaking by

    Surely it can't be that *all* IDE disks can fail in that way?  And
    it seems the jury is still out on SCSI, I'm interested to see
    where that discussion goes.

Alan said *ALL* disks appear to lie, and I'm not going to argue with
him :)

I only have SCSI disks to test with, but they are hot-plug, so I guess
I can write a whole bunch of blocks with different numbers on them,
all over the disk, if I can figure out how to place SCSI barriers and
then pull the drive and see what gives?



   --cw

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 13:16                             ` Ken Hirsch
@ 2001-07-15 14:50                               ` Chris Wedgwood
  2001-07-15 22:14                               ` Daniel Phillips
  1 sibling, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15 14:50 UTC (permalink / raw)
  To: Ken Hirsch
  Cc: John Alvord, Daniel Phillips, Alan Cox, Andrew Morton,
	Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

On Sun, Jul 15, 2001 at 09:16:09AM -0400, Ken Hirsch wrote:

    The first technique is not sufficient with modern disk
    controllers, which may reorder sector writes within a block.  A
    checksum, especially a robust CRC32, is sufficient, but rather
    expensive.

So you write the number to the start and end of each sector, or you
only assume sector-wide 'block-sizes' for integrity.

A 32-bit CRC is plenty cheap enough on modern CPUs, especially
considering how often you need to calculate it.

    Mohan has a clever technique that is computationally trivial and
    only uses one bit per sector:
    http://www.almaden.ibm.com/u/mohan/ICDE95.pdf
    
    Unfortunately, it's also patented:
    http://www.delphion.com/details?pn=US05418940__
    
    Perhaps IBM will clarify their position with respect to free
    software and patents in the upcoming conference.

Wow... pretty neat, but fortunately not necessary.



  --cw

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 13:44                         ` Daniel Phillips
  2001-07-15 14:39                           ` Chris Wedgwood
@ 2001-07-15 15:06                           ` Jonathan Lundell
  2001-07-15 15:22                             ` Chris Wedgwood
                                               ` (2 more replies)
  2001-07-16  1:08                           ` Albert D. Cahalan
  2 siblings, 3 replies; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-15 15:06 UTC (permalink / raw)
  To: Chris Wedgwood, Daniel Phillips
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

At 2:39 AM +1200 2001-07-16, Chris Wedgwood wrote:
>On Sun, Jul 15, 2001 at 03:44:14PM +0200, Daniel Phillips wrote:
>
>     The only requirement here is that the checksum be correct.  And
>     sure, that's not a hard guarantee because, on average, you will
>     get a good checksum for bad data once every 4 billion power events
>     that mess up the final superblock transfer.  Let me see, if that
>     happens once a year, your data should still be good when the
>     warranty on the sun expires.  :-)
>
>the sun will probably last a tad longer than that even continuing to
>burn hydrogen; if you allow for helium burning, you will probably get
>errors sneaking by
>
>     Surely it can't be that *all* IDE disks can fail in that way?  And
>     it seems the jury is still out on SCSI, I'm interested to see
>     where that discussion goes.
>
>Alan said *ALL* disks appear to lie, and I'm not going to argue with
>him :)
>
>I only have SCSI disks to test with, but they are hot-plug, so I guess
>I can write a whole bunch of blocks with different numbers on them,
>all over the disk, if I can figure out how to place SCSI barriers and
>then pull the drive and see what gives?

Consider the possibility (probability, I think) that SCSI drives blow 
away their (unwritten) write cache buffers on a SCSI bus reset, and 
that a SCSI bus reset is a routine, albeit last-resort, error 
recovery technique. (It's also necessary; by the time a driver gets 
to a bus reset, all else has failed. It's also, in my experience, not 
especially rare.)

The fix for that particular problem--disabling write caching--is 
simple enough, though it presumably has a performance consequence. A 
second benefit of disabling write caching is that the drive can't 
reorder writes (though of course the system still might).

At first glance, by the way, the only write barrier I see in the SCSI 
command set is the synchronize-cache command, which completes only 
after all the drive's dirty buffers are written out. Of course, 
without write caching, it's not an issue.
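
(For reference, the synchronize-cache CDB itself is trivial; this is a
sketch of the SYNCHRONIZE CACHE (10) byte layout only, not driver code.
A zero starting LBA and a zero block count ask the drive to flush its
entire cache.)

#include <string.h>

static void build_sync_cache_cdb(unsigned char cdb[10])
{
    memset(cdb, 0, 10);
    cdb[0] = 0x35;    /* SYNCHRONIZE CACHE (10) opcode                */
                      /* bytes 2-5: starting LBA = 0                  */
                      /* bytes 7-8: number of blocks = 0 (everything) */
}
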
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 15:06                           ` Jonathan Lundell
@ 2001-07-15 15:22                             ` Chris Wedgwood
  2001-07-15 17:44                             ` Jonathan Lundell
  2001-07-15 17:47                             ` Justin T. Gibbs
  2 siblings, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15 15:22 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: Daniel Phillips, Alan Cox, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

On Sun, Jul 15, 2001 at 08:06:39AM -0700, Jonathan Lundell wrote:

    At first glance, by the way, the only write barrier I see in the
    SCSI command set is the synchronize-cache command, which completes
    only after all the drive's dirty buffers are written out. Of
    course, without write caching, it's not an issue.

Is the spec you have distributable? I believe some of the early drafts
were, but the final spec isn't.

I'd really like to check it out myself; I always assumed SCSI had the
smarts for write-barriers and force-unit-access, but I guess I was
wrong.

Anyhow, I'd like to see the spec for myself if it is something I can
get hold of.



  --cw

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 14:39                           ` Chris Wedgwood
@ 2001-07-15 15:32                             ` Alan Cox
  2001-07-15 15:33                               ` Chris Wedgwood
  2001-07-15 16:24                               ` Chris Wedgwood
  0 siblings, 2 replies; 71+ messages in thread
From: Alan Cox @ 2001-07-15 15:32 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: Daniel Phillips, Alan Cox, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

> I only have SCSI disks to test with, but they are hot-plug, so I guess
> I can write a whole bunch of blocks with different numbers on them,
> all over the disk, if I can figure out how to place SCSI barriers and
> then pull the drive and see what gives?

Another way is to time

	write block
	write barrier
	write same block
	write barrier
	repeat

If the write barrier is working you should be able to measure the drive rpm 8)
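
A quick back-of-the-envelope check: if each barriered rewrite of the
same block costs one full revolution, a 5400 RPM drive should manage
about 90 writes/second, 7200 RPM about 120, and 10000 RPM about 167.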


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 15:32                             ` Alan Cox
@ 2001-07-15 15:33                               ` Chris Wedgwood
  2001-07-15 16:24                               ` Chris Wedgwood
  1 sibling, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15 15:33 UTC (permalink / raw)
  To: Alan Cox
  Cc: Daniel Phillips, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

On Sun, Jul 15, 2001 at 04:32:59PM +0100, Alan Cox wrote:

    Another way is to time
    
    	write block
    	write barrier
    	write same block
    	write barrier
    	repeat
    
    If the write barrier is working you should be able to measure the
    drive rpm 8)

Yeah, I was thinking of doing this with caches turned off, since I
know how to do that, but not a write-barrier.




   --cw

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 15:32                             ` Alan Cox
  2001-07-15 15:33                               ` Chris Wedgwood
@ 2001-07-15 16:24                               ` Chris Wedgwood
  1 sibling, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15 16:24 UTC (permalink / raw)
  To: Alan Cox
  Cc: Daniel Phillips, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

[-- Attachment #1: Type: text/plain, Size: 677 bytes --]

On Sun, Jul 15, 2001 at 04:32:59PM +0100, Alan Cox wrote:

    Another way is to time

    	write block
    	write barrier
    	write same block
    	write barrier
    	repeat

    If the write barrier is working you should be able to measure the
    drive rpm 8)

OK, I just wrote this in order to test just that; test on a raw device
and turn caching off if you can.

For my drives, I cannot disable caching (I don't know if it is on or
not) and I get abysmal speed, but nothing unrealistic.

Anyhow, I just wrote this and tested it a couple of times; if it
breaks or eats your disk, don't bitch at me.

Otherwise, flames and comments on my god-awful code are welcome.



  --cw

[-- Attachment #2: More of Blondie's awful code --]
[-- Type: text/x-csrc, Size: 2687 bytes --]

/*
 * write-bench.c
 *
 * test write-performance to a device for n loops, designed for
 * testing to a raw device where the underlying physical device has
 * caching turned off
 *
 * cw@f00f.org --- Mon Jul 16 04:21:12 NZST 2001
 *
 * USE AT YOUR OWN RISK! NO WARRANTY GIVEN OR IMPLIED
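 *
 * build (one way; sqrt() needs libm): cc -O2 -o write-bench write-bench.c -lm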
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
/* #include <publib.h> */

/* XXX edit for your CPU -- clock rate in Hz, despite the name */
const float MHz = 860.947 * 1000 * 1000;

/* some kind of 64-bit tsc is required */
#ifdef __i386__
#define rdtsc(low,high) \
    __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
#else /* __i386__ */
#error "Please define rdtsc(low32,high32) for your architecture"
#endif /* __i386__ */


int main(int argc,char *argv[])
{
    const int secsize = 512;
    int f, i, loops;
    char *buf;
    double sum = 0.0, sumsq = 0.0, duration;
    union jack {
        unsigned int l[2];
        unsigned long long v;
    }before_io, after_io;

    if(argc != 3){
        fprintf(stderr,"please supply two arguments; "
                "the device and the number of loops\n");
        return 2;
    }

    loops = atoi(argv[2]);

    if(!(buf = malloc(secsize))){
        perror("malloc");
        return 1;
    }
    if(((long)buf) % secsize){
        free(buf);
        if(!(buf = malloc(secsize*2 - 1))){
            perror("malloc");
            return 1;
        }
        /* round up to the next sector boundary; a cast is not an lvalue */
        buf = (char *)(((long)buf + secsize - 1) & ~((long)secsize - 1));
    }
    /* memfill(buf, secsize, "wibble", 6); */
    memset(buf, 'r', secsize);

    if(-1 == (f = open(argv[1], O_RDWR))){
        perror("open");
        return 1;
    }

    rdtsc(before_io.l[0],before_io.l[1]);
    for(i=0;i<loops;++i){
#ifdef PWRITE_WORKS
        if(-1 == pwrite(f, buf, secsize, 0)){
            perror("pwrite");
            return 1;
        }
#else /* PWRITE_WORKS */
        if(-1 == lseek(f, 0, SEEK_SET)){
            perror("lseek");
            return 1;
        }
        if(-1 == write(f, buf, secsize)){
            perror("write");
            return 1;
        }
#endif /* PWRITE_WORKS */
        rdtsc(after_io.l[0],after_io.l[1]);

        duration = (after_io.v - before_io.v) / MHz;
        sum += duration;
        sumsq += duration * duration;

        before_io.v = after_io.v;
    }

    printf("Loops              %-8d\n", loops);
    printf("Total time taken   %-8.4f s\n", sum);
    printf("Average write time %-8.4f ms\n", 1000.0 * sum / loops);
    printf("  (std. dev)       %-8f ms\n", 1000.0 * sqrt(sumsq - sum*sum/loops) / (loops - 1));
    printf("Writes/second      %-8.4f s^-1\n", loops / sum);

    return 0;
}

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-14  8:45                 ` Alan Cox
                                     ` (2 preceding siblings ...)
  2001-07-14 17:33                   ` Jonathan Lundell
@ 2001-07-15 17:10                   ` Chris Wedgwood
  2001-07-15 17:39                   ` Jonathan Lundell
  4 siblings, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-15 17:10 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:

    As far as I can tell none of them at least in the IDE world

Can you test with the code I posted an hour or so ago, please?

I ask this because I tested writes to:

  -- buffered devices

  -- ide with caching on

  -- ide with caching off

  -- scsi (caching on?)

To a buffered device, I get something silly like 63000
writes/second. No big surprises there (other than Linux is bloody lean
these days).

To a SCSI device (10K RPM SCSI-3 160 drive), I get something like 167
writes/second, which seems moderately sane if caching is disabled.

To a cheap IDE drive (5400 RPM?) with caching off, I get about 87
writes/second.

To the same drive, with caching on, I get almost 4000 writes/second.

This seems to imply, at least for my test IDE drive, you can turn
caching off --- and it's about half as fast as my SCSI drives, which
rotate at about twice the speed (sanity check).

IDE drive:  IBM-DTTA-351010, ATA DISK drive
SCSI drive: SEAGATE ST318404LC




   --cw



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-14  8:45                 ` Alan Cox
                                     ` (3 preceding siblings ...)
  2001-07-15 17:10                   ` Chris Wedgwood
@ 2001-07-15 17:39                   ` Jonathan Lundell
  4 siblings, 0 replies; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-15 17:39 UTC (permalink / raw)
  To: Chris Wedgwood, Alan Cox
  Cc: Andrew Morton, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike, kevin

At 5:10 AM +1200 2001-07-16, Chris Wedgwood wrote:
>On Sat, Jul 14, 2001 at 09:45:44AM +0100, Alan Cox wrote:
>
>     As far as I can tell none of them at least in the IDE world
>
>Can you test with the code I posted an hour or so ago, please?

AC's comment was about whether the drive's cache would be written out 
on power failure, which is another issue, a little harder to test 
(and not easily testable by writing a single sector). I raise the 
related question of what happens to the write cache on a bus reset on 
SCSI drives.

>I ask this because I tested writes to:
>
>   -- buffered devices
>
>   -- ide with caching on
>
>   -- ide with caching off
>
>   -- scsi (caching on?)
>
>To a buffered device, I get something silly like 63000
>writes/second. No big surprises there (other than Linux is bloody lean
>these days).
>
>To a SCSI device (10K RPM SCSI-3 160 drive), I get something like 167
>writes/second, which seems moderately sane if caching is disabled.

My impression, based on a little (but not much) research, is that most
SCSI drives disable write caching by default. IBM SCSI drives may be
an exception to this.

>To a cheap IDE drive (5400 RPM?) with caching off, I get about 87
>writes/second.
>
>To the same drive, with caching on, I get almost 4000 writes/second.
>
>This seems to imply, at least for my test IDE drive, you can turn
>caching off --- and it's about half as fast as my SCSI drives, which
>rotate at about twice the speed (sanity check).
>
>IDE drive:  IBM-DTTA-351010, ATA DISK drive
>SCSI drive: SEAGATE ST318404LC


-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 15:06                           ` Jonathan Lundell
  2001-07-15 15:22                             ` Chris Wedgwood
@ 2001-07-15 17:44                             ` Jonathan Lundell
  2001-07-15 17:47                             ` Justin T. Gibbs
  2 siblings, 0 replies; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-15 17:44 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: Daniel Phillips, Alan Cox, Andrew Morton, Andreas Dilger,
	Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad, linux-fsdevel,
	linux-kernel, mike, kevin, linux-lvm

At 3:22 AM +1200 2001-07-16, Chris Wedgwood wrote:
>On Sun, Jul 15, 2001 at 08:06:39AM -0700, Jonathan Lundell wrote:
>
>     At first glance, by the way, the only write barrier I see in the
>     SCSI command set is the synchronize-cache command, which completes
>     only after all the drive's dirty buffers are written out. Of
>     course, without write caching, it's not an issue.
>
>Is the spec you have distributable? I believe some of the early drafts
>were, but the final spec isn't.
>
>I'd really like to check it out myself; I always assumed SCSI had the
>smarts for write-barriers and force-unit-access but I guess I was
>wrong.
>
>Anyhow, I'd like to see the spec for myself if it is something I can
>get hold of.

I was referring to IBM's spec, as implemented in their recent SCSI 
and FC drives. You can find a copy at 
http://www.storage.ibm.com/techsup/hddtech/prodspec/ddyf_spi.pdf

WRITE EXTENDED has a bit (FUA) that will let you force that 
particular write to go to disk immediately, independent of write 
caching, but there's no suggestion that it otherwise acts as a write 
barrier for cached writes.

WRITE VERIFY implies a CACHE SYNCHRONIZE, so it's a write barrier, 
but an expensive (because synchronous) one.
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 15:06                           ` Jonathan Lundell
  2001-07-15 15:22                             ` Chris Wedgwood
  2001-07-15 17:44                             ` Jonathan Lundell
@ 2001-07-15 17:47                             ` Justin T. Gibbs
  2001-07-15 23:14                               ` Rod Van Meter
  2001-07-16  8:56                               ` Chris Wedgwood
  2 siblings, 2 replies; 71+ messages in thread
From: Justin T. Gibbs @ 2001-07-15 17:47 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: Chris Wedgwood, Daniel Phillips, Alan Cox, Andrew Morton,
	Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

>Consider the possibility (probability, I think) that SCSI drives blow 
>away their (unwritten) write cache buffers on a SCSI bus reset, and 
>that a SCSI bus reset is a routine, albeit last-resort, error 
>recovery technique. (It's also necessary; by the time a driver gets 
>to a bus reset, all else has failed. It's also, in my experience, not 
>especially rare.)

I have never seen this to be the case.  The SCSI spec is quite clear
in stating that a bus reset only affects "I/O processes that have not
completed, SCSI device reservations, and SCSI device operating modes".
The soft reset section clarifies the meaning of "completed commands"
as:
	e) An initiator shall consider an I/O process to be completed
	   when it negates ACK for a successfully received COMMAND
	   COMPLETE message.
	f) A target shall consider an I/O process to be completed when
	   it detects the transition of ACK to false for the COMMAND
	   COMPLETE message with the ATN signal false.

As the soft reset section also specifies how to deal with initiators
that are not expecting soft reset semantics, I believe this applies to
either reset model.

If we look at the section on caching for direct access devices we see,
"[write-back cached] data may be lost if power to the device is lost or
a hardware failure occurs".  There is no mention of a bus reset having
any effect on commands already acked as completed to the initiator.

>The fix for that particular problem--disabling write caching--is 
>simple enough, though it presumably has a performance consequence. A 
>second benefit of disabling write caching is that the drive can't 
>reorder writes (though of course the system still might).

Simply disabling the write cache does not guarantee the order of writes.
For one, with tagged I/O and the use of the SIMPLE_Q tag qualifier,
commands may be completed in any order.  If you want some semblance of
order, either disable the write cache or use the FUA bit in all writes,
and use the ORDERED tag qualifier.  Even when using these options,
it is not clear that the drive cannot reorder writes "slightly" to
make track writes more efficient (e.g. two separate commands to write
sequential sectors on the same track may be written in reverse order).

>At first glance, by the way, the only write barrier I see in the SCSI 
>command set is the synchronize-cache command, which completes only 
>after all the drive's dirty buffers are written out. Of course, 
>without write caching, it's not an issue.

The ordered tag qualifier gives you barrier semantics with the caveats
listed above.

--
Justin

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 13:16                             ` Ken Hirsch
  2001-07-15 14:50                               ` Chris Wedgwood
@ 2001-07-15 22:14                               ` Daniel Phillips
  1 sibling, 0 replies; 71+ messages in thread
From: Daniel Phillips @ 2001-07-15 22:14 UTC (permalink / raw)
  To: Ken Hirsch, Chris Wedgwood, John Alvord
  Cc: Alan Cox, Andrew Morton, Andreas Dilger, Albert D. Cahalan,
	Ben LaHaise, Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike,
	kevin, linux-lvm

On Sunday 15 July 2001 15:16, Ken Hirsch wrote:
> Chris Wedgwood <cw@f00f.org> wrote:
> > On Sat, Jul 14, 2001 at 11:05:36PM -0700, John Alvord wrote:
> > >
> > >     In the IBM solution to this (1977-78, VM/CMS) the critical data
> > > was written at the beginning and the end of the block. If the two
> > > data items didn't match then the block was rejected.
> >
> > Neat.
> >
> > Simple and effective.  Presumably you can also checksum the block,
> > and check that.
>
> The first technique is not sufficient with modern disk controllers,
> which may reorder sector writes within a block.  A checksum,
> especially a robust CRC32, is sufficient, but rather expensive.

As somebody else pointed out, not if you don't have to compute it on
every block, as with journalling or atomic commit.

> Mohan has a clever technique that is computationally trivial and only
> uses one bit per sector:
> http://www.almaden.ibm.com/u/mohan/ICDE95.pdf
>
> Unfortunately, it's also patented:
> http://www.delphion.com/details?pn=US05418940__

Fortunately, it's clunky and unappealing compared to the simple 
checksum method, applied only to those blocks that define consistency
points.  I don't think this is patented.  I'd be disturbed if it was,
since it's obvious.

> Perhaps IBM will clarify their position with respect to free software
> and patents in the upcoming conference.

Wouldn't that be nice.  Imagine, IBM comes out and says, we admit it,
patents are a net burden on everybody, even us - from now on, we use
them only against those who use them against us, and we'll put that
in writing.  Right.

--
Daniel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 17:47                             ` Justin T. Gibbs
@ 2001-07-15 23:14                               ` Rod Van Meter
  2001-07-16  0:37                                 ` Jonathan Lundell
  2001-07-16  8:56                               ` Chris Wedgwood
  1 sibling, 1 reply; 71+ messages in thread
From: Rod Van Meter @ 2001-07-15 23:14 UTC (permalink / raw)
  To: Justin T. Gibbs; +Cc: linux-fsdevel, linux-kernel, linux-lvm

I don't have the SCSI spec in front of me (though, as noted, some
drafts are available online; try t10.org somewhere), but as I
understand it (having worked, briefly, for a major disk manufacturer):

You can commit an individual write with the FUA (force unit access)
bit.  The command for this is not WRITE EXTENDED, but WRITE(10) or
WRITE(12).  I don't think WRITE(6) has room for the bit, and WRITE(6)
is useless nowadays, anyway.  WRITE EXTENDED lets you write over the
ECC bits -- it's a raw write to the platter.  Dunno that anyone
implements it any more.
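
As an illustration of where that bit lives, here is a sketch of a
WRITE(10) CDB with FUA set (byte layout only, not driver code):

#include <string.h>

static void build_write10_fua(unsigned char cdb[10],
                              unsigned int lba, unsigned short nblocks)
{
    memset(cdb, 0, 10);
    cdb[0] = 0x2A;                  /* WRITE(10) opcode                  */
    cdb[1] = 0x08;                  /* FUA: force media access           */
    cdb[2] = (lba >> 24) & 0xFF;    /* logical block address, big-endian */
    cdb[3] = (lba >> 16) & 0xFF;
    cdb[4] = (lba >>  8) & 0xFF;
    cdb[5] =  lba        & 0xFF;
    cdb[7] = (nblocks >> 8) & 0xFF; /* transfer length, in blocks        */
    cdb[8] =  nblocks       & 0xFF;
}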

That does NOT get you ordering with respect to other commands.  You
can use the complex tagging stuff to get that, but most disk drives
didn't implement it properly in the SCSI-2 days, and there are
significant differences in SCSI-3.

Otherwise, your choice, as noted, is SYNCHRONIZE CACHE before the root
block write, and after.  AFAIK, all drives treat that the way it's
meant to be done; everything's on platter when you get a COMMAND
COMPLETE back from it, but they weren't necessarily done in order.

Even within a command, I don't believe there is a guarantee that the
blocks will go to platter in order.  Say you write blocks 0-7; the
drive will start the transfer to buffer immediately, as the seek is
begun.  When the seek completes, the write gate will enable writes
from buffer to platter, and a state machine takes care of that.
However, the seek and settle may complete when the head is over block
3, so the first write to platter would be block 4, then 5-7.  This is
followed by almost an entire revolution's delay (*see note) to get back
to block 0, and 3 will be the last block written.

I have had this exact conversation with disk drive folks (of which I
am not one), but I haven't seen the firmware and state machines
myself, so treat this as an educated guess.  The folks I was talking
to may have been wrong, or more likely, misunderstood what I was
asking.

Some manufacturers can put either IDE or SCSI on a drive, and this
behavior is likely to be the same on both.  It may not apply to all
members of a family, and probably doesn't apply across families from
the same manufacturer.

Most disk drives, as recently as two years ago, were a lot dumber than
you think, and I doubt the situation has improved much.  For the most
part, disk manufacturers get paid for capacity, not smarts, but
there's an entire year-long argument there.

	       --Rod

* Note: In theory, that rotational delay doesn't have to be idle.  I
  believe any blocks between 7 and 0 that are also in cache will be
  written as the head passes over them.  Thus, the drive might
  literally interleave writes from multiple commands.  It's also
  possible, in theory, to switch tracks for a short time and come back
  to the first track before block 0 rolls around, but I don't believe
  existing controllers are that sophisticated.

P.S.  I gotta put in another plug here -- you have until Friday to
write this behavior up and submit it as a paper to USENIX FAST --
Conference on File and Storage Technology.  See
http://www.usenix.org/events/fast/

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 23:14                               ` Rod Van Meter
@ 2001-07-16  0:37                                 ` Jonathan Lundell
  2001-07-16 15:11                                   ` Rod Van Meter
  0 siblings, 1 reply; 71+ messages in thread
From: Jonathan Lundell @ 2001-07-16  0:37 UTC (permalink / raw)
  To: rdv, Justin T. Gibbs; +Cc: linux-fsdevel, linux-kernel

At 4:14 PM -0700 2001-07-15, Rod Van Meter wrote:
>You can commit an individual write with the FUA (force unit access)
>bit.  The command for this is not WRITE EXTENDED, but WRITE(10) or
>WRITE(12).  I don't think WRITE(6) has room for the bit, and WRITE(6)
>is useless nowadays, anyway.  WRITE EXTENDED lets you write over the
>ECC bits -- it's a raw write to the platter.  Dunno that anyone
>implements it any more.

WRITE EXTENDED is WRITE(10), I believe. The ECC-writing version is 
WRITE LONG; IBM (at least) implements it.

At 11:47 AM -0600 2001-07-15, Justin T. Gibbs wrote:
>As the soft reset section also specifies how to deal with initiators
>that are not expecting soft reset semantics, I believe this applies to
>either reset model.
>
>If we look at the section on caching for direct access devices we see,
>"[write-back cached] data may be lost if power to the device is lost or
>a hardware failure occurs".  There is no mention of a bus reset having
>any effect on commands already acked as completed to the intiator.

I'd very much like to think so; thanks for the reference. I'd feel a 
little more sanguine about the subject if there were some explicit 
guarantee of the desired behavior, either in the SCSI spec or in an 
implementer's functional spec. Nonetheless, it's testable behavior, 
and it's a reasonable inference that drives should behave correctly. 
Thanks again.
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 13:44                         ` Daniel Phillips
  2001-07-15 14:39                           ` Chris Wedgwood
  2001-07-15 15:06                           ` Jonathan Lundell
@ 2001-07-16  1:08                           ` Albert D. Cahalan
  2001-07-16  8:49                             ` Chris Wedgwood
  2001-07-21 19:18                             ` Alexander Griesser
  2 siblings, 2 replies; 71+ messages in thread
From: Albert D. Cahalan @ 2001-07-16  1:08 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: cw, linux-kernel

Daniel Phillips writes:
> On Sunday 15 July 2001 05:36, Chris Wedgwood wrote:
>> On Sat, Jul 14, 2001 at 10:11:30PM +0200, Daniel Phillips wrote:

>>> Atomic commit.  The superblock, which references the updated
>>> version of the filesystem, carries a sequence number and a
>>> checksum.  It is written to one of two alternating locations.  On
>>> restart, both locations are read and the highest numbered
>>> superblock with a correct checksum is chosen as the new
>>> filesystem root.
>>
>> Yes... and whichever part of the superblock contains the sequence
>> number must be written atomically.
>
> The only requirement here is that the checksum be correct.  And sure,
> that's not a hard guarantee because, on average, you will get a good
> checksum for bad data once every 4 billion power events that mess up
> the final superblock transfer.  Let me see, if that happens once a year,

In a tree-structured filesystem, checksums on everything would only
cost you space similar to the number of pointers you have. Whenever
a non-leaf node points to a child, it can hold a checksum for that
child as well.

This gives a very reliable way to spot filesystem errors, including
corrupt data blocks.
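
A sketch of what that pairing might look like on disk (field names are
purely illustrative, not from any existing filesystem):

struct child_ref {
    unsigned long long blocknr;     /* where the child block lives      */
    unsigned int       csum;        /* checksum of the child's contents */
};

struct tree_node {
    unsigned int     nr_children;
    struct child_ref child[1];      /* checksum+pointer pairs           */
};

/* To follow child[i]: read the block at child[i].blocknr, recompute its
 * checksum, and accept the child only if the result equals
 * child[i].csum.  A mismatch means either the pointer or the child
 * block is corrupt. */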


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-16  1:08                           ` Albert D. Cahalan
@ 2001-07-16  8:49                             ` Chris Wedgwood
  2001-07-21 19:18                             ` Alexander Griesser
  1 sibling, 0 replies; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-16  8:49 UTC (permalink / raw)
  To: Albert D. Cahalan; +Cc: Daniel Phillips, linux-kernel

On Sun, Jul 15, 2001 at 09:08:41PM -0400, Albert D. Cahalan wrote:

    In a tree-structured filesystem, checksums on everything would
    only cost you space similar to the number of pointers you
    have. Whenever a non-leaf node points to a child, it can hold a
    checksum for that child as well.

    This gives a very reliable way to spot filesystem errors,
    including corrupt data blocks.

Actually, this is a really nice concept... have additional checksums
and such floating about. When filesystems get to several terabytes, it
would allow background consistency checking (as checking on boot would
be far too slow).

It would also allow the fs layer to fsck the filesystem _as_ data was
accessed if need be, which would be the case more often.



  --cw


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15 17:47                             ` Justin T. Gibbs
  2001-07-15 23:14                               ` Rod Van Meter
@ 2001-07-16  8:56                               ` Chris Wedgwood
  2001-07-16 13:19                                 ` Daniel Phillips
  1 sibling, 1 reply; 71+ messages in thread
From: Chris Wedgwood @ 2001-07-16  8:56 UTC (permalink / raw)
  To: Justin T. Gibbs
  Cc: Jonathan Lundell, Daniel Phillips, Alan Cox, Andrew Morton,
	Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

On Sun, Jul 15, 2001 at 11:47:10AM -0600, Justin T. Gibbs wrote:

    Simply disabling the write cache does not guarantee the order of
    writes.  For one, with tagged I/O and the use of the SIMPLE_Q tag
    qualifier, commands may be completed in any order.  If you want
    some semblance of order, either disable the write cache or use the
    FUA bit in all writes, and use the ORDERED tag qualifier.  Even
    when using these options, it is not clear that the drive cannot
    reorder writes "slightly" to make track writes more efficient
    (e.g. two separate commands to write sequential sectors on the
    same track may be written in reverse order).

ORDERED sounds like the trick...  I assume this is some kind of
write-barrier? If so, then I assume it has some kind of strict
temporal ordering, even between command issues to the drive.

If so, that would be ideal if we can have the fs communicate this all
the way down to the device layer; making it work for soft-raid and LVM
would perhaps be a little harder.



  --cw

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-16  8:56                               ` Chris Wedgwood
@ 2001-07-16 13:19                                 ` Daniel Phillips
  0 siblings, 0 replies; 71+ messages in thread
From: Daniel Phillips @ 2001-07-16 13:19 UTC (permalink / raw)
  To: Chris Wedgwood, Justin T. Gibbs
  Cc: Jonathan Lundell <jlundell@pobox.com>, Alan Cox,
	Andrew Morton, Andreas Dilger, Albert D. Cahalan, Ben LaHaise,
	Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike, kevin,
	linux-lvm

On Monday 16 July 2001 10:56, Chris Wedgwood wrote:
> On Sun, Jul 15, 2001 at 11:47:10AM -0600, Justin T. Gibbs wrote:
>
>     Simply disabling the write cache does not guarantee the order of
>     writes.  For one, with tagged I/O and the use of the SIMPLE_Q tag
>     qualifier, commands may be completed in any order.  If you want
>     some semblance of order, either disable the write cache or use
> the FUA bit in all writes, and use the ORDERED tag qualifier.  Even
> when using these options, it is not clear that the drive cannot
> reorder writes "slightly" to make track writes more efficient (e.g.
> two separate commands to write sequential sectors on the same track
> may be written in reverse order).
>
> ORDERED sounds like the trick...  I assume this is some kind of
> write-barrier? If so, then I assume it has some kind of strict
> temporal ordering, even between command issues to the drive.
>
> If so, that would be ideal if we can have the fs communicate this all
> the way down to the device layer; making it work for soft-raid and
> LVM would perhaps be a little harder.

There was general agreement amongst filesystem developers at San Jose
that we need some kind of internal interface at the filesystem level
for this, independent of the type of underlying block device - IDE,
SCSI or "other".  That's as far as it got.

--
Daniel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-16  0:37                                 ` Jonathan Lundell
@ 2001-07-16 15:11                                   ` Rod Van Meter
  0 siblings, 0 replies; 71+ messages in thread
From: Rod Van Meter @ 2001-07-16 15:11 UTC (permalink / raw)
  To: Jonathan Lundell; +Cc: rdv, Justin T. Gibbs, linux-fsdevel, linux-kernel

At 4:14 PM -0700 2001-07-15, Rod Van Meter wrote:
> >You can commit an individual write with the FUA (force unit access)
> >bit.  The command for this is not WRITE EXTENDED, but WRITE(10) or
> >WRITE(12).  I don't think WRITE(6) has room for the bit, and WRITE(6)
> >is useless nowadays, anyway.  WRITE EXTENDED lets you write over the
> >ECC bits -- it's a raw write to the platter.  Dunno that anyone
> >implements it any more.
> 
> WRITE EXTENDED is WRITE(10), I believe. The ECC-writing version is 
> WRITE LONG; IBM (at least) implements it.
> 

Whoops, you're right!  Brain fart.  We never used the term WRITE
EXTENDED; we always just called it WRITE(10) or WRITE(12).

	     --Rod

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 12:27                 ` Paul Jakma
  2001-07-14 14:48                   ` Chris Wedgwood
@ 2001-07-16 18:53                   ` Andreas Dilger
  2001-07-16 19:13                     ` Ragnar Kjørstad
  1 sibling, 1 reply; 71+ messages in thread
From: Andreas Dilger @ 2001-07-16 18:53 UTC (permalink / raw)
  To: Paul Jakma; +Cc: Andreas Dilger, linux-kernel

Paul Jakma writes:
> On Fri, 13 Jul 2001, Andreas Dilger wrote:
> > put your journal on NVRAM, you will have blazing synchronous I/O.
> 
> so ext3 supports having the journal somewhere else then. question: can
> the journal be on tmpfs?

There are patches for this (2.2 only, not 2.4) but it is not in the core
ext3 code yet.  The ext3 design and on-disk layout allow for it (and the
e2fsprogs have the basic support for it), so it will not be a major change
to start using external devices for journals.

If you are keen to do performance testing (on a temporary filesystem, for
sure), you can hack around the current lack of ext3 support for journal
devices by doing the following (works for reiserfs also) with LVM:

1) create a PV on NVRAM/SSD/ramdisk (needs hacks to LVM code to support the
   NVRAM device, or you can loopback mount the device ;-)  It should be big
   enough to hold the entire journal + a bit of overhead*
2) create a VG and LV on the ramdisk
3) create a PV on a regular disk, add it to the above VG
4) extend the LV with the new PV space**
5) create a 4kB blocksize ext2 filesystem on this LV
6) use "dumpe2fs <LV NAME>" to find the free blocks count in the first group
7) use "tune2fs -J size=<blocks * blocksize> <LV name>" to create the
   journal, where "blocks" <= number of free blocks in first group and
   also <= (number of blocks on NVRAM device - overhead*)

You _should_ have the journal on NVRAM now, along with the superblock and
all of the metadata for the first group.  This will also improve performance
as the superblock and group descriptor tables are hot spots as well.

Of course, once support for external journal devices is added to ext3, it
will simply be a matter of doing "tune2fs -J device=<NVRAM device>".

Cheers, Andreas
---------------
*) For ext3, you need enough extra space for the superblock, group descriptors,
   one block and inode bitmap, the first inode table, (and lost+found if
   you don't want to do extra work deleting lost+found before creating the
   journal, and re-creating it afterwards).  The output from "dumpe2fs"
   will tell you the number of inode blocks and group descriptor blocks.
   For reiserfs it is hard to tell exactly where the file will go, but if
   you had, say, a 64MB NVRAM device and a new filesystem, you could expect
   the journal to be put entirely on the NVRAM device.

**) The LV will have the NVRAM device as the first Logical Extent, so this
   will also be logically the first part of the filesystem.  The PEs added
   to the LV will be appended to the NVRAM device.
-- 
Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
                 \  would they cancel out, leaving him still hungry?"
http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-16 18:53                   ` Andreas Dilger
@ 2001-07-16 19:13                     ` Ragnar Kjørstad
  0 siblings, 0 replies; 71+ messages in thread
From: Ragnar Kjørstad @ 2001-07-16 19:13 UTC (permalink / raw)
  To: Paul Jakma; +Cc: Andreas Dilger, linux-kernel

> Cheers, Andreas
> ---------------
> *) For ext3, you need enough extra space for the superblock, group descriptors,
>    one block and inode bitmap, the first inode table, (and lost+found if
>    you don't want to do extra work deleting lost+found before creating the
>    journal, and re-creating it afterwards).  The output from "dumpe2fs"
>    will tell you the number of inode blocks and group descriptor blocks.
>    For reiserfs it is hard to tell exactly where the file will go, but if
>    you had, say, a 64MB NVRAM device and a new filesystem, you could expect
>    the journal to be put entirely on the NVRAM device.

You can use the LVM tools to see which extents are written most often
- I'm sure that after having used the filesystem a little it will be
clear which extents hold the journal (and then you can move them to
NVRAM).

For reiserfs, I believe you can now specify a separate device for your
journal and don't need LVM. Not sure if this code has entered the kernel
yet though - maybe you need a patch.


When doing your testing, you should be aware that the results will be
very much dependent on the device you use for the filesystem. One thing
is that if you use a slow IDE drive, the NVRAM-versus-disk performance
gap will be larger than if you used a fast SCSI drive. But more
importantly, if you use a high-end RAID, it will include NVRAM of its
own. So if you really want to know whether separate NVRAM makes sense
for your high-end server - don't test this on a regular disk and assume
the results will be the same.




-- 
Ragnar Kjorstad
Big Storage

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-15  6:07                           ` Chris Wedgwood
  2001-07-15 13:16                             ` Ken Hirsch
@ 2001-07-17  0:31                             ` Juan Quintela
  1 sibling, 0 replies; 71+ messages in thread
From: Juan Quintela @ 2001-07-17  0:31 UTC (permalink / raw)
  To: Chris Wedgwood
  Cc: John Alvord, Daniel Phillips, Alan Cox, Andrew Morton,
	Andreas Dilger, Albert D. Cahalan, Ben LaHaise, Ragnar Kjørstad,
	linux-fsdevel, linux-kernel, mike, kevin, linux-lvm

>>>>> "chris" == Chris Wedgwood <cw@f00f.org> writes:

chris> On Sat, Jul 14, 2001 at 11:05:36PM -0700, John Alvord wrote:
chris> In the IBM solution to this (1977-78, VM/CMS) the critical data was
chris> written at the beginning and the end of the block. If the two data items
chris> didn't match then the block was rejected.

chris> Neat.


chris> Simple and effective.  Presumably you can also checksum the block, and
chris> check that.

There is a rumor (I can't confirm it) that you need checksums because
some disks manage to write the beginning and the end of the sector
correctly but put garbage in the middle when there are problems.  I
have never been able to reproduce such errors, but ....

Later, Juan.


-- 
In theory, practice and theory are the same, but in practice they 
are different -- Larry McVoy

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-14 15:42                     ` Paul Jakma
  2001-07-14 17:18                       ` Chris Wedgwood
@ 2001-07-20 17:03                       ` Stephen C. Tweedie
  1 sibling, 0 replies; 71+ messages in thread
From: Stephen C. Tweedie @ 2001-07-20 17:03 UTC (permalink / raw)
  To: Paul Jakma; +Cc: Chris Wedgwood, Andreas Dilger, linux-kernel

Hi,

On Sat, Jul 14, 2001 at 04:42:04PM +0100, Paul Jakma wrote:
> 
> > *why* would you want to do this?
> 
> :)
> 
> to test performance advantage of journal on RAM before going to spend
> money on NVRAM...

Journaling to ramdisk has been tried, yes.  The result was faster than
ext2 doing the same jobs.  Of course, support for journaling to
external devices is still only really at the prototype stage.

Cheers,
 Stephen

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-16  1:08                           ` Albert D. Cahalan
  2001-07-16  8:49                             ` Chris Wedgwood
@ 2001-07-21 19:18                             ` Alexander Griesser
  2001-07-22  3:52                               ` Albert D. Cahalan
  1 sibling, 1 reply; 71+ messages in thread
From: Alexander Griesser @ 2001-07-21 19:18 UTC (permalink / raw)
  To: Albert D. Cahalan; +Cc: Daniel Phillips, cw, linux-kernel

On Sun, Jul 15, 2001 at 09:08:41PM -0400, you wrote:
> > The only requirement here is that the checksum be correct.  And sure,
> > that's not a hard guarantee because, on average, you will get a good
> > checksum for bad data once every 4 billion power events that mess up
> > the final superblock transfer.  Let me see, if that happens once a year,
> In a tree-structured filesystem, checksums on everything would only
> cost you space similar to the number of pointers you have. Whenever
> a non-leaf node points to a child, it can hold a checksum for that
> child as well.
> This gives a very reliable way to spot filesystem errors, including
> corrupt data blocks.

Hmm, maybe this is crap, but:
If the checksum calculation for one node fails, wouldn't that mean that
the data in this node is not to be trusted?  Therefore the checksums held
in this node could also be corrupted, and so the nodes two hops away
can't be validated with 100% certainty...

regards, alexx
-- 
|   .-.   | Alexander Griesser <tuxx@aon.at> -=- ICQ:63180135 |  .''`. |
|   /v\   |  http://www.tuxx-home.at -=- Linux Version 2.4.7  | : :' : |
| /(   )\ |  FAQ zu at.linux:  http://alfie.ist.org/LinuxFAQ  | `. `'  |
|  ^^ ^^  `---------------------------------------------------´   `-   |

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-21 19:18                             ` Alexander Griesser
@ 2001-07-22  3:52                               ` Albert D. Cahalan
  2001-07-23 14:41                                 ` Daniel Phillips
  0 siblings, 1 reply; 71+ messages in thread
From: Albert D. Cahalan @ 2001-07-22  3:52 UTC (permalink / raw)
  To: Alexander Griesser; +Cc: Albert D. Cahalan, Daniel Phillips, cw, linux-kernel

Alexander Griesser writes:
> On Sun, Jul 15, 2001 at 09:08:41PM -0400, you wrote:

>> In a tree-structured filesystem, checksums on everything would only
>> cost you space similar to the number of pointers you have. Whenever
>> a non-leaf node points to a child, it can hold a checksum for that
>> child as well.  This gives a very reliable way to spot filesystem
>> errors, including corrupt data blocks.
>
> Hmm, maybe this is crap, but: If the checksum-calculation for one
> node fails, wouldn't that mean, that the data in this node, is not
> to be trusted? therefore also the checksum of this node could be
> corrupted and so the node, 2 hops away, can't be validated with 100%
> certitude...

If I understand you right ("one"? "this"?), yes and we want that.

Node 1 has children 2, 3, and 4.
Node 3 has children 5, 6, and 7.
Node 6 has children 8, 9, and 10. (children might be data blocks)

To have a child is to have a checksum+pointer pair.

If node 3 contains a corrupt pointer to node 6, then it is unlikely
that the checksum will match. So node 6 is bad, along with 8, 9, and 10
(actually we might not even be able to tell that 8, 9, and 10 exist).
This result is wonderful, since it prevents interpreting random
disk blocks as useful data.

If node 3 contains a corrupt checksum for node 6, same thing. Damn.
This case should be rare, though: if node 3 has corruption, why would
node 1's checksum for node 3 still check out?

If node 6 itself is corrupt, same thing. Good, we are stopped from
using bad data.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-22  3:52                               ` Albert D. Cahalan
@ 2001-07-23 14:41                                 ` Daniel Phillips
  2001-07-24  4:29                                   ` Albert D. Cahalan
  0 siblings, 1 reply; 71+ messages in thread
From: Daniel Phillips @ 2001-07-23 14:41 UTC (permalink / raw)
  To: Albert D. Cahalan, Alexander Griesser
  Cc: Albert D. Cahalan, Daniel Phillips, cw, linux-kernel

On Sunday 22 July 2001 05:52, Albert D. Cahalan wrote:
> Alexander Griesser writes:
> > On Sun, Jul 15, 2001 at 09:08:41PM -0400, you wrote:
> >> In a tree-structured filesystem, checksums on everything would
> >> only cost you space similar to the number of pointers you have.
> >> Whenever a non-leaf node points to a child, it can hold a checksum
> >> for that child as well.  This gives a very reliable way to spot
> >> filesystem errors, including corrupt data blocks.
> >
> > Hmm, maybe this is crap, but: If the checksum-calculation for one
> > node fails, wouldn't that mean, that the data in this node, is not
> > to be trusted? therefore also the checksum of this node could be
> > corrupted and so the node, 2 hops away, can't be validated with
> > 100% certitude...
>
> If I understand you right ("one"? "this"?), yes and we want that.
>
> Node 1 has children 2, 3, and 4.
> Node 3 has children 5, 6, and 7.
> Node 6 has children 8, 9, and 10. (children might be data blocks)
>
> To have a child is to have a checksum+pointer pair.
>
> If node 3 contains a corrupt pointer to node 6, then it is unlikely
> that the checksum will match. So node 6 is bad, along 8, 9, and 10.
> (actually we might not be able to know that 8, 9, and 10 exist)
> This result is wonderful, since it prevents interpreting random
> disk blocks as useful data.
>
> If node 3 contains a corrupt checksum for node 6, same thing. Damn.
> This case should be rare, since why for node 1 have a checksum
> that is OK for node 3 if node 3 has corruption?
>
> If node 6 itself is corrupt, same thing. Good, we are stopped from
> using bad data.

I agree that your suggestion will work and that doubling the size of 
the metadata isn't an enormous cost, especially if you'd already 
compressed it using extents.  On the other hand, sometimes I just feel 
like trusting the hardware a little.  Both atomic-commit and 
journalling strategies take care of normal failure modes, and the disk 
hardware is supposed to flag other failures by ecc'ing each sector on 
disk.

--
Daniel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-23 14:41                                 ` Daniel Phillips
@ 2001-07-24  4:29                                   ` Albert D. Cahalan
  2001-07-24 11:45                                     ` Daniel Phillips
  0 siblings, 1 reply; 71+ messages in thread
From: Albert D. Cahalan @ 2001-07-24  4:29 UTC (permalink / raw)
  To: Daniel Phillips
  Cc: Albert D. Cahalan, Alexander Griesser, Daniel Phillips, cw, linux-kernel

Daniel Phillips writes:
> On Sunday 22 July 2001 05:52, Albert D. Cahalan wrote:
>> [...]
>>> On Sun, Jul 15, 2001 at 09:08:41PM -0400, you wrote:

>>>> In a tree-structured filesystem, checksums on everything would
>>>> only cost you space similar to the number of pointers you have.
>>>> Whenever a non-leaf node points to a child, it can hold a checksum
>>>> for that child as well.  This gives a very reliable way to spot
>>>> filesystem errors, including corrupt data blocks.
...
>> To have a child is to have a checksum+pointer pair.
...
> I agree that your suggestion will work and that doubling the size
> of the metadata isn't an enormous cost, especially if you'd already
> compressed it using extents.  On the other hand, sometimes I just
> feel like trusting the hardware a little.  Both atomic-commit and
> journalling strategies take care of normal failure modes, and the
> disk hardware is supposed to flag other failures by ecc'ing each
> sector on disk.

Maybe you should discuss power-loss behavior with Theodore Ts'o.
For whatever reason, it seems that many drives and/or controllers
like to scribble on random unrelated sectors as power is lost.

For the atomic-commit case, an additional defense against this
sort of problem might be to keep a few extra trees on disk,
using a generation counter to pick the latest one. This does
bring us back to scanning the whole filesystem at boot though,
in order to disregard snapshots that have been damaged.
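
A rough sketch of the generation-counter idea, purely for illustration
(the slot layout and the checksum are assumptions, not a proposal for
any particular filesystem):

#include <stdint.h>

#define NUM_ROOT_SLOTS 4

struct root_slot {
	uint64_t generation;	/* monotonically increasing commit counter */
	uint64_t tree_root;	/* block number of the root node of that tree */
	uint32_t csum;		/* checksum over generation and tree_root */
};

/* Toy checksum; anything stronger would do. */
static uint32_t slot_csum(const struct root_slot *s)
{
	return (uint32_t)(s->generation ^ (s->generation >> 32) ^
			  s->tree_root ^ (s->tree_root >> 32));
}

/*
 * Given the fixed set of root slots read from disk, return the index
 * of the newest one whose checksum still verifies, or -1 if every
 * snapshot is damaged.  Damaged slots are simply skipped, which is
 * the "disregard snapshots that have been damaged" step above.
 */
static int pick_latest_root(const struct root_slot slots[NUM_ROOT_SLOTS])
{
	int i, best = -1;

	for (i = 0; i < NUM_ROOT_SLOTS; i++) {
		if (slot_csum(&slots[i]) != slots[i].csum)
			continue;
		if (best < 0 || slots[i].generation > slots[best].generation)
			best = i;
	}
	return best;
}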



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-24  4:29                                   ` Albert D. Cahalan
@ 2001-07-24 11:45                                     ` Daniel Phillips
  0 siblings, 0 replies; 71+ messages in thread
From: Daniel Phillips @ 2001-07-24 11:45 UTC (permalink / raw)
  To: Albert D. Cahalan, Daniel Phillips
  Cc: Albert D. Cahalan, Alexander Griesser, Daniel Phillips, cw, linux-kernel

On Tuesday 24 July 2001 06:29, Albert D. Cahalan wrote:
> Daniel Phillips writes:
> > On Sunday 22 July 2001 05:52, Albert D. Cahalan wrote:
> >> [...]
> >>> On Sun, Jul 15, 2001 at 09:08:41PM -0400, you wrote:
> >>>> In a tree-structured filesystem, checksums on everything would
> >>>> only cost you space similar to the number of pointers you have.
> >>>> Whenever a non-leaf node points to a child, it can hold a
> >>>> checksum for that child as well.  This gives a very reliable way
> >>>> to spot filesystem errors, including corrupt data blocks.
> ...
> >> To have a child is to have a checksum+pointer pair.
> ...
> > I agree that your suggestion will work and that doubling the size
> > of the metadata isn't an enormous cost, especially if you'd already
> > compressed it using extents.  On the other hand, sometimes I just
> > feel like trusting the hardware a little.  Both atomic-commit and
> > journalling strategies take care of normal failure modes, and the
> > disk hardware is supposed to flag other failures by ecc'ing each
> > sector on disk.
>
> Maybe you should discuss power-loss behavior with Theodore T'so.
> For whatever reason, it seems that many drives and/or controllers
> like to scribble on random unrelated sectors as power is lost.

Last time we discussed this on lkml - I don't think Ted was involved
that time - the consensus was that only the last sector written is
in danger of being scribbled on.  (Sometimes, because of reordering,
we don't know which sector was written last, but that's another story.)  If
you have experience with any disk that scribbled on a sector other
than the last written, I'd really appreciate knowing the model and
manufacturer - so that I can stay far away from such a POS.

As for silently feeding you corrupted sectors - that's clearly a
firmware bug, or outright omission.  Again, the term POS applies.

> For the atomic-commit case, an additional defense against this
> sort of problem might be to keep a few extra trees on disk,
> using a generation counter to pick the latest one. This does
> bring us back to scanning the whole filesystem at boot though,
> in order to disregard snapshots that have been damaged.

Unfortunately, most of the blocks are shared between trees so this
doesn't provide any extra protection.  RAID, or some RAID-like
thing (a little birdie told me that something may be in the works)
is probably the way to go, for dealing with substandard hardware
that you can't avoid using or weren't warned about.

--
Daniel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-04  2:19   ` [PATCH] 64 bit scsi read/write Ben LaHaise
  2001-07-04  7:11     ` Alan Cox
  2001-07-05  6:34     ` Ragnar Kjørstad
@ 2001-07-26  2:18     ` Ragnar Kjørstad
  2001-07-26 16:24       ` Andreas Dilger
                         ` (3 more replies)
  2 siblings, 4 replies; 71+ messages in thread
From: Ragnar Kjørstad @ 2001-07-26  2:18 UTC (permalink / raw)
  To: Ben LaHaise; +Cc: linux-fsdevel, linux-kernel, mike, kevin

On Tue, Jul 03, 2001 at 10:19:36PM -0400, Ben LaHaise wrote:
> Here's the [completely untested] generic scsi fixup, but I'm told that
> some controllers will break with it.  Give it a whirl and let me know how
> many pieces you're left holding. =)  Please note that msdos partitions do
> *not* work on devices larger than 2TB, so you'll have to use the scsi disk
> directly.  This patch applies on top of v2.4.6-pre8-largeblock4.diff.

I just tried this, but I can't load the md modules because of
missing symbols for __udivdi3 and __umoddi3.

These are the messages from make install:
find kernel -path '*/pcmcia/*' -name '*.o' | xargs -i -r ln -sf ../{}
pcmcia
if [ -r System.map ]; then /sbin/depmod -ae -F System.map  2.4.6-pre8;
fi
depmod: *** Unresolved symbols in
/lib/modules/2.4.6-pre8/kernel/drivers/md/linear.o
depmod: 	__udivdi3
depmod: 	__umoddi3
depmod: *** Unresolved symbols in
/lib/modules/2.4.6-pre8/kernel/drivers/md/lvm-mod.o
depmod: 	__udivdi3
depmod: 	__umoddi3
depmod: *** Unresolved symbols in
/lib/modules/2.4.6-pre8/kernel/drivers/md/md.o
depmod: 	__udivdi3
depmod: *** Unresolved symbols in
/lib/modules/2.4.6-pre8/kernel/drivers/md/raid0.o
depmod: 	__udivdi3
depmod: 	__umoddi3
depmod: *** Unresolved symbols in
/lib/modules/2.4.6-pre8/kernel/drivers/md/raid5.o
depmod: 	__udivdi3
depmod: 	__umoddi3


Did you forget something in your patch, or was it not supposed to work
on ia32?

This is kind of urgent, because I will temporarily be without testing
equipment pretty soon. Tips are appreciated!



-- 
Ragnar Kjorstad
Big Storage

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-26  2:18     ` Ragnar Kjørstad
@ 2001-07-26 16:24       ` Andreas Dilger
  2001-08-10 19:42       ` Ben LaHaise
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 71+ messages in thread
From: Andreas Dilger @ 2001-07-26 16:24 UTC (permalink / raw)
  To: Ragnar Kjørstad
  Cc: Ben LaHaise, linux-fsdevel, linux-kernel, mike, kevin

Ragnar writes:
> These are the messages from make install:
> depmod: *** Unresolved symbols in
> /lib/modules/2.4.6-pre8/kernel/drivers/md/lvm-mod.o
> /lib/modules/2.4.6-pre8/kernel/drivers/md/linear.o
> /lib/modules/2.4.6-pre8/kernel/drivers/md/md.o
> /lib/modules/2.4.6-pre8/kernel/drivers/md/raid0.o
> /lib/modules/2.4.6-pre8/kernel/drivers/md/raid5.o
> depmod: 	__udivdi3
> depmod: 	__umoddi3

These drivers do division and/or modulus on block numbers, instead of
doing shift/mask.  Someone has to fix them.
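
Something along these lines (untested sketch; 'blkoff_t' stands in for
the 64-bit block type from the largeblock patch, and the chunk size is
assumed to be a power of two, as md chunk sizes are):

typedef unsigned long long blkoff_t;	/* placeholder for the patch's type */

/* gcc emits calls to __udivdi3/__umoddi3 for these on ia32: */
static inline blkoff_t chunk_index_slow(blkoff_t block, unsigned int chunk_blocks)
{
	return block / chunk_blocks;
}

/* shift/mask versions: no libgcc helpers needed */
static inline blkoff_t chunk_index(blkoff_t block, unsigned int chunk_shift)
{
	return block >> chunk_shift;
}

static inline unsigned int chunk_offset(blkoff_t block, unsigned int chunk_shift)
{
	return (unsigned int)(block & ((1U << chunk_shift) - 1));
}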

Cheers, Andreas
-- 
Andreas Dilger                               Turbolinux filesystem development
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-26  2:18     ` Ragnar Kjørstad
  2001-07-26 16:24       ` Andreas Dilger
@ 2001-08-10 19:42       ` Ben LaHaise
  2001-08-10 19:51       ` Ragnar Kjørstad
  2001-08-11 21:44       ` Matti Aarnio
  3 siblings, 0 replies; 71+ messages in thread
From: Ben LaHaise @ 2001-08-10 19:42 UTC (permalink / raw)
  To: Ragnar Kjørstad; +Cc: linux-fsdevel, linux-kernel, mike, kevin

On Thu, 26 Jul 2001, Ragnar Kjørstad wrote:

> Did you forget something in your patch, or was it not supposed to work
> on ia32?
>
> This is kind of urgent, because I will temporarily be without testing
> equipment pretty soon. Tips are appreciated!

Please try it without a modular kernel.

		-ben


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-26  2:18     ` Ragnar Kjørstad
  2001-07-26 16:24       ` Andreas Dilger
  2001-08-10 19:42       ` Ben LaHaise
@ 2001-08-10 19:51       ` Ragnar Kjørstad
  2001-08-10 20:02         ` Ben LaHaise
  2001-08-11 21:44       ` Matti Aarnio
  3 siblings, 1 reply; 71+ messages in thread
From: Ragnar Kjørstad @ 2001-08-10 19:51 UTC (permalink / raw)
  To: Ben LaHaise; +Cc: linux-fsdevel, linux-kernel, mike, kevin

On Thu, Jul 26, 2001 at 04:18:21AM +0200, Ragnar Kjørstad wrote:
> On Tue, Jul 03, 2001 at 10:19:36PM -0400, Ben LaHaise wrote:
> > Here's the [completely untested] generic scsi fixup, but I'm told that
> > some controllers will break with it.  Give it a whirl and let me know how
> > many pieces you're left holding. =)  Please note that msdos partitions do
> > *not* work on devices larger than 2TB, so you'll have to use the scsi disk
> > directly.  This patch applies on top of v2.4.6-pre8-largeblock4.diff.
> 
> I just tried this, but I can't load the md modules because of
> missing symbols for __udivdi3 and __umoddi3.

I compiled md and lvm into the kernel rather than modules and got a
little further:

* raid 0 over 4*600GB devices:
  * made filesystem
  * tried reading from the end of the device (dd skip=xx)
  all tests successful

* >1TB devices over scsi.
  * /proc/partitions report incorrect sizes
    [root@K2 /root]# cat /proc/partitions 
    major minor  #blocks  name
       8     0   17921835 sda
       8     1      56196 sda1
       8     2          1 sda2
       8     5   13076878 sda5
       8     6     530113 sda6
       8    16 9223372035816620928 sdb
       8    32 9223372035975108096 sdc
  * mkreiserfs fails: "mkreiserfs: can not create filesystem on that
    small device (0 blocks)."
  * mkfs.xfs fails: "warning - cannot set blocksize on block device
    /dev/sdb: Invalid argument"
  I assume both mkreiserfs and mkfs.xfs use ioctl to get the size
  of the device, and that ioctl uses an unsigned int? How is 
  userspace supposed to get the device size of >2TB devices with
  your code?
  * mkfs.ext2 makes the machine panic after a while.
    Unfortunately I don't have the panic message anymore, and at the
    moment I don't have the hardware to redo the test.
  * fdisk bails out with 'Unable to read /dev/sdb'
    Strace shows:
    open("/dev/sdb", O_RDWR)                = 3
    uname({sys="Linux", node="K2.torque.com", ...}) = 0
    ioctl(3, 0x1268, 0xbffff8f4)            = 0
    fstat64(3, {st_mode=S_IFBLK|0660, st_rdev=makedev(8, 16), ...}) = 0
    ioctl(3, BLKGETSIZE, 0xbffff924)        = 0
    ioctl(3, HDIO_GETGEO, 0xbffff918)       = 0
    read(3, "", 512)                        = 0



-- 
Ragnar Kjorstad
Big Storage

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-08-10 19:51       ` Ragnar Kjørstad
@ 2001-08-10 20:02         ` Ben LaHaise
  2001-08-11  0:18           ` Steve Lord
  0 siblings, 1 reply; 71+ messages in thread
From: Ben LaHaise @ 2001-08-10 20:02 UTC (permalink / raw)
  To: Ragnar Kjørstad; +Cc: linux-fsdevel, linux-kernel, mike, kevin

On Fri, 10 Aug 2001, Ragnar Kjørstad wrote:

> * >1TB devices over scsi.
>   * /proc/partitions report incorrect sizes

Okay, interesting, I'll have to dig through that.

>   * mkreiserfs fails: "mkreiserfs: can not create filesystem on that
>     small device (0 blocks)."
>   * mkfs.xfs fails: "warning - cannot set blocksize on block device
>     /dev/sdb: Invalid argument"

Someone needs to patch reiserfs/xfs.

>   I assume both mkreiserfs and mkfs.xfs use ioctl to get the size
>   of the device, and that ioctl uses an unsigned int? How is
>   userspace supposed to get the device size of >2TB devices with
>   your code?

See the e2fsprogs patch (again, below).
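
Roughly, the idea is a 64-bit getsize ioctl.  A userspace sketch - not
the actual e2fsprogs patch, and assuming a BLKGETSIZE64-style ioctl
that returns the size in bytes - would look like this:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKGETSIZE, BLKGETSIZE64 */

static int get_device_size(const char *dev, unsigned long long *bytes)
{
	unsigned long sectors;
	int fd = open(dev, O_RDONLY);

	if (fd < 0)
		return -1;
#ifdef BLKGETSIZE64
	if (ioctl(fd, BLKGETSIZE64, bytes) == 0) {
		close(fd);
		return 0;
	}
#endif
	/* Old interface: 32-bit count of 512-byte sectors, overflows at 2TB. */
	if (ioctl(fd, BLKGETSIZE, &sectors) == 0) {
		*bytes = (unsigned long long)sectors * 512;
		close(fd);
		return 0;
	}
	close(fd);
	return -1;
}

int main(int argc, char **argv)
{
	unsigned long long size;

	if (argc > 1 && get_device_size(argv[1], &size) == 0)
		printf("%s: %llu bytes\n", argv[1], size);
	return 0;
}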

>   * mkfs.ext2 makes the machine panic after a while.
>     Unfortunately I don't have the panic message anymore, and at the
>     moment I don't have the hardware to redo the test.

That would've been a useful bug report.

>   * fdisk bails out with 'Unable to read /dev/sdb'

MS-DOS partitions do not work on huge devices, so at best we can make it
report a more informative message.

The amount of response I've received is absolutely dismal for a feature
lots of people are clamouring on about needing.  At this rate, I doubt
we'll have any of it decently tested before we start advertising it as a
supported feature in 2.6.

		-ben

-- 
"The world would be a better place if Larry Wall had been born in
Iceland, or any other country where the native language actually
has syntax" -- Peter da Silva


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-08-10 20:02         ` Ben LaHaise
@ 2001-08-11  0:18           ` Steve Lord
  0 siblings, 0 replies; 71+ messages in thread
From: Steve Lord @ 2001-08-11  0:18 UTC (permalink / raw)
  To: Ben LaHaise
  Cc: Ragnar Kjørstad, linux-fsdevel, linux-kernel, mike, kevin

> On Fri, 10 Aug 2001, Ragnar Kjxrstad wrote:
> 
> > * >1TB devices over scsi.
> >   * /proc/partitions report incorrect sizes
> 
> Okay, interesting, I'll have to dig through that.
> 
> >   * mkreiserfs fails: "mkreiserfs: can not create filesystem on that
> >     small device (0 blocks)."
> >   * mkfs.xfs fails: "warning - cannot set blocksize on block device
> >     /dev/sdb: Invalid argument"
> 
> Someone needs to patch reiserfs/xfs.

XFS should continue with the mkfs after this; it is a 'warning', not an
error. We do use the BLKGETSIZE ioctl to get the device size. The function
which needs modifying is findsize() in cmd/xfsprogs/libxfs/init.c

Steve

> 
> >   I assume both mkreiserfs and mkfs.xfs use ioctl to get the size
> >   of the device, and that ioctl uses an unsigned int? How is
> >   userspace supposed to get the device size of >2TB devices with
> >   your code?
> 
> See the e2fsprogs patch (again, below).
> 
> >   * mkfs.ext2 makes the machine panic after a while.
> >     Unfortunately I don't have the panic message anymore, and at the
> >     moment I don't have the hardware to redo the test.
> 
> That would've been a useful bug report.
> 
> >   * fdisk bails out with 'Unable to read /dev/sdb'
> 
> MS-DOS partitions do not work on huge devices, so at best we can make it
> report a more informative message.
> 
> The amount of response I've received is absolutely dismal for a feature
> lots of people are clamouring on about needing.  At this rate, I doubt
> we'll have any of it decently tested before we start advertising it as a
> supported feature in 2.6.
> 
> 		-ben
> 
> -- 
> "The world would be a better place if Larry Wall had been born in
> Iceland, or any other country where the native language actually
> has syntax" -- Peter da Silva
> 



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
  2001-07-26  2:18     ` Ragnar Kjørstad
                         ` (2 preceding siblings ...)
  2001-08-10 19:51       ` Ragnar Kjørstad
@ 2001-08-11 21:44       ` Matti Aarnio
  3 siblings, 0 replies; 71+ messages in thread
From: Matti Aarnio @ 2001-08-11 21:44 UTC (permalink / raw)
  To: Ragnar Kjørstad
  Cc: Ben LaHaise, linux-fsdevel, linux-kernel, mike, kevin

On Thu, Jul 26, 2001 at 04:18:21AM +0200, Ragnar Kjørstad wrote:
> On Tue, Jul 03, 2001 at 10:19:36PM -0400, Ben LaHaise wrote:
> > Here's the [completely untested] generic scsi fixup, but I'm told that
> > some controllers will break with it.  Give it a whirl and let me know how
> > many pieces you're left holding. =)  Please note that msdos partitions do
> > *not* work on devices larger than 2TB, so you'll have to use the scsi disk
> > directly.  This patch applies on top of v2.4.6-pre8-largeblock4.diff.
> 
> I just tried this, but I can't load the md modules because of
> missing symbols for __udivdi3 and __umoddi3.

   This should, most likely, be a linux-kernel FAQ item -- it might even be
   there already.

   You need to develop an additional patch to the MD module code so
   that it won't do careless arbitrary divisions of the long64 / int32 kind.
   It will be more efficient to do that with suitable shifts, after all.
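
   Where a real division can't be avoided (divisor not a power of two),
   the kernel's do_div() from <asm/div64.h> does a 64/32 division without
   pulling in the libgcc helpers.  A sketch, names illustrative only:

#include <asm/div64.h>

static unsigned long map_to_stripe(unsigned long long block,
				   unsigned int stripe_blocks,
				   unsigned int *offset_in_stripe)
{
	/* do_div() replaces 'block' with the quotient and returns the remainder. */
	*offset_in_stripe = do_div(block, stripe_blocks);
	return (unsigned long)block;
}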

/Matti Aarnio

> /lib/modules/2.4.6-pre8/kernel/drivers/md/linear.o
> depmod: 	__udivdi3
> depmod: 	__umoddi3
> depmod: *** Unresolved symbols in
> /lib/modules/2.4.6-pre8/kernel/drivers/md/lvm-mod.o
> depmod: 	__udivdi3
> depmod: 	__umoddi3
> depmod: *** Unresolved symbols in
> /lib/modules/2.4.6-pre8/kernel/drivers/md/md.o
> depmod: 	__udivdi3
> depmod: *** Unresolved symbols in
> /lib/modules/2.4.6-pre8/kernel/drivers/md/raid0.o
> depmod: 	__udivdi3
> depmod: 	__umoddi3
> depmod: *** Unresolved symbols in
> /lib/modules/2.4.6-pre8/kernel/drivers/md/raid5.o
> depmod: 	__udivdi3
> depmod: 	__umoddi3
> 
> 
> Did you forget something in your patch, or was it not supposed to work
> on ia32?
> 
> This is kind of urgent, because I will temporarily be without testing
> equipment pretty soon. Tips are appreciated!
> 
> 
> 
> -- 
> Ragnar Kjorstad
> Big Storage

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit SCSI read/write
@ 2001-07-19  7:35 Andre Hedrick
  0 siblings, 0 replies; 71+ messages in thread
From: Andre Hedrick @ 2001-07-19  7:35 UTC (permalink / raw)
  To: linux-kernel


Well, to answer AC's question: we changed the rules in ATA/ATAPI-6 such
that all ATA devices (hard drives) are required to support the write/flush
cache commands.
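
For reference, a userspace sketch of issuing that flush via the
HDIO_DRIVE_CMD ioctl (0xE7 is the ATA FLUSH CACHE opcode; treat this
as an illustration, not a tested tool):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hdreg.h>	/* HDIO_DRIVE_CMD */

#define ATA_FLUSH_CACHE 0xE7

int main(int argc, char **argv)
{
	/* hdparm-style argument block: command, sector, feature, count */
	unsigned char args[4] = { ATA_FLUSH_CACHE, 0, 0, 0 };
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	if (ioctl(fd, HDIO_DRIVE_CMD, args))
		perror("HDIO_DRIVE_CMD(flush cache)");
	close(fd);
	return 0;
}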


Andre Hedrick
Linux ATA Development
ASL Kernel Development
-----------------------------------------------------------------------------
ASL, Inc.                                     Toll free: 1-877-ASL-3535
1757 Houret Court                             Fax: 1-408-941-2071
Milpitas, CA 95035                            Web: www.aslab.com


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [PATCH] 64 bit scsi read/write
@ 2001-07-14 15:08 Ed Tomlinson
  0 siblings, 0 replies; 71+ messages in thread
From: Ed Tomlinson @ 2001-07-14 15:08 UTC (permalink / raw)
  To: linux-kernel

On Fri, 13 Jul 2001, Paul Jakma wrote:
>On Fri, 13 Jul 2001, Andreas Dilger wrote:

>> put your journal on NVRAM, you will have blazing synchronous I/O.

>so ext3 supports having the journal somewhere else then. question: can
>the journal be on tmpfs?

Why would you want to?  You _need_ the journal after a crash to recover
without an fsck - if it's on tmpfs you are SOL...

Ed Tomlinson

^ permalink raw reply	[flat|nested] 71+ messages in thread

end of thread, other threads:[~2001-08-11 21:44 UTC | newest]

Thread overview: 71+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-07-01  4:53 [RFC][PATCH] first cut 64 bit block support Ben LaHaise
2001-07-03  4:53 ` Ragnar Kjørstad
2001-07-04  2:19   ` [PATCH] 64 bit scsi read/write Ben LaHaise
2001-07-04  7:11     ` Alan Cox
2001-07-05  6:34     ` Ragnar Kjørstad
2001-07-05  7:35       ` Ben LaHaise
2001-07-13 18:20         ` Albert D. Cahalan
2001-07-13 20:41           ` Andreas Dilger
2001-07-13 21:07             ` Chris Wedgwood
2001-07-13 22:04               ` Andreas Dilger
2001-07-14  0:49                 ` Jonathan Lundell
2001-07-14 12:27                 ` Paul Jakma
2001-07-14 14:48                   ` Chris Wedgwood
2001-07-14 15:42                     ` Paul Jakma
2001-07-14 17:18                       ` Chris Wedgwood
2001-07-20 17:03                       ` Stephen C. Tweedie
2001-07-16 18:53                   ` Andreas Dilger
2001-07-16 19:13                     ` Ragnar Kjørstad
2001-07-13 21:14             ` Alan Cox
2001-07-14  3:23               ` Andrew Morton
2001-07-14  8:45                 ` Alan Cox
2001-07-14 14:50                   ` Chris Wedgwood
2001-07-14 20:11                     ` Daniel Phillips
2001-07-15  1:21                       ` Andrew Morton
2001-07-15  1:53                         ` Daniel Phillips
2001-07-15  3:36                       ` Chris Wedgwood
2001-07-15  6:05                         ` John Alvord
2001-07-15  6:07                           ` Chris Wedgwood
2001-07-15 13:16                             ` Ken Hirsch
2001-07-15 14:50                               ` Chris Wedgwood
2001-07-15 22:14                               ` Daniel Phillips
2001-07-17  0:31                             ` Juan Quintela
2001-07-15 13:44                         ` Daniel Phillips
2001-07-15 14:39                           ` Chris Wedgwood
2001-07-15 15:32                             ` Alan Cox
2001-07-15 15:33                               ` Chris Wedgwood
2001-07-15 16:24                               ` Chris Wedgwood
2001-07-15 15:06                           ` Jonathan Lundell
2001-07-15 15:22                             ` Chris Wedgwood
2001-07-15 17:44                             ` Jonathan Lundell
2001-07-15 17:47                             ` Justin T. Gibbs
2001-07-15 23:14                               ` Rod Van Meter
2001-07-16  0:37                                 ` Jonathan Lundell
2001-07-16 15:11                                   ` Rod Van Meter
2001-07-16  8:56                               ` Chris Wedgwood
2001-07-16 13:19                                 ` Daniel Phillips
2001-07-16  1:08                           ` Albert D. Cahalan
2001-07-16  8:49                             ` Chris Wedgwood
2001-07-21 19:18                             ` Alexander Griesser
2001-07-22  3:52                               ` Albert D. Cahalan
2001-07-23 14:41                                 ` Daniel Phillips
2001-07-24  4:29                                   ` Albert D. Cahalan
2001-07-24 11:45                                     ` Daniel Phillips
2001-07-14 15:41                   ` Jonathan Lundell
2001-07-14 17:00                     ` Chris Wedgwood
2001-07-14 17:33                   ` Jonathan Lundell
2001-07-15  4:02                     ` Chris Wedgwood
2001-07-15  5:46                     ` Jonathan Lundell
2001-07-15 17:10                   ` Chris Wedgwood
2001-07-15 17:39                   ` Jonathan Lundell
2001-07-26  2:18     ` Ragnar Kjørstad
2001-07-26 16:24       ` Andreas Dilger
2001-08-10 19:42       ` Ben LaHaise
2001-08-10 19:51       ` Ragnar Kjørstad
2001-08-10 20:02         ` Ben LaHaise
2001-08-11  0:18           ` Steve Lord
2001-08-11 21:44       ` Matti Aarnio
2001-07-04 10:16 ` [RFC][PATCH] first cut 64 bit block support Chris Wedgwood
2001-07-04 16:59   ` Ben LaHaise
2001-07-14 15:08 [PATCH] 64 bit scsi read/write Ed Tomlinson
2001-07-19  7:35 [PATCH] 64 bit SCSI read/write Andre Hedrick
