* [RFC] [PATCH] 2.5.35 patch for making DIO async
@ 2002-09-17 21:03 Badari Pulavarty
  2002-09-18 11:51 ` Benjamin LaHaise
  0 siblings, 1 reply; 8+ messages in thread
From: Badari Pulavarty @ 2002-09-17 21:03 UTC (permalink / raw)
  To: linux-kernel, linux-aio; +Cc: pbadari

Hi Ben,

Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
wait for io completion. Waiting will be done at the higher level
(same way generic_file_read does).

Here is what I did:

1) pass kiocb *iocb to DIO
2) allocated a "dio" (instead of using the stack)
3) after submitting bios, the dio code returns with -EIOCBQUEUED
4) removed dio_bio_end_io(), dio_await_one(), dio_await_completion()
5) added dio_bio_end_aio(), which calls dio_bio_complete() for
   each bio and, once dio->bio_count drops to 0, calls aio_complete()
6) changed the raw_read/raw_write/.. routines to pass a sync kiocb and wait
   on it.
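
For reference, here is a minimal userspace sketch (not part of the patch)
of how the async path would be driven once the aio file operations are
wired up. It assumes the libaio userspace wrappers of this era (io_setup,
io_prep_pread, io_submit, io_getevents; link with -laio); the raw device
path and the sizes are just placeholders.

#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BUF_SIZE	(64 * 1024)

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd;

	/* raw device I/O wants sector-aligned buffers */
	if (posix_memalign(&buf, 512, BUF_SIZE))
		return 1;

	fd = open("/dev/raw/raw1", O_RDONLY);	/* placeholder device */
	if (fd < 0 || io_setup(1, &ctx) < 0)
		return 1;

	/* queue one direct read at offset 0; io_submit() returns as soon
	 * as the bios are in flight (the DIO code returns -EIOCBQUEUED) */
	io_prep_pread(&cb, fd, buf, BUF_SIZE, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	/* ... other work can overlap with the in-flight I/O here ... */

	/* reap the completion delivered by aio_complete() */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	printf("read returned %ld\n", (long)ev.res);

	io_destroy(ctx);
	close(fd);
	free(buf);
	return 0;
}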

Any feedback on approach/patch is appreciated.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.35/drivers/char/raw.c linux-2.5.35.aio/drivers/char/raw.c
--- linux-2.5.35/drivers/char/raw.c	Sun Sep 15 19:18:37 2002
+++ linux-2.5.35.aio/drivers/char/raw.c	Tue Sep 17 09:00:39 2002
@@ -201,8 +201,9 @@
 }
 
 static ssize_t
-rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+rw_raw_aio_dev(int rw, struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
 {
+	struct file *filp = iocb->ki_filp;
 	const int minor = minor(filp->f_dentry->d_inode->i_rdev);
 	struct block_device *bdev = raw_devices[minor].binding;
 	struct inode *inode = bdev->bd_inode;
@@ -222,7 +223,7 @@
 		count = inode->i_size - *offp;
 		nr_segs = iov_shorten((struct iovec *)iov, nr_segs, count);
 	}
-	ret = generic_file_direct_IO(rw, inode, iov, *offp, nr_segs);
+	ret = generic_file_direct_IO(rw, iocb, inode, iov, *offp, nr_segs);
 
 	if (ret > 0)
 		*offp += ret;
@@ -231,6 +232,19 @@
 }
 
 static ssize_t
+rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, filp);
+	ret = rw_raw_aio_dev(rw, &kiocb, iov, nr_segs, offp);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&kiocb);
+	return ret;
+}
+
+static ssize_t
 raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
 {
 	struct iovec local_iov = { .iov_base = buf, .iov_len = size};
@@ -246,6 +260,21 @@
 	return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
 }
 
+static ssize_t
+raw_aio_read(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+	struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+	return rw_raw_aio_dev(READ, iocb, &local_iov, 1, offp);
+}
+
+static ssize_t
+raw_aio_write(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+	struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+	return rw_raw_aio_dev(WRITE, iocb, &local_iov, 1, offp);
+}
 static ssize_t 
 raw_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp) 
 {
@@ -260,7 +289,9 @@
 
 static struct file_operations raw_fops = {
 	.read	=	raw_read,
+	.aio_read=	raw_aio_read,
 	.write	=	raw_write,
+	.aio_write=	raw_aio_write,
 	.open	=	raw_open,
 	.release=	raw_release,
 	.ioctl	=	raw_ioctl,
diff -Naur -X dontdiff linux-2.5.35/fs/block_dev.c linux-2.5.35.aio/fs/block_dev.c
--- linux-2.5.35/fs/block_dev.c	Sun Sep 15 19:19:09 2002
+++ linux-2.5.35.aio/fs/block_dev.c	Tue Sep 17 09:00:26 2002
@@ -116,10 +116,10 @@
 }
 
 static int
-blkdev_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
-			loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct inode * inode, 
+		const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
-	return generic_direct_IO(rw, inode, iov, offset,
+	return generic_direct_IO(rw, iocb, inode, iov, offset,
 				nr_segs, blkdev_get_blocks);
 }
 
diff -Naur -X dontdiff linux-2.5.35/fs/direct-io.c linux-2.5.35.aio/fs/direct-io.c
--- linux-2.5.35/fs/direct-io.c	Sun Sep 15 19:18:30 2002
+++ linux-2.5.35.aio/fs/direct-io.c	Tue Sep 17 09:48:16 2002
@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/buffer_head.h>
 #include <linux/rwsem.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 
 /*
@@ -68,6 +69,8 @@
 	spinlock_t bio_list_lock;	/* protects bio_list */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
+	struct kiocb *iocb;		/* iocb ptr */
+	int results;
 };
 
 /*
@@ -145,25 +148,41 @@
 }
 
 /*
- * The BIO completion handler simply queues the BIO up for the process-context
- * handler.
- *
- * During I/O bi_private points at the dio.  After I/O, bi_private is used to
- * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ * Process one completed BIO.  No locks are held.
  */
-static void dio_bio_end_io(struct bio *bio)
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int page_no;
+
+	for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+		struct page *page = bvec[page_no].bv_page;
+
+		if (dio->rw == READ)
+			set_page_dirty(page);
+		page_cache_release(page);
+	}
+	atomic_dec(&dio->bio_count);
+	bio_put(bio);
+	return uptodate ? 0 : -EIO;
+}
+
+static void dio_bio_end_aio(struct bio *bio)
+{
+	int ret;
 	struct dio *dio = bio->bi_private;
-	unsigned long flags;
+	ret = dio_bio_complete(dio, bio);
+	if (ret)
+		dio->results = ret;
 
-	spin_lock_irqsave(&dio->bio_list_lock, flags);
-	bio->bi_private = dio->bio_list;
-	dio->bio_list = bio;
-	if (dio->waiter)
-		wake_up_process(dio->waiter);
-	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+	if (atomic_read(&dio->bio_count) == 0) {
+		aio_complete(dio->iocb, dio->results, 0);
+		kfree(dio);
+	}
 }
 
+
 static int
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 		sector_t first_sector, int nr_vecs)
@@ -180,7 +199,7 @@
 	bio->bi_size = 0;
 	bio->bi_sector = first_sector;
 	bio->bi_io_vec[0].bv_page = NULL;
-	bio->bi_end_io = dio_bio_end_io;
+	bio->bi_end_io = dio_bio_end_aio;
 
 	dio->bio = bio;
 	dio->bvec = NULL;		/* debug */
@@ -212,75 +231,6 @@
 }
 
 /*
- * Wait for the next BIO to complete.  Remove it and return it.
- */
-static struct bio *dio_await_one(struct dio *dio)
-{
-	unsigned long flags;
-	struct bio *bio;
-
-	spin_lock_irqsave(&dio->bio_list_lock, flags);
-	while (dio->bio_list == NULL) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (dio->bio_list == NULL) {
-			dio->waiter = current;
-			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-			blk_run_queues();
-			schedule();
-			spin_lock_irqsave(&dio->bio_list_lock, flags);
-			dio->waiter = NULL;
-		}
-		set_current_state(TASK_RUNNING);
-	}
-	bio = dio->bio_list;
-	dio->bio_list = bio->bi_private;
-	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-	return bio;
-}
-
-/*
- * Process one completed BIO.  No locks are held.
- */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
-{
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec;
-	int page_no;
-
-	for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-		struct page *page = bvec[page_no].bv_page;
-
-		if (dio->rw == READ)
-			set_page_dirty(page);
-		page_cache_release(page);
-	}
-	atomic_dec(&dio->bio_count);
-	bio_put(bio);
-	return uptodate ? 0 : -EIO;
-}
-
-/*
- * Wait on and process all in-flight BIOs.
- */
-static int dio_await_completion(struct dio *dio)
-{
-	int ret = 0;
-
-	if (dio->bio)
-		dio_bio_submit(dio);
-
-	while (atomic_read(&dio->bio_count)) {
-		struct bio *bio = dio_await_one(dio);
-		int ret2;
-
-		ret2 = dio_bio_complete(dio, bio);
-		if (ret == 0)
-			ret = ret2;
-	}
-	return ret;
-}
-
-/*
  * A really large O_DIRECT read or write can generate a lot of BIOs.  So
  * to keep the memory consumption sane we periodically reap any completed BIOs
  * during the BIO generation phase.
@@ -528,77 +478,83 @@
 }
 
 int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, 
-	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode * inode, 
+	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
+	get_blocks_t get_blocks)
 {
 	const unsigned blkbits = inode->i_blkbits;
 	unsigned long user_addr; 
-	int seg, ret2, ret = 0;
-	struct dio dio;
-	size_t bytes, tot_bytes = 0;
-
-	dio.bio = NULL;
-	dio.bvec = NULL;
-	dio.inode = inode;
-	dio.rw = rw;
-	dio.blkbits = blkbits;
-	dio.block_in_file = offset >> blkbits;
-	dio.blocks_available = 0;
-
-	dio.boundary = 0;
-	dio.reap_counter = 0;
-	dio.get_blocks = get_blocks;
-	dio.last_block_in_bio = -1;
-	dio.next_block_in_bio = -1;
+	int seg, ret = 0;
+	struct dio *dio;
+	size_t bytes;
+
+	dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->bio = NULL;
+	dio->bvec = NULL;
+	dio->inode = inode;
+	dio->iocb = iocb;
+	dio->rw = rw;
+	dio->blkbits = blkbits;
+	dio->block_in_file = offset >> blkbits;
+	dio->blocks_available = 0;
 
-	dio.page_errors = 0;
+	dio->results = 0;
+	dio->boundary = 0;
+	dio->reap_counter = 0;
+	dio->get_blocks = get_blocks;
+	dio->last_block_in_bio = -1;
+	dio->next_block_in_bio = -1;
+
+	dio->page_errors = 0;
 
 	/* BIO completion state */
-	atomic_set(&dio.bio_count, 0);
-	spin_lock_init(&dio.bio_list_lock);
-	dio.bio_list = NULL;
-	dio.waiter = NULL;
+	atomic_set(&dio->bio_count, 0);
+	spin_lock_init(&dio->bio_list_lock);
+	dio->bio_list = NULL;
+	dio->waiter = NULL;
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		bytes = iov[seg].iov_len;
 
 		/* Index into the first page of the first block */
-		dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
-		dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+		dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+		dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
 		/* Page fetching state */
-		dio.head = 0;
-		dio.tail = 0;
-		dio.curr_page = 0;
+		dio->head = 0;
+		dio->tail = 0;
+		dio->curr_page = 0;
 
-		dio.total_pages = 0;
+		dio->total_pages = 0;
 		if (user_addr & (PAGE_SIZE-1)) {
-			dio.total_pages++;
+			dio->total_pages++;
 			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
 		}
-		dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio.curr_user_address = user_addr;
+		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		dio->curr_user_address = user_addr;
 	
-		ret = do_direct_IO(&dio);
+		ret = do_direct_IO(dio);
 
 		if (ret) {
-			dio_cleanup(&dio);
+			dio_cleanup(dio);
 			break;
 		}
 
-		tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
-					dio.block_in_file) << blkbits);
+		dio->results += iov[seg].iov_len - ((dio->final_block_in_request -
+					dio->block_in_file) << blkbits);
 
 	} /* end iovec loop */
 
-	ret2 = dio_await_completion(&dio);
-	if (ret == 0)
-		ret = ret2;
+	if (dio->bio)
+                dio_bio_submit(dio);
+
 	if (ret == 0)
-		ret = dio.page_errors;
+		ret = dio->page_errors;
 	if (ret == 0)
-		ret = tot_bytes; 
-
+		ret = -EIOCBQUEUED;
 	return ret;
 }
 
@@ -606,8 +562,9 @@
  * This is a library function for use by filesystem drivers.
  */
 int
-generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
-	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+generic_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
+	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
+	get_blocks_t get_blocks)
 {
 	int seg;
 	size_t size;
@@ -636,19 +593,21 @@
 			goto out;
 	}
 
-	retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
+	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
+				nr_segs, get_blocks);
 out:
 	return retval;
 }
 
 ssize_t
-generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
-	loff_t offset, unsigned long nr_segs)
+generic_file_direct_IO(int rw, struct  kiocb *iocb, struct inode *inode, 
+	const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
 	struct address_space *mapping = inode->i_mapping;
 	ssize_t retval;
 
-	retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
+	retval = mapping->a_ops->direct_IO(rw, iocb, inode, iov, 
+						offset, nr_segs);
 	if (inode->i_mapping->nrpages)
 		invalidate_inode_pages2(inode->i_mapping);
 	return retval;
diff -Naur -X dontdiff linux-2.5.35/fs/ext2/inode.c linux-2.5.35.aio/fs/ext2/inode.c
--- linux-2.5.35/fs/ext2/inode.c	Sun Sep 15 19:18:19 2002
+++ linux-2.5.35.aio/fs/ext2/inode.c	Tue Sep 17 09:00:26 2002
@@ -619,10 +619,10 @@
 }
 
 static int
-ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
-			loff_t offset, unsigned long nr_segs)
+ext2_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
+		const struct iovec *iov,loff_t offset, unsigned long nr_segs)
 {
-	return generic_direct_IO(rw, inode, iov,
+	return generic_direct_IO(rw, iocb, inode, iov,
 				offset, nr_segs, ext2_get_blocks);
 }
 
diff -Naur -X dontdiff linux-2.5.35/fs/ext3/inode.c linux-2.5.35.aio/fs/ext3/inode.c
--- linux-2.5.35/fs/ext3/inode.c	Sun Sep 15 19:18:41 2002
+++ linux-2.5.35.aio/fs/ext3/inode.c	Tue Sep 17 09:00:26 2002
@@ -1399,7 +1399,7 @@
  * If the O_DIRECT write is intantiating holes inside i_size and the machine
  * crashes then stale disk data _may_ be exposed inside the file.
  */
-static int ext3_direct_IO(int rw, struct inode *inode,
+static int ext3_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
@@ -1430,7 +1430,7 @@
 		}
 	}
 
-	ret = generic_direct_IO(rw, inode, iov, offset,
+	ret = generic_direct_IO(rw, iocb, inode, iov, offset,
 				nr_segs, ext3_direct_io_get_blocks);
 
 out_stop:
diff -Naur -X dontdiff linux-2.5.35/fs/jfs/inode.c linux-2.5.35.aio/fs/jfs/inode.c
--- linux-2.5.35/fs/jfs/inode.c	Sun Sep 15 19:18:15 2002
+++ linux-2.5.35.aio/fs/jfs/inode.c	Tue Sep 17 09:00:26 2002
@@ -309,10 +309,11 @@
 	return generic_block_bmap(mapping, block, jfs_get_block);
 }
 
-static int jfs_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
-			loff_t offset, unsigned long nr_segs)
+static int jfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
+			const struct iovec *iov, loff_t offset, 
+			unsigned long nr_segs)
 {
-	return generic_direct_IO(rw, inode, iov,
+	return generic_direct_IO(rw, iocb, inode, iov,
 				offset, nr_segs, jfs_get_blocks);
 }
 
diff -Naur -X dontdiff linux-2.5.35/include/linux/fs.h linux-2.5.35.aio/include/linux/fs.h
--- linux-2.5.35/include/linux/fs.h	Sun Sep 15 19:18:24 2002
+++ linux-2.5.35.aio/include/linux/fs.h	Tue Sep 17 09:04:21 2002
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/radix-tree.h>
 #include <linux/bitops.h>
+#include <linux/fs.h>
 
 #include <asm/atomic.h>
 
@@ -279,6 +280,7 @@
  */
 struct page;
 struct address_space;
+struct kiocb;
 
 struct address_space_operations {
 	int (*writepage)(struct page *);
@@ -307,7 +309,7 @@
 	int (*bmap)(struct address_space *, long);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
-	int (*direct_IO)(int, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+	int (*direct_IO)(int, struct kiocb *, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
 };
 
 struct backing_dev_info;
@@ -743,7 +745,6 @@
  * read, write, poll, fsync, readv, writev can be called
  *   without the big kernel lock held in all filesystems.
  */
-struct kiocb;
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
@@ -1248,10 +1249,10 @@
 				unsigned long nr_segs, loff_t *ppos);
 extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t);
 extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
-extern ssize_t generic_file_direct_IO(int rw, struct inode *inode, 
-	const struct iovec *iov, loff_t offset, unsigned long nr_segs);
-extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec 
-	*iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
+extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, struct inode *	 inode,	const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+extern int generic_direct_IO(int rw, struct kiocb *iocb, struct inode * inode, 
+	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
+	get_blocks_t *get_blocks);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
 	unsigned long nr_segs, loff_t *ppos);
 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 
diff -Naur -X dontdiff linux-2.5.35/mm/filemap.c linux-2.5.35.aio/mm/filemap.c
--- linux-2.5.35/mm/filemap.c	Sun Sep 15 19:18:26 2002
+++ linux-2.5.35.aio/mm/filemap.c	Tue Sep 17 09:00:26 2002
@@ -1153,7 +1153,7 @@
 				nr_segs = iov_shorten((struct iovec *)iov,
 							nr_segs, count);
 			}
-			retval = generic_file_direct_IO(READ, inode, 
+			retval = generic_file_direct_IO(READ, iocb, inode, 
 					iov, pos, nr_segs);
 			if (retval > 0)
 				*ppos = pos + retval;
@@ -1989,7 +1989,9 @@
 					   current iovec */
 	unsigned long	seg;
 	char		*buf;
+	struct	kiocb	kiocb;
 
+	init_sync_kiocb(&kiocb, file);
 	if (unlikely((ssize_t)count < 0))
 		return -EINVAL;
 
@@ -2099,8 +2101,11 @@
 		if (count != ocount)
 			nr_segs = iov_shorten((struct iovec *)iov,
 						nr_segs, count);
-		written = generic_file_direct_IO(WRITE, inode, 
+		written = generic_file_direct_IO(WRITE, &kiocb, inode, 
 					iov, pos, nr_segs);
+		if (written == -EIOCBQUEUED)
+			written = wait_on_sync_kiocb(&kiocb);
+		
 		if (written > 0) {
 			loff_t end = pos + written;
 			if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {



* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-17 21:03 [RFC] [PATCH] 2.5.35 patch for making DIO async Badari Pulavarty
@ 2002-09-18 11:51 ` Benjamin LaHaise
  2002-09-18 15:58   ` Badari Pulavarty
  0 siblings, 1 reply; 8+ messages in thread
From: Benjamin LaHaise @ 2002-09-18 11:51 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: linux-kernel, linux-aio

On Tue, Sep 17, 2002 at 02:03:02PM -0700, Badari Pulavarty wrote:
> Hi Ben,
> 
> Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
> wait for io completion. Waiting will be done at the higher level
> (same way generic_file_read does).

This looks pretty good.  Andrew Morton has had a look over it too and 
agrees that it should go in after a bit of testing.  Does someone want 
to try comparing throughput of dio based aio vs normal read/write?  It 
would be interesting to see what kind of an effect pipelining aios makes 
and if there are any performance niggles we need to squash.  Cheers,

		-ben


* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-18 11:51 ` Benjamin LaHaise
@ 2002-09-18 15:58   ` Badari Pulavarty
  2002-09-18 16:04     ` Benjamin LaHaise
  0 siblings, 1 reply; 8+ messages in thread
From: Badari Pulavarty @ 2002-09-18 15:58 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Badari Pulavarty, linux-kernel, linux-aio

> 
> On Tue, Sep 17, 2002 at 02:03:02PM -0700, Badari Pulavarty wrote:
> > Hi Ben,
> > 
> > Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
> > wait for io completion. Waiting will be done at the higher level
> > (same way generic_file_read does).
> 
> This looks pretty good.  Andrew Morton has had a look over it too and 
> agrees that it should go in after a bit of testing.  Does someone want 
> to try comparing throughput of dio based aio vs normal read/write?  It 
> would be interesting to see what kind of an effect pipelining aios makes 
> and if there are any performance niggles we need to squash.  Cheers,
> 
> 		-ben

Thanks for looking at the patch. The patch needed a few cleanups to get it
working. Here is the status so far:

1) I tested synchronous raw read/write. They perform almost the same as
   without this patch. I am looking at why I can't match the performance.
   (I get 380MB/sec on 40 dd's on 40 disks without this patch.
    I get 350MB/sec with this patch).

2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
   write perform well.

3) I am testing aio read/writes. I am using libaio 0.3.92.
   When I try aio_read/aio_write on a raw device, I get an OOPS.
   Can I use libaio 0.3.92 on 2.5? Are there any interface
   changes I need to worry about between 2.4 and 2.5?

Once I get aio read/writes working, I will provide you with the performance
numbers.

Thanks,
Badari



* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-18 15:58   ` Badari Pulavarty
@ 2002-09-18 16:04     ` Benjamin LaHaise
  2002-09-18 16:30       ` Badari Pulavarty
  0 siblings, 1 reply; 8+ messages in thread
From: Benjamin LaHaise @ 2002-09-18 16:04 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: linux-kernel, linux-aio

On Wed, Sep 18, 2002 at 08:58:22AM -0700, Badari Pulavarty wrote:
> Thanks for looking at the patch. The patch needed a few cleanups to get it
> working. Here is the status so far:
> 
> 1) I tested synchronous raw read/write. They perform almost the same as
>    without this patch. I am looking at why I can't match the performance.
>    (I get 380MB/sec on 40 dd's on 40 disks without this patch.
>     I get 350MB/sec with this patch).

Hmm, we should try to improve that.  Have you tried profiling a long run?

> 2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
>    write perform well.

That should be somewhat conditional, but for now it sounds like the right 
thing to do.
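
Just so we are talking about the same change, here is a minimal sketch of
what I assume it looks like (not the actual fs/aio.c code; the
ki_users/ki_user_data field names are assumptions from the aio patches of
this era):

ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
	while (iocb->ki_users) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!iocb->ki_users)
			break;
		blk_run_queues();	/* push queued bios to the drivers */
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return iocb->ki_user_data;	/* result stashed by aio_complete() */
}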

> 3) I am testing aio read/writes. I am using libaio 0.3.92.
>    When I try aio_read/aio_write on a raw device, I get an OOPS.
>    Can I use libaio 0.3.92 on 2.5? Are there any interface
>    changes I need to worry about between 2.4 and 2.5?

libaio 0.3.92 should work on 2.5.  Could you send me a copy of the 
decoded Oops?

		-ben
-- 
GMS rules.


* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-18 16:04     ` Benjamin LaHaise
@ 2002-09-18 16:30       ` Badari Pulavarty
  2002-09-18 20:47         ` Badari Pulavarty
  0 siblings, 1 reply; 8+ messages in thread
From: Badari Pulavarty @ 2002-09-18 16:30 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Badari Pulavarty, linux-kernel, linux-aio

> 
> On Wed, Sep 18, 2002 at 08:58:22AM -0700, Badari Pulavarty wrote:
> > Thanks for looking at the patch. The patch needed a few cleanups to get it
> > working. Here is the status so far:
> > 
> > 1) I tested synchronous raw read/write. They perform almost the same as
> >    without this patch. I am looking at why I can't match the performance.
> >    (I get 380MB/sec on 40 dd's on 40 disks without this patch.
> >     I get 350MB/sec with this patch).
> 
> Hmm, we should try to improve that.  Have you tried profiling a long run?


I took a few shortcuts in the patch compared to the original DIO code;
e.g., I create a new bio every time, whereas the original code reuses them.
But I don't think that is causing the performance hit. Anyway, here is
the vmstat and profile=2 output.

vmstat:

 0 40  2      0 3804628      0  27396   0   0 344909   192 3786  3536   0   5  95
 0 40  1      0 3803268      0  28628   0   0 345686   208 3793  3578   0   5  95
 0 40  1      0 3802452      0  29404   0   0 346983   210 3806  3619   0   5  95
 0 40  1      0 3801888      0  29932   0   0 345178   203 3786  3500   0   5  95
 0 40  1      0 3801804      0  29996   0   0 345792   196 3791  3510   0   5  95
 0 40  1      0 3801652      0  30100   0   0 346048   200 3800  3556   0   5  95

profile:

146796 default_idle                             2293.6875
  1834 __scsi_end_request                        10.4205
  1201 do_softirq                                 6.2552
   891 scsi_dispatch_cmd                          2.3203
    50 __wake_up                                  1.0417
   599 get_user_pages                             0.7341
   362 blk_rq_map_sg                              0.6856
   420 do_direct_IO                               0.5707
    25 dio_prep_bio                               0.5208
   562 __make_request                             0.4878
    55 __scsi_release_command                     0.4167
     6 cap_file_permission                        0.3750
    63 release_console_sem                        0.3580
    55 do_aic7xxx_isr                             0.3438
   141 bio_alloc                                  0.3389
    45 scsi_finish_command                        0.2557
     8 scsi_sense_valid                           0.2500
    16 dio_new_bio                                0.2500
   206 schedule                                   0.2341

> 
> > 2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
> >    write perform well.
> 
> That should be somewhat conditional, but for now it sounds like the right 
> thing to do.
> 
> > 3) I am testing aio read/writes. I am using libaio 0.3.92.
> >    When I try aio_read/aio_write on a raw device, I get an OOPS.
> >    Can I use libaio 0.3.92 on 2.5? Are there any interface
> >    changes I need to worry about between 2.4 and 2.5?
> 
> libaio 0.3.92 should work on 2.5.  Could you send me a copy of the 
> decoded Oops?
> 

My Bad :)

aio_read/aio_write() routines take loff_t, not the loff_t * that the regular read/write routines take:

        ssize_t (*read) (struct file *, char *, size_t, loff_t *);
        ssize_t (*aio_read) (struct kiocb *, char *, size_t, loff_t);

I had coded for loff_t *; fixed it. I am testing it now.
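
For reference, a sketch of the corrected wrapper (an assumption based on
the prototype above and the rw_raw_aio_dev() helper from the patch, not
necessarily the exact code I am testing):

static ssize_t
raw_aio_read(struct kiocb *iocb, char *buf, size_t size, loff_t offset)
{
	struct iovec local_iov = { .iov_base = buf, .iov_len = size };

	/* the offset now arrives by value, so pass its address down */
	return rw_raw_aio_dev(READ, iocb, &local_iov, 1, &offset);
}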

Thanks,
Badari




* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-18 16:30       ` Badari Pulavarty
@ 2002-09-18 20:47         ` Badari Pulavarty
  2002-09-19 10:52           ` Suparna Bhattacharya
  0 siblings, 1 reply; 8+ messages in thread
From: Badari Pulavarty @ 2002-09-18 20:47 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Benjamin LaHaise, Badari Pulavarty, linux-kernel, linux-aio

Ben,

aio_read/aio_write() are now working with a minor fix to fs/aio.c

io_submit_one():
	
	if (likely(EIOCBQUEUED == ret))

		needs to be changed to

	if (likely(-EIOCBQUEUED == ret))
		  ^^^


I was wondering what happens in the following case (I think this
happened in my test program).

Let's say I did a sys_io_submit() and my test program did exit().
When the I/O completion happened, it tried to do the following and got
an OOPS in aio_complete():

	if (ctx == &ctx->mm->default_kioctx) { 

I think "mm" is freed up when the process exits. Do you think this is
possible? How do we handle this?
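
To spell out the sequence I suspect (an illustration only, pieced together
from the patch and the OOPS, not actual code):

	/*
	 *  submitter                      completion (interrupt time)
	 *  ---------                      ---------------------------
	 *  sys_io_submit()
	 *    direct_io_worker() queues
	 *    the bios, -EIOCBQUEUED
	 *  process exit()s, mm torn down
	 *                                 dio_bio_end_aio()
	 *                                   aio_complete(dio->iocb, ...)
	 *                                     touches ctx->mm->default_kioctx
	 *                                     in the freed mm -> OOPS
	 */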

- Badari


* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-18 20:47         ` Badari Pulavarty
@ 2002-09-19 10:52           ` Suparna Bhattacharya
  2002-09-19 11:36             ` Suparna Bhattacharya
  0 siblings, 1 reply; 8+ messages in thread
From: Suparna Bhattacharya @ 2002-09-19 10:52 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Benjamin LaHaise, linux-kernel, linux-aio

On Wed, Sep 18, 2002 at 01:47:06PM -0700, Badari Pulavarty wrote:
> Ben,
> 
> aio_read/aio_write() are now working with a minor fix to fs/aio.c
> 
> io_submit_one():
> 	
> 	if (likely(EIOCBQUEUED == ret))
> 
> 		needs to be changed to
> 
> 	if (likely(-EIOCBQUEUED == ret))
> 		  ^^^
> 
> 
> I was wondering what happens in the following case (I think this
> happened in my test program).
> 
> Let's say I did a sys_io_submit() and my test program did exit().
> When the I/O completion happened, it tried to do the following and got
> an OOPS in aio_complete():
> 
> 	if (ctx == &ctx->mm->default_kioctx) { 
> 
> I think "mm" is freed up when the process exits. Do you think this is
> possible? How do we handle this?

Do you see this only in the sync case?
init_sync_kiocb ought to increment ctx->reqs_active, so that
exit_aio waits for the iocbs to complete.
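
Roughly, what I have in mind (just a sketch, not actual fs/aio.c code; it
assumes the kiocb carries a ki_ctx pointer and ignores locking around
reqs_active):

	/* at sync-kiocb setup time */
	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_ctx->reqs_active++;	/* so exit_aio() waits for this iocb */

	/* ...and the completion side does the matching decrement once
	 * aio_complete() has delivered the result */
	ctx->reqs_active--;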

Regards
Suparna


* Re: [RFC] [PATCH] 2.5.35 patch for making DIO async
  2002-09-19 10:52           ` Suparna Bhattacharya
@ 2002-09-19 11:36             ` Suparna Bhattacharya
  0 siblings, 0 replies; 8+ messages in thread
From: Suparna Bhattacharya @ 2002-09-19 11:36 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Benjamin LaHaise, linux-kernel, linux-aio

On Thu, Sep 19, 2002 at 04:22:14PM +0530, Suparna Bhattacharya wrote:
> On Wed, Sep 18, 2002 at 01:47:06PM -0700, Badari Pulavarty wrote:
> > Ben,
> > 
> > aio_read/aio_write() are now working with a minor fix to fs/aio.c
> > 
> > io_submit_one():
> > 	
> > 	if (likely(EIOCBQUEUED == ret))
> > 
> > 		needs to be changed to
> > 
> > 	if (likely(-EIOCBQUEUED == ret))
> > 		  ^^^
> > 
> > 
> > I was wondering what happens in the following case (I think this
> > happened in my test program).
> > 
> > Let's say I did a sys_io_submit() and my test program did exit().
> > When the I/O completion happened, it tried to do the following and got
> > an OOPS in aio_complete():
> > 
> > 	if (ctx == &ctx->mm->default_kioctx) { 
> > 
> > I think "mm" is freed up when the process exits. Do you think this is
> > possible? How do we handle this?
> 
> Do you see this only in the sync case?
> init_sync_kiocb ought to increment ctx->reqs_active, so that
> exit_aio waits for the iocbs to complete.

Sorry, I guess in the sync case exit_aio shouldn't happen, since
the current thread still has a reference to the mm.

And your problem is with the io_submit case ... I'll have to look more
closely to find out why.

> 
> Regards
> Suparna

