From: Chuck Lever
Date: 2002-10-07  2:47 UTC
To: Linus Torvalds
Cc: Linux Kernel Mailing List, Linux NFS List
Subject: [PATCH] initial support for NFS direct I/O for 2.5

hi Linus-

this patch adds initial support for NFS direct I/O in the 2.5 kernel.  
many have asked for this support to be included in 2.5.  this patch does 
not provide working NFS direct I/O, but i'm sending what i have now so 
that it can be included before October 20.

NFS direct I/O is enabled by its very own kernel config option.  when 
the option is enabled, the NFS client deliberately won't build, to keep 
people from running this unfinished code and possibly corrupting their 
NFS files.  later i will send a patch that finishes the implementation.
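
for illustration, here is a minimal user-space sketch of what the
finished feature is meant to support (the mount point is hypothetical,
and error handling is trimmed).  unlike local disk-based file systems,
NFS O_DIRECT imposes no buffer alignment restrictions, and with the
config option disabled the open() below simply fails with EINVAL:

	#define _GNU_SOURCE		/* expose O_DIRECT in glibc */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[8192];		/* no alignment needed on NFS */
		int fd = open("/mnt/nfs/data", O_RDONLY | O_DIRECT);

		if (fd < 0) {
			perror("open");	/* EINVAL if CONFIG_NFS_DIRECTIO is off */
			return 1;
		}
		if (read(fd, buf, sizeof(buf)) < 0)
			perror("read");
		close(fd);
		return 0;
	}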

this patch is against 2.5.40.
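
for anyone who wants to experiment once the implementation is finished,
the option appears alongside the other NFS client options; a .config
fragment along these lines (per the Config.in hunk below, it depends on
CONFIG_NFS_FS and CONFIG_EXPERIMENTAL) turns it on:

	CONFIG_EXPERIMENTAL=y
	CONFIG_NFS_FS=y
	CONFIG_NFS_DIRECTIO=y

with this patch alone, of course, enabling it just stops the NFS client
from building, as described above.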

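to make the request splitting in nfs_iov2pagelist concrete: with
4096-byte pages, a single 10000-byte iovec whose base address sits 512
bytes into a page pins three pages and generates three page requests of
3584, 4096, and 2320 bytes; nfs_pagein_list and nfs_flush_list then
coalesce those into rsize- and wsize-sized RPCs.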

diff -drN -U2 07-odirect2/fs/Config.help 08-odirect3/fs/Config.help
--- 07-odirect2/fs/Config.help	Tue Oct  1 03:07:34 2002
+++ 08-odirect3/fs/Config.help	Sat Oct  5 22:11:51 2002
@@ -515,4 +515,25 @@
   If unsure, say N.
 
+CONFIG_NFS_DIRECTIO
+  This option enables applications to perform uncached I/O on files
+  in NFS file systems using the O_DIRECT open() flag.  When O_DIRECT
+  is set for a file, its data is not cached in the system's page
+  cache.  Data is moved to and from user-level application buffers
+  directly.  Unlike local disk-based file systems, NFS O_DIRECT has
+  no alignment restrictions.
+
+  Unless your program is designed to use O_DIRECT properly, you are
+  much better off allowing the NFS client to manage data caching for
+  you.  Misusing O_DIRECT can cause poor server performance or network
+  storms.  This kernel build option defaults OFF to avoid exposing
+  system administrators unwittingly to a potentially hazardous
+  feature.
+
+  For more details on NFS O_DIRECT, see fs/nfs/direct.c.
+
+  If unsure, say N.  This reduces the size of the NFS client, and
+  causes open() to return EINVAL if a file residing in NFS is
+  opened with the O_DIRECT flag.
+
 CONFIG_ROOT_NFS
   If you want your Linux box to mount its whole root file system (the
diff -drN -U2 07-odirect2/fs/Config.in 08-odirect3/fs/Config.in
--- 07-odirect2/fs/Config.in	Tue Oct  1 03:07:10 2002
+++ 08-odirect3/fs/Config.in	Fri Oct  4 15:30:34 2002
@@ -119,4 +119,5 @@
    dep_mbool '  Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS
    dep_bool '  Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP
+   dep_mbool '  Allow direct I/O on NFS files (EXPERIMENTAL)' CONFIG_NFS_DIRECTIO $CONFIG_NFS_FS $CONFIG_EXPERIMENTAL
 
    dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET
diff -drN -U2 07-odirect2/fs/nfs/Makefile 08-odirect3/fs/nfs/Makefile
--- 07-odirect2/fs/nfs/Makefile	Tue Oct  1 03:06:22 2002
+++ 08-odirect3/fs/nfs/Makefile	Fri Oct  4 15:30:34 2002
@@ -9,4 +9,5 @@
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
+nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-objs		:= $(nfs-y)
 
diff -drN -U2 07-odirect2/fs/nfs/direct.c 08-odirect3/fs/nfs/direct.c
--- 07-odirect2/fs/nfs/direct.c	Wed Dec 31 19:00:00 1969
+++ 08-odirect3/fs/nfs/direct.c	Sat Oct  5 22:07:02 2002
@@ -0,0 +1,263 @@
+/*
+ * linux/fs/nfs/direct.c
+ *
+ * Copyright (C) 2001 by Chuck Lever <cel@netapp.com>
+ *
+ * High-performance uncached I/O for the Linux NFS client
+ *
+ * There are important applications whose performance or correctness
+ * depends on uncached access to file data.  Database clusters
+ * (multiple copies of the same instance running on separate hosts) 
+ * implement their own cache coherency protocol that subsumes file
+ * system cache protocols.  Applications that process datasets 
+ * considerably larger than the client's memory do not always benefit 
+ * from a local cache.  A streaming video server, for instance, has no 
+ * need to cache the contents of a file.
+ *
+ * When an application requests uncached I/O, all read and write requests
+ * are made directly to the server; data stored or fetched via these
+ * requests is not cached in the Linux page cache.  The client does not
+ * correct unaligned requests from applications.  All requested bytes are
+ * held on permanent storage before a direct write system call returns to
+ * an application.
+ *
+ * Solaris implements an uncached I/O facility called directio() that
+ * is used for backups and sequential I/O to very large files.  Solaris
+ * also supports uncaching whole NFS partitions with "-o forcedirectio,"
+ * an undocumented mount option.
+ *
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
+ *
+ * 18 Dec 2001	Initial implementation for 2.4  --cel
+ * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
+ * 24 Sep 2002	Rewrite to use asynchronous RPCs, port to 2.5  --cel
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
+#define VERF_SIZE		(2 * sizeof(__u32))
+
+
+/**
+ * nfs_get_user_pages - find and set up the pages underlying a user buffer
+ * @addr: user-space address of target buffer
+ * @size: total size in bytes of target buffer
+ * @pages: returned array of page struct pointers underlying target buffer
+ * @rw: direction of the I/O (READ or WRITE)
+ */
+static inline int
+nfs_get_user_pages(unsigned long addr, size_t size,
+		struct page ***pages, int rw)
+{
+	int result = -ENOMEM;
+	unsigned page_count =
+		((addr & ~PAGE_MASK) + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned array_size = page_count * sizeof(struct page *);
+
+	*pages = (struct page **) kmalloc(array_size, GFP_KERNEL);
+	if (*pages) {
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, addr,
+					page_count, (rw == READ), 0,
+					*pages, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (result < 0)
+			printk(KERN_ERR "%s: get_user_pages result %d\n",
+					__FUNCTION__, result);
+	}
+	return result;
+}
+
+/**
+ * nfs_free_user_pages - tear down page struct array
+ * @pages: array of page struct pointers underlying target buffer
+ * @count: number of pages in the array to release
+ */
+static inline void
+nfs_free_user_pages(struct page **pages, unsigned count)
+{
+	unsigned page = 0;
+
+	while (count--)
+		page_cache_release(pages[page++]);
+
+	kfree(pages);
+}
+
+/**
+ * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
+ * @rw: READ or WRITE
+ * @inode: inode of target file
+ * @cred: credentials of user who requested I/O
+ * @iov: array of vectors that define I/O buffer
+ * @offset: where in the file to begin the I/O
+ * @nr_segs: size of iovec array
+ * @requests: append new page requests to this list head
+ */
+static int
+nfs_iov2pagelist(int rw, const struct inode *inode,
+		const struct rpc_cred *cred,
+		const struct iovec *iov, loff_t offset,
+		unsigned long nr_segs, struct list_head *requests)
+{
+	unsigned seg;
+	int tot_bytes = 0;
+	struct page **pages;
+
+	/* for each iovec in the array... */
+	for (seg = 0; seg < nr_segs; seg++) {
+		const unsigned long user_addr =
+					(unsigned long) iov[seg].iov_base;
+		size_t bytes = iov[seg].iov_len;
+		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
+		int page_count, page = 0;
+
+		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
+		if (page_count < 0) {
+			nfs_release_list(requests);
+			return page_count;
+		}
+
+		/* ...build as many page requests as required */
+		while (bytes > 0) {
+			struct nfs_page *new;
+			const unsigned int pg_bytes = min_t(unsigned int,
+						bytes, PAGE_SIZE - pg_offset);
+
+			new = nfs_create_request((struct rpc_cred *) cred,
+						 (struct inode *) inode,
+						 pages[page],
+						 pg_offset, pg_bytes);
+			if (IS_ERR(new)) {
+				nfs_free_user_pages(pages, page_count);
+				nfs_release_list(requests);
+				return PTR_ERR(new);
+			}
+			new->wb_index = offset;
+			nfs_list_add_request(new, requests);
+
+			/* only the first page may be partial */
+			pg_offset = 0;
+			offset += pg_bytes;
+			tot_bytes += pg_bytes;
+			bytes -= pg_bytes;
+			page++;
+		}
+
+		/* don't release pages here -- I/O completion will do that */
+		nfs_free_user_pages(pages, 0);
+	}
+
+	return tot_bytes;
+}
+
+/**
+ * do_nfs_direct_IO - Read or write data without caching
+ * @rw: READ or WRITE
+ * @inode: inode of target file
+ * @cred: credentials of user who requested I/O
+ * @iov: array of vectors that define I/O buffer
+ * @offset: where in the file to begin the I/O
+ * @nr_segs: size of iovec array
+ *
+ * Break the passed-in iovec into a series of page-sized or smaller
+ * requests, where each page is mapped for direct user-land I/O.
+ *
+ * For each of these pages, create an NFS page request and
+ * append it to an automatic list of page requests.
+ *
+ * When all page requests have been queued, start the I/O on the
+ * whole list.  The underlying routines coalesce the pages on the
+ * list into a bunch of asynchronous "r/wsize" network requests.
+ *
+ * I/O completion automatically unmaps and releases the pages.
+ */
+static int
+do_nfs_direct_IO(int rw, const struct inode *inode,
+		const struct rpc_cred *cred, const struct iovec *iov,
+		loff_t offset, unsigned long nr_segs)
+{
+	LIST_HEAD(requests);
+	int result, tot_bytes;
+
+	result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
+								&requests);
+	if (result < 0)
+		return result;
+	tot_bytes = result;
+
+	switch (rw) {
+	case READ:
+		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
+			result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
+			break;
+		}
+		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
+		nfs_wait_for_reads(&requests);
+		break;
+	case WRITE:
+		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE))
+			result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
+		else
+			result = nfs_flush_list(&requests,
+					NFS_SERVER(inode)->wpages, FLUSH_WAIT);
+
+		/* invalidate cache so non-direct readers pick up changes */
+		invalidate_inode_pages((struct inode *) inode);
+		break;
+	default:
+		result = -EINVAL;
+		break;
+	}
+
+	if (result < 0)
+		return result;
+	return tot_bytes;
+}
+
+/**
+ * nfs_direct_IO - NFS address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @file: file struct of target file
+ * @iov: array of vectors that define I/O buffer
+ * @offset: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * Note that the VFS layer no longer holds the inode's i_sem when it
+ * calls this function to do a write.
+ */
+int
+nfs_direct_IO(int rw, struct file *file, const struct iovec *iov,
+		loff_t offset, unsigned long nr_segs)
+{
+	/* None of this works yet, so prevent it from compiling. */
+#if 0
+	int result;
+	struct dentry *dentry = file->f_dentry;
+	const struct inode *inode = dentry->d_inode->i_mapping->host;
+	const struct rpc_cred *cred = nfs_file_cred(file);
+#endif
+
+	dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
+				((rw == READ) ? "READ" : "WRITE"),
+				dentry->d_parent->d_name.name,
+				dentry->d_name.name, offset, nr_segs);
+
+	result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);
+
+	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+
+	return result;
+}
diff -drN -U2 07-odirect2/fs/nfs/file.c 08-odirect3/fs/nfs/file.c
--- 07-odirect2/fs/nfs/file.c	Fri Oct  4 15:28:35 2002
+++ 08-odirect3/fs/nfs/file.c	Fri Oct  4 15:30:34 2002
@@ -200,5 +200,8 @@
 	.writepage = nfs_writepage,
 	.prepare_write = nfs_prepare_write,
-	.commit_write = nfs_commit_write
+	.commit_write = nfs_commit_write,
+#ifdef CONFIG_NFS_DIRECTIO
+	.direct_IO = nfs_direct_IO,
+#endif
 };
 
diff -drN -U2 07-odirect2/fs/nfs/pagelist.c 08-odirect3/fs/nfs/pagelist.c
--- 07-odirect2/fs/nfs/pagelist.c	Fri Oct  4 15:28:35 2002
+++ 08-odirect3/fs/nfs/pagelist.c	Fri Oct  4 15:30:34 2002
@@ -177,4 +177,24 @@
 
 /**
+ * nfs_release_list - cleanly dispose of an unattached list of page requests
+ * @list: list of doomed page requests
+ */
+void
+nfs_release_list(struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct nfs_page *req = nfs_list_entry(list);
+
+		nfs_list_remove_request(req);
+
+		page_cache_release(req->wb_page);
+
+		/* Release struct file or cached credential */
+		nfs_clear_request(req);
+		nfs_page_free(req);
+	}
+}
+
+/**
  * nfs_list_add_request - Insert a request into a sorted list
  * @req: request
@@ -224,4 +244,37 @@
 }
 
+/**
+ * nfs_wait_for_reads - wait for outstanding requests to complete
+ * @head: list of page requests to wait for
+ */
+int
+nfs_wait_for_reads(struct list_head *head)
+{
+	struct list_head *p = head->next;
+	unsigned int res = 0;
+
+	while (p != head) {
+		struct nfs_page *req = nfs_list_entry(p);
+		int error;
+
+		if (!NFS_WBACK_BUSY(req)) {
+			p = p->next;
+			continue;
+		}
+
+		req->wb_count++;
+		error = nfs_wait_on_request(req);
+		if (error < 0)
+			return error;
+		nfs_list_remove_request(req);
+		nfs_clear_request(req);
+		nfs_page_free(req);
+
+		p = head->next;
+		res++;
+	}
+	return res;
+}
+
 /**
  * nfs_coalesce_requests - Split coalesced requests out from a list.
diff -drN -U2 07-odirect2/include/linux/nfs_page.h 08-odirect3/include/linux/nfs_page.h
--- 07-odirect2/include/linux/nfs_page.h	Fri Oct  4 15:28:35 2002
+++ 08-odirect3/include/linux/nfs_page.h	Fri Oct  4 15:30:34 2002
@@ -49,4 +49,5 @@
 extern	void nfs_clear_request(struct nfs_page *req);
 extern	void nfs_release_request(struct nfs_page *req);
+extern	void nfs_release_list(struct list_head *list);
 
 
@@ -60,4 +61,5 @@
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);
+extern	int nfs_wait_for_reads(struct list_head *);
 
 extern	spinlock_t nfs_wreq_lock;
diff -drN -U2 07-odirect2/include/linux/nfs_xdr.h 08-odirect3/include/linux/nfs_xdr.h
--- 07-odirect2/include/linux/nfs_xdr.h	Tue Oct  1 03:07:35 2002
+++ 08-odirect3/include/linux/nfs_xdr.h	Fri Oct  4 15:30:34 2002
@@ -2,4 +2,6 @@
 #define _LINUX_NFS_XDR_H
 
+#include <linux/sunrpc/xprt.h>
+
 struct nfs_fattr {
 	unsigned short		valid;		/* which fields are valid */
@@ -58,8 +60,12 @@
 };
 
-/* Arguments to the read call.
- * Note that NFS_READ_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h
+/*
+ * Arguments to the read call.
  */
-#define NFS_READ_MAXIOV 8
+
+#define NFS_READ_MAXIOV		(9U)
+#if (NFS_READ_MAXIOV > (MAX_IOVEC - 2))
+#error "NFS_READ_MAXIOV is too large"
+#endif
 
 struct nfs_readargs {
@@ -77,8 +83,12 @@
 };
 
-/* Arguments to the write call.
- * Note that NFS_WRITE_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h
+/*
+ * Arguments to the write call.
  */
-#define NFS_WRITE_MAXIOV        8
+#define NFS_WRITE_MAXIOV	(9U)
+#if (NFS_WRITE_MAXIOV > (MAX_IOVEC - 2))
+#error "NFS_WRITE_MAXIOV is too large"
+#endif
+
 struct nfs_writeargs {
 	struct nfs_fh *		fh;

-- 

corporate:	<cel at netapp dot com>
personal:	<chucklever at bigfoot dot com>

