From: Dan Smith <danms@us.ibm.com>
To: device-mapper development <dm-devel@redhat.com>
Subject: Re: [PATCH 1/2] Add userspace device-mapper target
Date: Wed, 21 Feb 2007 13:35:46 -0800	[thread overview]
Message-ID: <m3ejojfb2l.fsf@guaranine.beaverton.ibm.com> (raw)
In-Reply-To: 20070220085522I.fujita.tomonori@lab.ntt.co.jp

FT> your original code used this logic, didn't it? I guess that the
FT> code doesn't give notable performance difference even if
FT> dmu_ctl_write checks new requests and calls bio_map_user if
FT> necessary.

I've been thinking about it.  We don't do too much in dmu_rxd(), so
just processing it in the write() may be ok.  We farm out some things
to kthreads anyway.  Perhaps we can proceed with processing the
requests in the write() now, and then optimize later if we can show
conclusively that it would help.
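
To make the write()-as-doorbell flow concrete, here is a rough sketch of
what the userspace side might look like: mmap() the two rings (TX first,
then RX), consume requests the kernel marks active, post a response, and
use write() purely as a doorbell so the kernel picks up the RX ring right
away.  The /dev node name, the 4096-byte page size, and direct use of the
new header from userspace are assumptions on my part, and error handling
is omitted:

/* Rough sketch only -- not part of the patch below */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/mman.h>
#include <linux/dm-userspace.h>

#define RING_SIZE (1UL << 16)            /* DMU_RING_SIZE          */
#define PAGE_SZ   4096UL                 /* assumed PAGE_SIZE      */
#define PER_PAGE  (PAGE_SZ / sizeof(struct dmu_msg))
#define MAX_MSGS  (PER_PAGE * (RING_SIZE / PAGE_SZ))

/* Same page/offset math as dmu_head_msg() in the patch */
static struct dmu_msg *slot(void *ring, uint32_t idx)
{
	return (struct dmu_msg *)((char *)ring +
				  (idx / PER_PAGE) * PAGE_SZ +
				  (idx % PER_PAGE) * sizeof(struct dmu_msg));
}

int main(void)
{
	int fd = open("/dev/dmu0", O_RDWR);  /* assumed device node    */
	char *tx = mmap(NULL, RING_SIZE * 2, PROT_READ | PROT_WRITE,
			MAP_SHARED, fd, 0);  /* TX ring, then RX ring  */
	char *rx = tx + RING_SIZE;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	uint32_t tx_idx = 0, rx_idx = 0;
	char kick = 0;

	for (;;) {
		struct dmu_msg *req = slot(tx, tx_idx);
		struct dmu_msg *rsp;

		while (!req->hdr.status)
			poll(&pfd, 1, -1);

		/* ... decide on a mapping for req->payload.map_req ... */

		rsp = slot(rx, rx_idx);
		memset(rsp, 0, sizeof(*rsp));
		rsp->hdr.msg_type = DM_USERSPACE_MAP_BLOCK_RESP;
		rsp->payload.map_rsp.id_of_req = req->hdr.id;
		/* fill in new_block, offset, dst_maj/dst_min, flags */
		rsp->hdr.status = 1;

		req->hdr.status = 0;         /* hand the TX slot back  */
		tx_idx = (tx_idx + 1) % MAX_MSGS;
		rx_idx = (rx_idx + 1) % MAX_MSGS;

		write(fd, &kick, 1);         /* kernel drains RX now   */
	}
}

The index math has to match dmu_head_msg(), since messages are packed
per page rather than laid out linearly across the whole mapping.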

FT> This path isn't easy,

I was afraid of that :)

Here is a patch that does the processing in the write() and supports
userspace bio mapping.  Trivial test cases work well for me.  I'll
have to hack on cowd quite a bit before I can test it in a real case.
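
For the userspace bio mapping piece, the map response can point at an
array of dmu_extra_write descriptors, which the kernel copies in and
hands to bio_map_user().  Continuing the sketch above, attaching one
such write might look roughly like this; the buffer alignment and the
rule that it must stay valid until the extra bio completes are my
assumptions:

/* Sketch only: attach one extra write to a map response */
static char extra_buf[4096] __attribute__((aligned(4096)));
static struct dmu_extra_write extra;

static void attach_extra_write(struct dmu_msg *rsp, uint64_t dst_sector)
{
	extra.buf    = (uint64_t)(unsigned long)extra_buf; /* user address  */
	extra.len    = sizeof(extra_buf);                  /* length, bytes */
	extra.offset = dst_sector;                         /* target sector */

	rsp->payload.map_rsp.extra_writes =
		(uint64_t)(unsigned long)&extra;           /* array base    */
	rsp->payload.map_rsp.extra_count  = 1;
}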

Note that this patch is still very chatty with debug messages :)

-- 
Dan Smith
IBM Linux Technology Center
Open Hypervisor Team
email: danms@us.ibm.com

Signed-off-by: Dan Smith <danms@us.ibm.com>
diff -r 165c54942fb4 -r 5e2a821c0dff drivers/md/Kconfig
--- a/drivers/md/Kconfig	Tue Feb 20 12:14:32 2007 -0800
+++ b/drivers/md/Kconfig	Wed Feb 21 13:33:36 2007 -0800
@@ -236,6 +236,12 @@ config DM_SNAPSHOT
        ---help---
          Allow volume managers to take writable snapshots of a device.
 
+config DM_USERSPACE
+       tristate "Userspace target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       ---help---
+         A target that provides a userspace interface to device-mapper
+
 config DM_MIRROR
        tristate "Mirror target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
diff -r 165c54942fb4 -r 5e2a821c0dff drivers/md/Makefile
--- a/drivers/md/Makefile	Tue Feb 20 12:14:32 2007 -0800
+++ b/drivers/md/Makefile	Wed Feb 21 13:33:36 2007 -0800
@@ -14,6 +14,8 @@ raid456-objs	:= raid5.o raid6algos.o rai
 		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
 		   raid6altivec8.o \
 		   raid6mmx.o raid6sse1.o raid6sse2.o
+dm-user-objs    := dm-userspace.o dm-userspace-chardev.o \
+		   dm-userspace-cache.o
 hostprogs-y	:= mktables
 
 # Note: link order is important.  All raid personalities
@@ -36,6 +38,7 @@ obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_USERSPACE)      += dm-user.o
 
 quiet_cmd_unroll = UNROLL  $@
       cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
diff -r 165c54942fb4 -r 5e2a821c0dff drivers/md/dm-user.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/md/dm-user.h	Wed Feb 21 13:33:36 2007 -0800
@@ -0,0 +1,182 @@
+/*
+ * Copyright IBM Corp., 2006
+ * Author: Dan Smith <danms@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef __DM_USER_H
+#define __DM_USER_H
+
+#include <linux/dm-userspace.h>
+
+#include <linux/hardirq.h>
+#include <linux/slab.h>
+
+#include "dm-bio-list.h"
+
+#define DMU_KEY_LEN 256
+
+extern struct target_type userspace_target;
+extern mempool_t *request_pool;
+extern dev_t dmu_dev;
+extern spinlock_t devices_lock;
+extern struct list_head devices;
+
+struct dmu_mappings;
+
+#define DMU_CP_HASH 1024
+
+/*
+ * A block device that we can send bios to
+ */
+struct target_device {
+	struct list_head list;        /* Our place in the targets list      */
+	struct block_device *bdev;    /* The target block_device            */
+	struct kref users;            /* Self-destructing reference count   */
+};
+
+/*
+ * A dm-userspace device, which consists of multiple targets sharing a
+ * common key
+ */
+struct dmu_device {
+	struct list_head list;        /* Our place in the devices list     */
+
+	spinlock_t lock;              /* Protects all the fields below     */
+
+	/* We need to protect the TX/RX lists with a separate lock that is
+	 * always used with IRQs disabled because it is locked from
+	 * inside the endio function
+	 */
+	spinlock_t xmit_lock;
+	struct list_head tx_requests; /* Requests to send to userspace     */
+	struct list_head *rx_requests; /* Requests waiting for reply        */
+
+	struct dmu_mappings *mappings;
+
+	/* Accounting */
+	atomic_t t_reqs;              /* Waiting to be sent to userspace   */
+	atomic_t r_reqs;              /* Waiting for a response from uspace*/
+	atomic_t f_reqs;              /* Submitted, waiting for endio      */
+	atomic_t total;               /* Total requests allocated          */
+
+	atomic_t idcounter;           /* Counter for making request IDs    */
+
+	struct list_head target_devs; /* List of devices we can target     */
+
+	void *transport_private;      /* Private data for userspace comms  */
+
+	char key[DMU_KEY_LEN];        /* Unique name string for device     */
+	struct kref users;            /* Self-destructing reference count  */
+
+	wait_queue_head_t lowmem;     /* To block while waiting for memory */
+
+	uint64_t block_size;          /* Block size for this device        */
+	uint64_t block_mask;          /* Mask for offset in block          */
+	unsigned int block_shift;     /* Shift to convert to/from block    */
+
+	struct kcopyd_client *kcopy;  /* Interface to kcopyd               */
+
+	unsigned int request_slots;   /* Max number of reqs we will queue  */
+};
+
+struct dmu_request {
+	struct list_head list;        /* Our place on the request queue    */
+	struct list_head copy;        /* Our place on the copy list        */
+	struct dmu_device *dev;       /* The DMU device that owns us       */
+
+	struct block_device *target_dev;
+
+	int type;                     /* Type of request                   */
+	uint32_t flags;               /* Attribute flags                   */
+	uint64_t id;                  /* Unique ID for sync with userspace */
+	union {
+		uint64_t block;       /* The block in question             */
+	} u;
+
+	struct list_head deps;        /* Requests depending on this one    */
+	struct bio *bio;              /* The bio this request represents   */
+
+	struct work_struct task;      /* Async task to run for this req    */
+
+	struct dmu_msg_map_response response; /* FIXME: Clean this up      */
+
+	struct task_struct *controlling_task;
+	struct bio_list extra_bios;
+	atomic_t extra_finished;
+};
+
+
+extern void add_tx_request(struct dmu_device *dev, struct dmu_request *req);
+extern void endio_worker(struct work_struct *work);
+
+/* Find and grab a reference to a target device */
+struct target_device *find_target(struct dmu_device *dev,
+				  dev_t devno);
+/* Character device transport functions */
+int register_chardev_transport(struct dmu_device *dev);
+void unregister_chardev_transport(struct dmu_device *dev);
+int init_chardev_transport(void);
+void cleanup_chardev_transport(void);
+void write_chardev_transport_info(struct dmu_device *dev,
+				  char *buf, unsigned int maxlen);
+
+/* Return the block number for @sector */
+static inline u64 dmu_block(struct dmu_device *dev,
+			    sector_t sector)
+{
+	return sector >> dev->block_shift;
+}
+
+/* Return the sector offset in a block for @sector */
+static inline u64 dmu_sector_offset(struct dmu_device *dev,
+				    sector_t sector)
+{
+	return sector & dev->block_mask;
+}
+
+/* Return the starting sector for @block */
+static inline u64 dmu_sector(struct dmu_device *dev,
+			     uint64_t block)
+{
+	return block << dev->block_shift;
+}
+
+/* Increase the usage count for @dev */
+static inline void get_dev(struct dmu_device *dev)
+{
+	kref_get(&dev->users);
+}
+
+/* Decrease the usage count for @dev */
+void destroy_dmu_device(struct kref *ref);
+static inline void put_dev(struct dmu_device *dev)
+{
+	kref_put(&dev->users, destroy_dmu_device);
+}
+
+int dmu_init_mappings(void);
+void dmu_cleanup_mappings(void);
+int dmu_make_mapping(struct dmu_device *dev,
+		     uint64_t org, uint64_t new, int64_t offset,
+		     struct block_device *dest, int rw);
+int dmu_map_from_mappings(struct dmu_device *dev,
+			  struct bio *bio);
+int dmu_alloc_mappings(struct dmu_mappings **m, uint32_t size);
+int dmu_remove_mapping(struct dmu_device *dev, uint64_t org);
+unsigned int dmu_remove_all_mappings(struct dmu_device *dev);
+
+#endif
diff -r 165c54942fb4 -r 5e2a821c0dff drivers/md/dm-userspace-cache.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/md/dm-userspace-cache.c	Wed Feb 21 13:33:36 2007 -0800
@@ -0,0 +1,256 @@
+/*
+ * Copyright IBM Corp., 2006
+ * Author: Dan Smith <danms@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/types.h>
+#include <linux/poll.h>
+
+#include "dm.h"
+
+#include <linux/dm-userspace.h>
+
+#include "dm-user.h"
+
+#define DM_MSG_PREFIX "dm-userspace-cache"
+
+static struct kmem_cache *map_cache;
+
+struct dmu_mappings {
+	struct list_head *table;
+	uint32_t size;
+	uint32_t count;
+	struct semaphore sem;
+};
+
+struct dmu_map {
+	struct list_head list;
+	uint64_t org_block;
+	uint64_t new_block;
+	int64_t offset;
+	struct block_device *dest_dev;
+	int rw;
+};
+
+int dmu_alloc_mappings(struct dmu_mappings **mp, uint32_t size)
+{
+	struct dmu_mappings *m;
+	int i;
+
+	(*mp) = kmalloc(sizeof(*m), GFP_KERNEL);
+	if (!(*mp)) {
+		DMERR("Failed to alloc mappings");
+		return 0;
+	}
+
+	m = *mp;
+
+	m->table = kmalloc(sizeof(struct list_head) * size, GFP_KERNEL);
+	if (!m->table) {
+		DMERR("Failed to alloc mappings table");
+		kfree(*mp);
+		*mp = NULL;
+		return 0;
+	}
+
+	m->size = size;
+	m->count = 0;
+
+	for (i = 0; i < m->size; i++)
+		INIT_LIST_HEAD(&m->table[i]);
+
+	init_MUTEX(&m->sem);
+
+	return 1;
+}
+
+int dmu_destroy_mappings(struct dmu_mappings *m)
+{
+	kfree(m->table);
+	kfree(m);
+
+	return 1;
+}
+
+static struct dmu_map *__dmu_find_mapping(struct dmu_mappings *m,
+					  uint64_t block)
+{
+	uint32_t bucket;
+	struct dmu_map *map;
+
+	bucket = ((uint32_t)block) % m->size;
+
+	list_for_each_entry(map, &m->table[bucket], list) {
+		if (map->org_block == block)
+			return map;
+	}
+
+	return NULL;
+}
+
+static void __dmu_delete_mapping(struct dmu_mappings *m,
+				 struct dmu_map *map)
+{
+	m->count--;
+	list_del(&map->list);
+	kmem_cache_free(map_cache, map);
+}
+
+static int dmu_add_mapping(struct dmu_mappings *m, 
+			   struct dmu_map *map)
+{
+	uint32_t bucket;
+	struct dmu_map *old;
+
+	down(&m->sem);
+
+	old = __dmu_find_mapping(m, map->org_block);
+	if (old)
+		__dmu_delete_mapping(m, old);
+
+	bucket = ((uint32_t)map->org_block) % m->size;
+	
+	list_add(&map->list, &m->table[bucket]);
+	m->count++;
+
+	up(&m->sem);
+
+	return 1;
+}
+
+int dmu_map_from_mappings(struct dmu_device *dev,
+			  struct bio *bio)
+{
+	struct dmu_map *map;
+	int ret = 0;
+
+	down(&dev->mappings->sem);
+
+	map = __dmu_find_mapping(dev->mappings,
+				 dmu_block(dev, bio->bi_sector));
+
+	if (map && (bio_rw(bio) == map->rw)) {
+		
+		bio->bi_sector = dmu_sector(dev, map->new_block) +
+			dmu_sector_offset(dev, bio->bi_sector) +
+			map->offset;
+		bio->bi_bdev = map->dest_dev;
+		ret = 1;
+	}
+
+	up(&dev->mappings->sem);
+
+	return ret;
+}
+
+int dmu_make_mapping(struct dmu_device *dev,
+		     uint64_t org, uint64_t new, int64_t offset,
+		     struct block_device *dest, int rw)
+{
+	struct dmu_map *map;
+
+	/* FIXME */
+	map = kmem_cache_alloc(map_cache, GFP_NOIO);
+	if (!map) {
+		DMERR("Failed to alloc mapping");
+		return 0;
+	}
+
+	INIT_LIST_HEAD(&map->list);
+
+	map->org_block = org;
+	map->new_block = new;
+	map->dest_dev = dest;
+	map->offset = offset;
+	map->rw = rw;
+
+	return dmu_add_mapping(dev->mappings, map);
+}
+
+int dmu_remove_mapping(struct dmu_device *dev,
+		       uint64_t org)
+{
+	struct dmu_map *map;
+	int ret = 0;
+
+	down(&dev->mappings->sem);
+
+	map = __dmu_find_mapping(dev->mappings, org);
+	if (map) {
+		__dmu_delete_mapping(dev->mappings, map);
+		ret = 1;
+	}
+
+	up(&dev->mappings->sem);
+
+	return ret;
+}
+
+static unsigned int __destroy_bucket(struct dmu_mappings *m,
+				     unsigned int index)
+{
+	struct dmu_map *map, *next;
+	unsigned int count = 0;
+
+	list_for_each_entry_safe(map, next, &m->table[index], list) {
+		__dmu_delete_mapping(m, map);
+		count++;
+	}
+
+	return count;
+}
+
+unsigned int dmu_remove_all_mappings(struct dmu_device *dev)
+{
+	int i;
+	unsigned int count = 0;
+
+	down(&dev->mappings->sem);
+
+	for (i = 0; i < dev->mappings->size; i++) {
+		count += __destroy_bucket(dev->mappings, i);
+	}
+	
+	up(&dev->mappings->sem);
+
+	return count;
+}
+
+int dmu_init_mappings(void)
+{
+	map_cache =
+		kmem_cache_create("dm-userspace-mappings",
+				  sizeof(struct dmu_map),
+				  __alignof__ (struct dmu_map),
+				  0, NULL, NULL);
+	if (!map_cache) {
+		DMERR("Failed to allocate map cache");
+		return 0;
+	}
+
+	return 1;
+}
+
+void dmu_cleanup_mappings(void)
+{
+	kmem_cache_destroy(map_cache);
+}
+
+
diff -r 165c54942fb4 -r 5e2a821c0dff drivers/md/dm-userspace-chardev.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/md/dm-userspace-chardev.c	Wed Feb 21 13:33:36 2007 -0800
@@ -0,0 +1,866 @@
+/*
+ * Copyright IBM Corp., 2006
+ * Author: Dan Smith <danms@us.ibm.com>
+ *
+ * (C) 2006 FUJITA Tomonori <tomof@acm.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/dm-userspace.h>
+#include <linux/list.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/mm.h>
+#include <asm/uaccess.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "kcopyd.h"
+#include "dm-user.h"
+
+#define DM_MSG_PREFIX "dm-userspace"
+
+static int count;
+
+/* This allows for a cleaner separation between the dm-userspace
+ * device-mapper target, and the userspace transport used.  Right now,
+ * only a chardev transport exists, but it's possible that there could
+ * be more in the future
+ */
+struct dmu_ring {
+	u32 r_idx;
+	unsigned long r_pages[DMU_RING_PAGES];
+	spinlock_t r_lock;
+};
+
+struct chardev_transport {
+	struct cdev cdev;
+	dev_t ctl_dev;
+	struct dmu_device *parent;
+
+	struct dmu_ring tx;
+	struct dmu_ring rx;
+
+	struct task_struct *tx_task;
+	struct task_struct *rx_task;
+
+	wait_queue_head_t tx_wqueue;
+	wait_queue_head_t rx_wqueue;
+	wait_queue_head_t poll_wait;
+
+	struct task_struct *task;
+};
+
+static inline void dmu_ring_idx_inc(struct dmu_ring *r)
+{
+	if (r->r_idx == DMU_MAX_EVENTS - 1)
+		r->r_idx = 0;
+	else
+		r->r_idx++;
+}
+
+static struct dmu_msg *dmu_head_msg(struct dmu_ring *r, u32 idx)
+{
+	u32 pidx, off;
+
+	pidx = idx / DMU_EVENT_PER_PAGE;
+	off = idx % DMU_EVENT_PER_PAGE;
+
+	return (struct dmu_msg *)
+		(r->r_pages[pidx] + sizeof(struct dmu_msg) * off);
+}
+
+static struct dmu_request *find_rx_request(struct dmu_device *dev,
+					   uint64_t id)
+{
+	struct dmu_request *req, *next, *match = NULL;
+	int count = 0;
+	struct list_head *list = &dev->rx_requests[id % DMU_CP_HASH];
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->xmit_lock, flags);
+	list_for_each_entry_safe(req, next, list, list) {
+		count++;
+		if (req->id == id) {
+			list_del_init(&req->list);
+			match = req;
+			atomic_dec(&dev->r_reqs);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&dev->xmit_lock, flags);
+
+	return match;
+}
+
+static int have_pending_requests(struct dmu_device *dev)
+{
+	return atomic_read(&dev->t_reqs) != 0;
+}
+
+static void send_userspace_message(struct dmu_msg *msg,
+				   struct dmu_request *req)
+{
+	memset(msg, 0, sizeof(*msg));
+
+	msg->hdr.id = req->id;
+
+	switch (req->type) {
+	case DM_USERSPACE_MAP_BLOCK_REQ:
+		msg->hdr.msg_type = req->type;
+		msg->payload.map_req.org_block = req->u.block;
+		dmu_cpy_flag(&msg->payload.map_req.flags,
+			     req->flags, DMU_FLAG_WR);
+		break;
+
+	case DM_USERSPACE_MAP_DONE:
+		msg->hdr.msg_type = DM_USERSPACE_MAP_DONE;
+		msg->payload.map_done.id_of_op = req->id;
+		msg->payload.map_done.org_block = req->u.block;
+		dmu_cpy_flag(&msg->payload.map_done.flags,
+			     req->flags, DMU_FLAG_WR);
+		break;
+
+	default:
+		DMWARN("Unknown outgoing message type %i", req->type);
+	}
+
+	/* If this request is not on a list (the rx_requests list),
+	 * then it needs to be freed after sending
+	 */
+	if (list_empty(&req->list)) {
+ 		INIT_WORK(&req->task, endio_worker);
+		schedule_work(&req->task);
+	}
+}
+
+static void add_rx_request(struct dmu_request *req)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&req->dev->xmit_lock, flags);
+	list_add_tail(&req->list, 
+		      &req->dev->rx_requests[req->id % DMU_CP_HASH]);
+	atomic_inc(&req->dev->r_reqs);
+	spin_unlock_irqrestore(&req->dev->xmit_lock, flags);
+}
+
+struct dmu_request *pluck_next_request(struct dmu_device *dev)
+{
+	struct dmu_request *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->xmit_lock, flags);
+	if (!list_empty(&dev->tx_requests)) {
+		req = list_entry(dev->tx_requests.next,
+				 struct dmu_request, list);
+		list_del_init(&req->list);
+
+		atomic_dec(&dev->t_reqs);
+	}
+	spin_unlock_irqrestore(&dev->xmit_lock, flags);
+
+	if (req && ((req->type == DM_USERSPACE_MAP_BLOCK_REQ) ||
+		    (req->type == DM_USERSPACE_MAP_DONE)))
+		add_rx_request(req);
+
+	return req;
+}
+
+static struct dmu_msg *get_tx_msg(struct dmu_ring *ring)
+{
+	struct dmu_msg *msg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->r_lock, flags);
+	msg = dmu_head_msg(ring, ring->r_idx);
+	if (msg->hdr.status)
+		msg = NULL;
+	else
+		dmu_ring_idx_inc(ring);
+	spin_unlock_irqrestore(&ring->r_lock, flags);
+
+	return msg;
+}
+
+static void send_tx_request(struct dmu_msg *msg, struct dmu_request *req)
+{
+	struct chardev_transport *t = req->dev->transport_private;
+
+	send_userspace_message(msg, req);
+	msg->hdr.status = 1;
+	mb();
+	flush_dcache_page(virt_to_page(msg));
+	wake_up_interruptible(&t->poll_wait);
+}
+
+/* Add a request to a device's request queue */
+void add_tx_request(struct dmu_device *dev, struct dmu_request *req)
+{
+	unsigned long flags;
+	struct chardev_transport *t = dev->transport_private;
+	struct dmu_ring *ring = &t->tx;
+	struct dmu_msg *msg;
+
+	BUG_ON(!list_empty(&req->list));
+
+	msg = get_tx_msg(ring);
+
+	if (msg) {
+		add_rx_request(req);
+		send_tx_request(msg, req);
+	} else {
+		spin_lock_irqsave(&dev->xmit_lock, flags);
+		list_add_tail(&req->list, &dev->tx_requests);
+		atomic_inc(&dev->t_reqs);
+		spin_unlock_irqrestore(&dev->xmit_lock, flags);
+
+		wake_up_interruptible(&t->tx_wqueue);
+	}
+}
+
+static int dmu_txd(void *data)
+{
+
+	struct dmu_device *dev = data;
+	struct chardev_transport *t = dev->transport_private;
+	struct dmu_ring *ring = &t->tx;
+	struct dmu_request *req = NULL;
+	struct dmu_msg *msg;
+
+	while (!kthread_should_stop()) {
+		msg = dmu_head_msg(ring, ring->r_idx);
+
+		wait_event_interruptible(t->tx_wqueue,
+					 (!msg->hdr.status &&
+					  have_pending_requests(dev)) ||
+					 kthread_should_stop());
+
+		if (kthread_should_stop())
+			break;
+
+		msg = get_tx_msg(ring);
+		if (!msg)
+			continue;
+
+		req = pluck_next_request(dev);
+		BUG_ON(!req);
+
+		send_tx_request(msg, req);
+	}
+
+	return 0;
+}
+
+static void flush_block(int read_err, unsigned int write_err, void *data)
+{
+	struct dmu_request *req = data;
+
+	if (read_err || write_err) {
+		DMERR("Failed to copy block!");
+		bio_io_error(req->bio, req->bio->bi_size);
+		return;
+	}
+
+	atomic_inc(&req->dev->f_reqs);
+	generic_make_request(req->bio);
+
+}
+
+static void copy_block(struct dmu_device *dev,
+		       struct block_device *src_dev,
+		       struct block_device *dst_dev,
+		       struct dmu_request *req,
+		       uint64_t org_block,
+		       uint64_t new_block,
+		       int64_t offset)
+{
+	struct io_region src, dst;
+
+	src.bdev = src_dev;
+	src.sector = dmu_sector(dev, org_block);
+	src.count = dev->block_size;
+
+	dst.bdev = dst_dev;
+	dst.sector = dmu_sector(dev, new_block);
+	dst.sector += offset;
+	dst.count = dev->block_size;
+
+	kcopyd_copy(dev->kcopy, &src, 1, &dst, 0, flush_block, req);
+}
+
+static int extra_end_io(struct bio *bio, unsigned int a, int b)
+{
+	struct dmu_request *req = bio->bi_private;
+
+	printk("Extra endio: %p a:%u b:%i\n", bio, a, b);
+
+	atomic_inc(&req->extra_finished);
+
+	return 0;
+}
+
+static int make_extra_requests(struct dmu_request *req)
+{
+	struct request_queue *q;
+	struct bio *bio;
+	struct dmu_extra_write *extra = NULL;
+	int len;
+	int ret;
+	int i;
+
+	q = bdev_get_queue(req->target_dev);
+	if (blk_get_queue(q)) {
+		DMERR("Failed to get queue");
+		return -EINVAL;
+	}
+
+	len = req->response.extra_count * sizeof(*extra);
+	/* FIXME: agk won't like this because we're allocating memory
+	 *         in the critical path... gotta find a better way
+	 */
+	extra = kmalloc(len, GFP_KERNEL);
+	if (!extra) {
+		DMERR("Failed to alloc extra buffer");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (copy_from_user(extra, (void *)req->response.extra_writes, len)) {
+		DMERR("Failed to copy extra writes from userspace");
+		ret = -EACCES;
+		goto out;
+	}
+	 
+	for (i = 0; i < req->response.extra_count; i++) {
+		
+		DMINFO("(%i) Creating extra write: %llu %llu",
+		       i, extra[i].buf, extra[i].len);
+		bio = bio_map_user(q, req->target_dev, 
+				   extra[i].buf, extra[i].len, 0);
+		
+		if (IS_ERR(bio)) {
+			DMERR("Failed to create extra write bio: %ld",
+			      PTR_ERR(bio));
+			ret = -EINVAL;
+			goto out;
+		}
+		
+		bio->bi_sector = extra[i].offset;
+		bio->bi_end_io = extra_end_io;
+		bio->bi_private = req;
+
+		DMINFO("Extra write: s:%lu l:%llu (%s)", 
+		       bio->bi_sector, extra->len,
+		       req->target_dev->bd_disk->disk_name);
+		
+		bio_list_add(&req->extra_bios, bio);
+	}
+
+	ret = 0;
+
+ out:
+	kfree(extra);
+	blk_put_queue(q);
+
+	return ret;
+}
+
+static void map_worker(struct work_struct *work)
+{
+	struct dmu_request *req;
+	struct dmu_msg_map_response *msg;
+	struct dmu_device *dev;
+	struct target_device *src_dev;
+	struct chardev_transport *t;
+
+	req = container_of(work, struct dmu_request, task);
+	msg = &req->response;
+	dev = req->dev;
+	t = dev->transport_private;
+
+	if (dmu_get_flag(&msg->flags, DMU_FLAG_COPY_FIRST)) {
+		src_dev = find_target(dev, MKDEV(msg->src_maj, msg->src_min));
+		if (!src_dev) {
+			DMERR("Failed to find src device %i:%i\n",
+			      msg->src_maj, msg->src_min);
+			goto fail;
+		}
+	} else
+		src_dev = NULL;
+
+	/* Remap the bio */
+	req->bio->bi_sector = dmu_sector(dev, msg->new_block) +
+		dmu_sector_offset(dev, req->bio->bi_sector) +
+		msg->offset;
+	req->bio->bi_bdev = req->target_dev;
+
+	dmu_cpy_flag(&req->flags, msg->flags, DMU_FLAG_SYNC);
+
+	if (dmu_get_flag(&msg->flags, DMU_FLAG_COPY_FIRST))
+		copy_block(dev, src_dev->bdev, req->target_dev, req,
+			   req->u.block, msg->new_block,
+			   msg->offset);
+	else
+		flush_block(0, 0, req);
+
+	return;
+
+ fail:
+	bio_io_error(req->bio, req->bio->bi_size);
+}
+
+static void do_make_mapping(struct dmu_device *dev,
+			    struct dmu_msg_make_mapping *msg)
+{
+	struct target_device *target;
+
+	target = find_target(dev, MKDEV(msg->dev_maj, msg->dev_min));
+	if (!target) {
+		DMERR("Failed to find target device %i:%i\n",
+		      msg->dev_maj, msg->dev_min);
+		return;
+	}
+
+	dmu_make_mapping(dev, msg->org_block, msg->new_block, msg->offset,
+			 target->bdev, dmu_get_flag(&msg->flags, DMU_FLAG_WR));
+}
+
+static void do_kill_mapping(struct dmu_device *dev,
+			    struct dmu_msg_make_mapping *msg)
+{
+	if (!dmu_remove_mapping(dev, msg->org_block))
+		DMERR("Tried to remove non-existent mapping for %llu",
+		      msg->org_block);
+}
+
+static void do_map_bio(struct dmu_device *dev,
+		       struct dmu_msg_map_response *msg)
+{
+	struct dmu_request *req;
+	struct target_device *dst_dev;
+
+	req = find_rx_request(dev, msg->id_of_req);
+	if (!req) {
+		DMERR("Unable to complete unknown map: %llu\n",
+		      (unsigned long long) msg->id_of_req);
+		return;
+	}
+
+	/* Go ahead and hook up the target device*/
+	dst_dev = find_target(dev, MKDEV(msg->dst_maj, msg->dst_min));
+	if (!dst_dev) {
+		DMERR("Failed to find dest device %i:%i\n",
+		      msg->dst_maj, msg->dst_min);
+		goto fail;
+	}
+
+	req->target_dev = dst_dev->bdev;
+
+	memcpy(&req->response, msg, sizeof(req->response));
+
+	if (req->response.extra_count) {
+		make_extra_requests(req);
+	}
+
+	INIT_WORK(&req->task, map_worker);
+	schedule_work(&req->task);
+
+	return;
+
+ fail:
+	bio_io_error(req->bio, req->bio->bi_size);
+}
+
+static void do_map_done(struct dmu_device *dev, uint64_t id_of_op, int fail)
+{
+	struct dmu_request *req;
+
+	req = find_rx_request(dev, id_of_op);
+	if (!req) {
+		DMERR("Unable to complete unknown request: %llu\n",
+		      (unsigned long long) id_of_op);
+		return;
+	}
+
+	dmu_clr_flag(&req->flags, DMU_FLAG_SYNC);
+
+	req->bio->bi_end_io(req->bio, req->bio->bi_size, fail);
+}
+
+static void do_map_failed(struct dmu_device *dev, uint64_t id_of_op)
+{
+	struct dmu_request *req;
+
+	req = find_rx_request(dev, id_of_op);
+	if (!req) {
+		DMERR("Unable to fail unknown request: %llu\n",
+		      (unsigned long long) id_of_op);
+		return;
+	}
+
+	DMERR("Userspace failed to map id %llu (sector %llu)",
+	      (unsigned long long) id_of_op,
+	      (unsigned long long) req->bio->bi_sector);
+
+	bio_io_error(req->bio, req->bio->bi_size);
+
+	mempool_free(req, request_pool);
+}
+
+static int dmu_rxd(void *data)
+{
+	struct dmu_device *dev = (struct dmu_device *) data;
+	struct chardev_transport *t = dev->transport_private;
+	struct dmu_ring *ring = &t->rx;
+	struct dmu_msg *msg;
+
+	while (1) {
+		msg = dmu_head_msg(ring, ring->r_idx);
+		/* do we need this? */
+		flush_dcache_page(virt_to_page(msg));
+
+		if (!msg->hdr.status)
+			break;
+		
+		switch (msg->hdr.msg_type) {
+		case DM_USERSPACE_MAP_BLOCK_RESP:
+			do_map_bio(dev, &msg->payload.map_rsp);
+			break;
+
+		case DM_USERSPACE_MAP_FAILED:
+			do_map_failed(dev, msg->payload.map_rsp.id_of_req);
+			break;
+
+		case DM_USERSPACE_MAP_DONE:
+			do_map_done(dev, msg->payload.map_done.id_of_op, 0);
+			break;
+
+		case DM_USERSPACE_MAP_DONE_FAILED:
+			do_map_done(dev, msg->payload.map_done.id_of_op, 1);
+			break;
+
+		case DM_USERSPACE_MAKE_MAPPING:
+			do_make_mapping(dev, &msg->payload.make_mapping);
+			break;
+
+		case DM_USERSPACE_KILL_MAPPING:
+			do_kill_mapping(dev, &msg->payload.make_mapping);
+			break;
+
+		default:
+			DMWARN("Unknown incoming request type: %i",
+			       msg->hdr.msg_type);
+		}
+
+		msg->hdr.status = 0;
+		dmu_ring_idx_inc(ring);
+	}
+
+	return 0;
+}
+
+ssize_t dmu_ctl_write(struct file *file, const char __user *buffer,
+		      size_t size, loff_t *offset)
+{
+	struct dmu_device *dev = (struct dmu_device *)file->private_data;
+	struct chardev_transport *t = dev->transport_private;
+
+	wake_up(&t->tx_wqueue);
+
+	dmu_rxd(dev);
+
+	return size;
+}
+
+static void dmu_ring_free(struct dmu_ring *r)
+{
+	int i;
+	for (i = 0; i < DMU_RING_PAGES; i++) {
+		if (!r->r_pages[i])
+			break;
+		free_page(r->r_pages[i]);
+		r->r_pages[i] = 0;
+	}
+}
+
+static int dmu_ring_alloc(struct dmu_ring *r)
+{
+	int i;
+
+	r->r_idx = 0;
+	spin_lock_init(&r->r_lock);
+
+	for (i = 0; i < DMU_RING_PAGES; i++) {
+		r->r_pages[i] = get_zeroed_page(GFP_KERNEL);
+		if (!r->r_pages[i])
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+int dmu_ctl_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct chardev_transport *t;
+	struct dmu_device *dev;
+
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+
+	t = container_of(inode->i_cdev, struct chardev_transport, cdev);
+	dev = t->parent;
+
+	t->task = current;
+
+	init_waitqueue_head(&t->poll_wait);
+	init_waitqueue_head(&t->tx_wqueue);
+	init_waitqueue_head(&t->rx_wqueue);
+
+	ret = dmu_ring_alloc(&t->tx);
+	if (ret)
+		return -ENOMEM;
+
+	ret = dmu_ring_alloc(&t->rx);
+	if (ret)
+		goto free_tx;
+
+	t->tx_task = kthread_run(dmu_txd, dev, "%s_tx", DM_MSG_PREFIX);
+	if (IS_ERR(t->tx_task)) {
+		ret = PTR_ERR(t->tx_task);
+		goto free_rx;
+	}
+
+	t->rx_task = kthread_run(dmu_rxd, dev, "%s_rx", DM_MSG_PREFIX);
+	if (IS_ERR(t->rx_task)) {
+		ret = PTR_ERR(t->rx_task);
+		goto destroy_tx_task;
+	}
+
+	get_dev(dev);
+
+	file->private_data = dev;
+
+	return 0;
+ destroy_tx_task:
+	kthread_stop(t->tx_task);
+free_rx:
+	dmu_ring_free(&t->rx);
+free_tx:
+	dmu_ring_free(&t->tx);
+	return ret;
+}
+
+int dmu_ctl_release(struct inode *inode, struct file *file)
+{
+	struct dmu_device *dev = (struct dmu_device *)file->private_data;
+	struct chardev_transport *t = dev->transport_private;
+
+	kthread_stop(t->rx_task);
+	kthread_stop(t->tx_task);
+
+	dmu_ring_free(&t->rx);
+	dmu_ring_free(&t->tx);
+
+	/* Stop taking requests when there is no userspace to service them */
+	dev->request_slots = 0;
+
+	put_dev(dev);
+
+	return 0;
+}
+
+unsigned dmu_ctl_poll(struct file *file, poll_table *wait)
+{
+	struct dmu_device *dev = (struct dmu_device *)file->private_data;
+	struct chardev_transport *t = dev->transport_private;
+	struct dmu_ring *ring = &t->tx;
+	struct dmu_msg *msg;
+	unsigned mask = 0;
+	u32 idx;
+	unsigned long flags;
+
+	poll_wait(file, &t->poll_wait, wait);
+
+	spin_lock_irqsave(&ring->r_lock, flags);
+
+	idx = ring->r_idx ? ring->r_idx - 1 : DMU_MAX_EVENTS - 1;
+	msg = dmu_head_msg(ring, idx);
+	if (msg->hdr.status)
+		mask |= POLLIN | POLLRDNORM;
+
+	spin_unlock_irqrestore(&ring->r_lock, flags);
+
+	return mask;
+}
+
+static int dmu_ring_map(struct vm_area_struct *vma, unsigned long addr,
+			struct dmu_ring *ring)
+{
+	int i, err;
+
+	for (i = 0; i < DMU_RING_PAGES; i++) {
+		struct page *page = virt_to_page(ring->r_pages[i]);
+		err = vm_insert_page(vma, addr, page);
+		if (err)
+			return err;
+		addr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+static int dmu_ctl_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct dmu_device *dev = (struct dmu_device *)file->private_data;
+	struct chardev_transport *t = dev->transport_private;
+	unsigned long addr;
+	int err;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start != DMU_RING_SIZE * 2) {
+		DMERR("mmap size must be %lu, not %lu \n",
+			DMU_RING_SIZE * 2, vma->vm_end - vma->vm_start);
+		return -EINVAL;
+	}
+
+	addr = vma->vm_start;
+	err = dmu_ring_map(vma, addr, &t->tx);
+	if (err)
+		return err;
+	err = dmu_ring_map(vma, addr + DMU_RING_SIZE, &t->rx);
+
+	/* Open the gates and wake anyone waiting */
+	/* FIXME: Magic number */
+	dev->request_slots = 20000;
+	wake_up_interruptible(&dev->lowmem);
+
+	return err;
+}
+
+static struct file_operations ctl_fops = {
+	.open    = dmu_ctl_open,
+	.release = dmu_ctl_release,
+	.write   = dmu_ctl_write,
+	.mmap    = dmu_ctl_mmap,
+	.poll    = dmu_ctl_poll,
+	.owner   = THIS_MODULE,
+};
+
+static int get_free_minor(void)
+{
+	struct dmu_device *dev;
+	int minor = 0;
+
+	spin_lock(&devices_lock);
+
+	while (1) {
+		list_for_each_entry(dev, &devices, list) {
+			struct chardev_transport *t = dev->transport_private;
+			if (MINOR(t->ctl_dev) == minor)
+				goto dupe;
+		}
+		break;
+	dupe:
+		minor++;
+	}
+
+	spin_unlock(&devices_lock);
+
+	return minor;
+}
+
+int register_chardev_transport(struct dmu_device *dev)
+{
+	struct chardev_transport *t;
+	int ret;
+
+	dev->transport_private = kmalloc(sizeof(struct chardev_transport),
+					 GFP_KERNEL);
+	t = dev->transport_private;
+
+	if (!t) {
+		DMERR("Failed to allocate chardev transport");
+		goto bad;
+	}
+
+	t->ctl_dev = MKDEV(MAJOR(dmu_dev), get_free_minor());
+	t->parent = dev;
+
+	cdev_init(&t->cdev, &ctl_fops);
+	t->cdev.owner = THIS_MODULE;
+	t->cdev.ops = &ctl_fops;
+
+	ret = cdev_add(&t->cdev, t->ctl_dev, 1);
+	if (ret < 0) {
+		DMERR("Failed to register control device %d:%d",
+		       MAJOR(t->ctl_dev), MINOR(t->ctl_dev));
+		goto bad;
+	}
+
+	return 1;
+
+ bad:
+	kfree(t);
+	return 0;
+}
+
+void unregister_chardev_transport(struct dmu_device *dev)
+{
+	struct chardev_transport *t = dev->transport_private;
+
+	cdev_del(&t->cdev);
+	kfree(t);
+}
+
+int init_chardev_transport(void)
+{
+	int r;
+
+	count = 0;
+
+	r = alloc_chrdev_region(&dmu_dev, 0, 10, "dm-userspace");
+	if (r) {
+		DMERR("Failed to allocate chardev region");
+		return 0;
+	} else
+		return 1;
+}
+
+void cleanup_chardev_transport(void)
+{
+	unregister_chrdev_region(dmu_dev, 10);
+}
+
+void write_chardev_transport_info(struct dmu_device *dev,
+			char *buf, unsigned int maxlen)
+{
+	struct chardev_transport *t = dev->transport_private;
+
+	snprintf(buf, maxlen, "%x:%x",
+		 MAJOR(t->ctl_dev), MINOR(t->ctl_dev));
+}
diff -r 165c54942fb4 -r 5e2a821c0dff drivers/md/dm-userspace.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/md/dm-userspace.c	Wed Feb 21 13:33:36 2007 -0800
@@ -0,0 +1,613 @@
+/*
+ * Copyright IBM Corp., 2006
+ * Author: Dan Smith <danms@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/types.h>
+#include <linux/poll.h>
+
+#include <linux/dm-userspace.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "kcopyd.h"
+#include "dm-user.h"
+
+#define DMU_COPY_PAGES     256
+
+#define DM_MSG_PREFIX     "dm-userspace"
+
+struct kmem_cache *request_cache;
+mempool_t *request_pool;
+
+spinlock_t devices_lock;
+LIST_HEAD(devices);
+
+/* Device number for the control device */
+dev_t dmu_dev;
+
+void endio_worker(struct work_struct *work)
+{
+	struct dmu_request *req;
+	struct dmu_device *dev;
+	int extra_finished = 0;
+
+	req = container_of(work, struct dmu_request, task);
+	dev  = req->dev;
+
+	/*
+	 * 1. If there are unsubmitted extra writes, do those
+	 *    and reschedule
+	 * 2. If FLAG_SYNC, send SYNC to userspace and do not reschedule
+	 * 3. if not on list and extra completed, destroy
+	 * 4. Reschedule
+	 */
+
+	spin_lock(&dev->lock);
+
+	if (req->extra_bios.head) {
+		struct bio *bio;
+
+		while ((bio = bio_list_pop(&req->extra_bios))) {
+			DMINFO("Submitting extra bio: %p", bio);
+			generic_make_request(bio);
+		}
+
+		goto resched;
+	}
+
+	if (dmu_get_flag(&req->flags, DMU_FLAG_SYNC)) {
+		req->type = DM_USERSPACE_MAP_DONE;
+		add_tx_request(req->dev, req);
+		
+		goto out;
+	}
+
+	if (atomic_read(&req->extra_finished) == req->response.extra_count)
+		extra_finished = 1;
+	else {
+		/* FIXME: Remove */
+		DMINFO("My extra bios haven't finished yet: %i != %llu",
+		       atomic_read(&req->extra_finished),
+		       req->response.extra_count);
+	}
+
+	if (list_empty(&req->list) && 
+	    list_empty(&req->copy) && 
+	    extra_finished) {
+		mempool_free(req, request_pool);
+		atomic_dec(&dev->f_reqs);
+		atomic_dec(&dev->total);
+		wake_up_interruptible(&dev->lowmem);
+		
+		goto out;
+	}
+
+ resched:
+	PREPARE_WORK(&req->task, endio_worker);
+	schedule_work(&req->task);
+ out:
+	spin_unlock(&dev->lock);
+}
+
+/* Return an already-bound target device */
+struct target_device *find_target(struct dmu_device *dev,
+					 dev_t devno)
+{
+	struct target_device *target, *match = NULL;
+
+	spin_lock(&dev->lock);
+	list_for_each_entry(target, &dev->target_devs, list) {
+		if (target->bdev->bd_dev == devno) {
+			match = target;
+			break;
+		}
+	}
+	spin_unlock(&dev->lock);
+
+	return match;
+}
+
+/* Find a new target device and bind it to our device */
+static struct target_device *get_target(struct dmu_device *dev,
+					dev_t devno)
+{
+	struct target_device *target;
+	struct block_device *bdev;
+
+	target = find_target(dev, devno);
+	if (target)
+		return target;
+
+	bdev = open_by_devnum(devno, FMODE_READ | FMODE_WRITE);
+	if (IS_ERR(bdev)) {
+		DMERR("Unable to lookup device %x", devno);
+		return NULL;
+	}
+
+	target = kmalloc(sizeof(*target), GFP_KERNEL);
+	if (!target) {
+		DMERR("Unable to alloc new target device");
+		return NULL;
+	}
+
+	target->bdev = bdev;
+	INIT_LIST_HEAD(&target->list);
+
+	if (in_interrupt())
+		DMERR("%s in irq\n", __FUNCTION__);
+
+	spin_lock(&dev->lock);
+	list_add_tail(&target->list, &dev->target_devs);
+	spin_unlock(&dev->lock);
+
+	return target;
+}
+
+/* Caller must hold dev->lock */
+static void put_target(struct dmu_device *dev,
+		       struct target_device *target)
+{
+	list_del(&target->list);
+
+	bd_release(target->bdev);
+	blkdev_put(target->bdev);
+
+	kfree(target);
+}
+
+void destroy_dmu_device(struct kref *ref)
+{
+	struct dmu_device *dev;
+	struct list_head *cursor, *next;
+	int i;
+
+	dev = container_of(ref, struct dmu_device, users);
+
+	spin_lock(&devices_lock);
+	list_del(&dev->list);
+	spin_unlock(&devices_lock);
+
+	list_for_each_safe(cursor, next, &dev->target_devs) {
+		struct target_device *target;
+
+		target = list_entry(cursor,
+				    struct target_device,
+				    list);
+
+		put_target(dev, target);
+	}
+
+	list_for_each_safe(cursor, next, &dev->tx_requests) {
+		struct dmu_request *req;
+
+		req = list_entry(cursor,
+				 struct dmu_request,
+				 list);
+
+		DMERR("Failing unsent bio");
+		bio_io_error(req->bio, req->bio->bi_size);
+
+		list_del(&req->list);
+
+		mempool_free(req, request_pool);
+	}
+
+	for (i = 0; i < DMU_CP_HASH; i++) {
+		list_for_each_safe(cursor, next, &dev->rx_requests[i]) {
+			struct dmu_request *req;
+
+			req = list_entry(cursor,
+					 struct dmu_request,
+					 list);
+
+			DMERR("Failing bio");
+			req->flags = 0;
+			bio_io_error(req->bio, req->bio->bi_size);
+
+			list_del(&req->list);
+
+			mempool_free(req, request_pool);
+		}
+	}
+
+	dmu_remove_all_mappings(dev);
+
+	kcopyd_client_destroy(dev->kcopy);
+	unregister_chardev_transport(dev);
+
+	kfree(dev);
+}
+
+static int init_dmu_device(struct dmu_device *dev, u32 block_size)
+{
+	int ret, i;
+
+	init_waitqueue_head(&dev->lowmem);
+	INIT_LIST_HEAD(&dev->list);
+	INIT_LIST_HEAD(&dev->target_devs);
+	kref_init(&dev->users);
+	spin_lock_init(&dev->lock);
+	spin_lock_init(&dev->xmit_lock);
+
+	INIT_LIST_HEAD(&dev->tx_requests);
+
+	dev->rx_requests = kmalloc(sizeof(struct list_head) * DMU_CP_HASH,
+				   GFP_KERNEL);
+	if (!dev->rx_requests) {
+		DMERR("Failed to alloc RX hash\n");
+		return 0;
+	}
+
+	for (i = 0; i < DMU_CP_HASH; i++)
+		INIT_LIST_HEAD(&dev->rx_requests[i]);
+
+	dev->block_size  = block_size;
+	dev->block_mask  = block_size - 1;
+	dev->block_shift = ffs(block_size) - 1;
+
+	atomic_set(&dev->t_reqs, 0);
+	atomic_set(&dev->r_reqs, 0);
+	atomic_set(&dev->f_reqs, 0);
+	atomic_set(&dev->total, 0);
+	atomic_set(&dev->idcounter, 0);
+
+	dmu_alloc_mappings(&dev->mappings, 2048);
+
+	ret = kcopyd_client_create(DMU_COPY_PAGES, &dev->kcopy);
+	if (ret) {
+		DMERR("Failed to initialize kcopyd client");
+		return 0;
+	}
+
+	dev->request_slots = 0; /* Unable to queue reqs right away */
+
+	return 1;
+}
+
+static struct dmu_device *new_dmu_device(char *key,
+					 struct dm_target *ti,
+					 u32 block_size)
+{
+	struct dmu_device *dev;
+	int                ret;
+
+	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		DMERR("Failed to allocate new userspace device");
+		return NULL;
+	}
+
+	if (!init_dmu_device(dev, block_size))
+		goto bad1;
+
+	snprintf(dev->key, DMU_KEY_LEN, "%s", key);
+
+	ret = register_chardev_transport(dev);
+	if (!ret)
+		goto bad2;
+
+	spin_lock(&devices_lock);
+	list_add(&dev->list, &devices);
+	spin_unlock(&devices_lock);
+
+	return dev;
+
+ bad2:
+	kcopyd_client_destroy(dev->kcopy);
+	kfree(dev->rx_requests);
+ bad1:
+	kfree(dev);
+	DMERR("Failed to create device");
+	return NULL;
+}
+
+static struct dmu_device *find_dmu_device(const char *key)
+{
+	struct dmu_device *dev;
+	struct dmu_device *match = NULL;
+
+	spin_lock(&devices_lock);
+
+	list_for_each_entry(dev, &devices, list) {
+		spin_lock(&dev->lock);
+		if (strncmp(dev->key, key, DMU_KEY_LEN) == 0) {
+			match = dev;
+			spin_unlock(&dev->lock);
+			break;
+		}
+		spin_unlock(&dev->lock);
+	}
+
+	spin_unlock(&devices_lock);
+
+	return match;
+}
+
+static int dmu_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	uint64_t block_size;
+	struct dmu_device *dev;
+	char *device_key;
+	char *block_size_param;
+	int target_idx = 2;
+
+	if (argc < 3) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	device_key = argv[0];
+	block_size_param = argv[1];
+
+	block_size = simple_strtoul(block_size_param, NULL, 10) / 512;
+
+	dev = find_dmu_device(device_key);
+	if (!dev) {
+		dev = new_dmu_device(device_key, ti, block_size);
+		if (!dev) {
+			ti->error = "Failed to create device";
+			goto bad;
+		}
+	} else
+		get_dev(dev);
+
+	spin_lock(&dev->lock);
+	if (dev->block_size != block_size) {
+		ti->error = "Invalid block size";
+		goto bad;
+	}
+	spin_unlock(&dev->lock);
+
+	/* Resolve target devices */
+	do {
+		int maj, min;
+		sscanf(argv[target_idx], "%i:%i", &maj, &min);
+		if (!get_target(dev, MKDEV(maj, min))) {
+			DMERR("Failed to find target device %i:%i (%s)",
+			      maj, min, argv[target_idx]);
+			goto out;
+		}
+	} while (++target_idx < argc);
+
+	ti->private  = dev;
+	ti->split_io = block_size;
+
+	return 0;
+
+ bad:
+	if (dev)
+		spin_unlock(&dev->lock);
+ out:
+	if (dev)
+		put_dev(dev);
+
+	return -EINVAL;
+}
+
+static void dmu_dtr(struct dm_target *ti)
+{
+	struct dmu_device *dev = (struct dmu_device *) ti->private;
+
+	put_dev(dev);
+}
+
+static void init_req(struct dmu_device *dev,
+		     struct bio *bio,
+		     struct dmu_request *req)
+{
+	req->id = (uint64_t) atomic_add_return(1, &dev->idcounter);
+
+	req->type = DM_USERSPACE_MAP_BLOCK_REQ;
+	req->dev = dev;
+	req->bio = bio;
+	req->u.block = dmu_block(dev, bio->bi_sector);
+	req->flags = 0;
+	INIT_LIST_HEAD(&req->deps);
+	INIT_LIST_HEAD(&req->list);
+	INIT_LIST_HEAD(&req->copy);
+	bio_list_init(&req->extra_bios);
+	atomic_set(&req->extra_finished, 0);
+
+	if (bio_rw(bio))
+		dmu_set_flag(&req->flags, DMU_FLAG_WR);
+}
+
+static int dmu_map(struct dm_target *ti, struct bio *bio,
+		   union map_info *map_context)
+{
+	struct dmu_device *dev = (struct dmu_device *) ti->private;
+	struct dmu_request *req;
+
+	if (unlikely(bio_barrier(bio))) {
+		DMINFO("Refusing bio barrier\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (dmu_map_from_mappings(dev, bio)) {
+		map_context->ptr = NULL;
+		return 1;
+	}
+
+	wait_event_interruptible(dev->lowmem,
+				 atomic_read(&dev->total) < 
+				 dev->request_slots);
+
+	req = mempool_alloc(request_pool, GFP_NOIO);
+	if (!req) {
+		DMERR("Failed to alloc request");
+		return -1;
+	}
+
+	atomic_inc(&dev->total);
+
+	map_context->ptr = req;
+
+	init_req(dev, bio, req);
+
+	add_tx_request(dev, req);
+
+	return 0;
+}
+
+static int dmu_status(struct dm_target *ti, status_type_t type,
+		      char *result, unsigned int maxlen)
+{
+	struct dmu_device *dev = (struct dmu_device *) ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		write_chardev_transport_info(dev, result, maxlen);
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %llu",
+			 dev->key,
+			 (unsigned long long) dev->block_size * 512);
+		break;
+	}
+
+	return 0;
+}
+
+static int dmu_end_io(struct dm_target *ti, struct bio *bio,
+                        int error, union map_info *map_context)
+{
+	struct dmu_request *req = map_context->ptr;
+	int ret = 0;
+
+	if (error)
+		return -1;
+
+	if (!req)
+		return 0;
+
+	if (dmu_get_flag(&req->flags, DMU_FLAG_SYNC)) {
+		req->type = DM_USERSPACE_MAP_DONE;
+		add_tx_request(req->dev, req);
+		ret = 1;
+	} else {
+		INIT_WORK(&req->task, endio_worker);
+		schedule_work(&req->task);
+	}
+
+	return ret;
+}
+
+struct target_type userspace_target = {
+	.name    = "userspace",
+	.version = {0, 1, 0},
+	.module  = THIS_MODULE,
+	.ctr     = dmu_ctr,
+	.dtr     = dmu_dtr,
+	.map     = dmu_map,
+	.status  = dmu_status,
+	.end_io  = dmu_end_io
+};
+
+int __init dm_userspace_init(void)
+{
+	int r = dm_register_target(&userspace_target);
+	if (r < 0) {
+		DMERR("Register failed %d", r);
+		return r;
+	}
+
+	spin_lock_init(&devices_lock);
+
+	request_cache =
+		kmem_cache_create("dm-userspace-requests",
+				  sizeof(struct dmu_request),
+				  __alignof__ (struct dmu_request),
+				  0, NULL, NULL);
+	if (!request_cache) {
+		DMERR("Failed to allocate request cache");
+		goto bad;
+	}
+
+	request_pool = mempool_create(64,
+				      mempool_alloc_slab, mempool_free_slab,
+				      request_cache);
+	if (!request_pool) {
+		DMERR("Failed to allocate request pool");
+		goto bad2;
+	}
+
+	r = dmu_init_mappings();
+	if (!r)
+		goto bad3;
+
+	r = init_chardev_transport();
+	if (!r)
+		goto bad4;
+
+	return 0;
+ bad4:
+	dmu_cleanup_mappings();
+ bad3:
+	mempool_destroy(request_pool);
+ bad2:
+	kmem_cache_destroy(request_cache);
+ bad:
+	dm_unregister_target(&userspace_target);
+
+	return -ENOMEM;
+}
+
+void __exit dm_userspace_exit(void)
+{
+	int r;
+	struct list_head *cursor, *next;
+	struct dmu_device *dev;
+
+	/* No new devices can appear at module exit, and
+	 * destroy_dmu_device() takes devices_lock itself, so walk the
+	 * list unlocked here.
+	 */
+	list_for_each_safe(cursor, next, &devices) {
+		dev = list_entry(cursor, struct dmu_device, list);
+		DMERR("Destroying hanging device %s", dev->key);
+		destroy_dmu_device(&dev->users);
+	}
+
+	cleanup_chardev_transport();
+
+	mempool_destroy(request_pool);
+	kmem_cache_destroy(request_cache);
+
+	dmu_cleanup_mappings();
+
+	r = dm_unregister_target(&userspace_target);
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_userspace_init);
+module_exit(dm_userspace_exit);
+
+MODULE_DESCRIPTION(DM_NAME " userspace target");
+MODULE_AUTHOR("Dan Smith");
+MODULE_LICENSE("GPL");
diff -r 165c54942fb4 -r 5e2a821c0dff include/linux/dm-userspace.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/dm-userspace.h	Wed Feb 21 13:33:36 2007 -0800
@@ -0,0 +1,139 @@
+/*
+ * Copyright IBM Corp., 2006
+ * Author: Dan Smith <danms@us.ibm.com>
+ *
+ * This file is released under the LGPL
+ *
+ */
+
+#ifndef __DM_USERSPACE_H
+#define __DM_USERSPACE_H
+
+#include <linux/types.h>
+
+/*
+ * Message Types
+ */
+#define DM_USERSPACE_MAP_BLOCK_REQ    1
+#define DM_USERSPACE_MAP_BLOCK_RESP   2
+#define DM_USERSPACE_MAP_FAILED       3
+#define DM_USERSPACE_MAP_DONE         4
+#define DM_USERSPACE_MAP_DONE_FAILED  5
+#define DM_USERSPACE_MAKE_MAPPING     6
+#define DM_USERSPACE_KILL_MAPPING     7
+
+/*
+ * Flags and associated macros
+ */
+#define DMU_FLAG_VALID       1
+#define DMU_FLAG_WR          2
+#define DMU_FLAG_COPY_FIRST  4
+#define DMU_FLAG_SYNC        8
+
+/*
+ * Message status values
+ */
+#define DMU_MSG_INACTIVE 0
+#define DMU_MSG_ACTIVE   1
+#define DMU_MSG_NEEDSATT 2
+
+static inline int dmu_get_flag(uint32_t *flags, uint32_t flag)
+{
+	return (*flags & flag) != 0;
+}
+
+static inline void dmu_set_flag(uint32_t *flags, uint32_t flag)
+{
+	*flags |= flag;
+}
+
+static inline void dmu_clr_flag(uint32_t *flags, uint32_t flag)
+{
+	*flags &= (~flag);
+}
+
+static inline void dmu_cpy_flag(uint32_t *flags, uint32_t src, uint32_t flag)
+{
+	*flags = (*flags & ~flag) | (src & flag);
+}
+
+/*
+ * This message header is sent in front of every message, in both
+ * directions
+ */
+struct dmu_msg_header {
+	uint64_t id;
+	uint32_t msg_type;
+	uint32_t payload_len;
+	uint32_t status;
+	uint32_t padding;
+};
+
+/* DM_USERSPACE_MAP_DONE
+ * DM_USERSPACE_MAP_DONE_FAILED
+ */
+struct dmu_msg_map_done {
+	uint64_t id_of_op;
+	uint64_t org_block;
+	uint32_t flags;
+};
+
+/* DM_USERSPACE_MAP_BLOCK_REQ */
+struct dmu_msg_map_request {
+	uint64_t org_block;
+
+	uint32_t flags;
+};
+
+struct dmu_msg_make_mapping {
+	uint64_t org_block;
+	uint64_t new_block;
+	int64_t offset;
+	uint32_t dev_maj;
+	uint32_t dev_min;
+	uint32_t flags;
+};
+
+struct dmu_extra_write {
+	uint64_t buf;
+	uint64_t offset;
+	uint64_t len;
+};
+
+/* DM_USERSPACE_MAP_BLOCK_RESP
+ * DM_USERSPACE_MAP_FAILED
+ */
+struct dmu_msg_map_response {
+	uint64_t new_block;
+	int64_t offset;
+
+	uint64_t extra_writes;
+	uint64_t extra_count;
+
+	uint64_t id_of_req;
+	uint32_t flags;
+
+	uint32_t src_maj;
+	uint32_t src_min;
+
+	uint32_t dst_maj;
+	uint32_t dst_min;
+};
+
+/* A full message */
+struct dmu_msg {
+	struct dmu_msg_header hdr;
+	union {
+		struct dmu_msg_map_done map_done;
+		struct dmu_msg_map_request map_req;
+		struct dmu_msg_map_response map_rsp;
+		struct dmu_msg_make_mapping make_mapping;
+	} payload;
+};
+
+#define DMU_RING_SIZE (1UL << 16)
+#define DMU_RING_PAGES (DMU_RING_SIZE >> PAGE_SHIFT)
+#define DMU_EVENT_PER_PAGE (PAGE_SIZE / sizeof(struct dmu_msg))
+#define DMU_MAX_EVENTS (DMU_EVENT_PER_PAGE * DMU_RING_PAGES)
+
+#endif