All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] osdblk: a Linux block device for OSD objects
@ 2009-04-02  1:54 Jeff Garzik
  2009-04-02  2:05 ` Jeff Garzik
                   ` (5 more replies)
  0 siblings, 6 replies; 24+ messages in thread
From: Jeff Garzik @ 2009-04-02  1:54 UTC (permalink / raw)
  To: LKML, linux-scsi; +Cc: linux-fsdevel, axboe, Andrew Morton


As I promised in older exofs threads, here is a client for libosd
_other_ than exofs.  This block driver exports a single OSD object
as a Linux block device.

See the comment block at the top of the driver for usage instructions.



 drivers/block/Kconfig  |   16 +
 drivers/block/Makefile |    1 
 drivers/block/osdblk.c |  563 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 580 insertions(+)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e7b8aa0..ff46b0e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -298,6 +298,22 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
+config BLK_DEV_OSD
+	tristate "OSD object-as-blkdev support"
+	depends on SCSI_OSD_INITIATOR
+	---help---
+	  Saying Y or M here will allow the exporting of a single SCSI
+	  OSD (object-based storage) object as a Linux block device.
+
+	  For example, if you create a 2G object on an OSD device,
+	  you can then use this module to present that 2G object as
+	  a Linux block device.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called osdblk.
+
+	  If unsure, say N.
+
 config BLK_DEV_SX8
 	tristate "Promise SATA SX8 support"
 	depends on PCI
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 3145141..859bf5d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
 obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
new file mode 100644
index 0000000..d3a2fb5
--- /dev/null
+++ b/drivers/block/osdblk.c
@@ -0,0 +1,563 @@
+
+/*
+   osdblk.c -- Export a single SCSI OSD object as a Linux block device
+
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing OSD object.
+
+      In this example, we will use partition id 1234, object id 5678,
+      OSD device /dev/osd1.
+
+      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
+
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate OSD objects.
+
+      $ cat /sys/class/osdblk/list
+      0 174 1234 5678 /dev/osd1
+      1 179 1994 897123 /dev/osd0
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - OSD object partition id
+      - OSD object id
+      - OSD device
+
+
+   3) Remove an active blkdev<->object mapping.
+
+      $ echo 1 > /sys/class/osdblk/remove
+
+
+   NOTE:  The actual creation and deletion of OSD objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_sec.h>
+
+#define DRV_NAME "osdblk"
+#define PFX DRV_NAME ": "
+
+struct osdblk_device;
+
+enum {
+	OSDBLK_MAX_DEVS		= 64,
+	OSDBLK_MINORS_PER_MAJOR	= 256,
+	OSDBLK_MAX_REQ		= 32,
+	OSDBLK_OP_TIMEOUT	= 4 * 60,
+};
+
+struct osdblk_request {
+	struct request		*rq;
+	struct bio		*bio;
+	struct osdblk_device	*osdev;
+	int			tag;
+	uint8_t			cred[OSD_CAP_LEN];
+};
+
+struct osdblk_device {
+	int			id;
+
+	int			major;
+	struct gendisk		*disk;
+	struct request_queue	*q;
+
+	struct osd_dev		*osd;
+
+	char			name[32];
+
+	spinlock_t		lock;
+
+	struct osd_obj_id	obj;
+	uint8_t			obj_cred[OSD_CAP_LEN];
+
+	struct osdblk_request	req[OSDBLK_MAX_REQ];
+
+	unsigned long		part_id;
+	unsigned long		obj_id;
+	char			osd_path[0];
+};
+
+static struct class *class_osdblk;		/* /sys/class/osdblk */
+static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
+static struct osdblk_device *osdblk_devs[OSDBLK_MAX_DEVS];
+
+static struct block_device_operations osdblk_bd_ops = {
+	.owner		= THIS_MODULE,
+};
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+static void osd_make_credential(u8 cred_a[OSD_CAP_LEN],
+				const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.
+ */
+static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret)
+		return ret;
+
+	ret = osd_execute_request(or);
+
+	/* osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.
+ */
+static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, u8 *cred)
+{
+	int ret;
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (ret)
+		return ret;
+
+	ret = osd_execute_request_async(or, async_done, caller_context);
+
+	return ret;
+}
+
+static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
+{
+	struct osd_request *or;
+	struct osd_attr attr;
+	int ret;
+
+	osd_make_credential(osdev->obj_cred, &osdev->obj);
+
+	or = osd_start_request(osdev->osd, GFP_KERNEL);
+	if (!or)
+		return -ENOMEM;
+
+	osd_req_get_attributes(or, &osdev->obj);
+
+	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+
+	/* execute op synchronously */
+	ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
+	if (ret)
+		goto out;
+
+	attr = g_attr_logical_length;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret)
+		goto out;
+
+	*size_out = get_unaligned_be64(attr.val_ptr);
+
+out:
+	osd_end_request(or);
+	return ret;
+
+}
+
+static int osdblk_get_free_req(struct osdblk_device *osdev)
+{
+	int i;
+
+	for (i = 0; i < OSDBLK_MAX_REQ; i++) {
+		if (!osdev->req[i].rq)
+			return i;
+	}
+
+	return -1;
+}
+
+static void osdblk_end_request(struct osdblk_device *osdev,
+			       struct osdblk_request *orq,
+			       int error)
+{
+	struct request *rq = orq->rq;
+	int rc;
+
+	/* complete request, at block layer */
+	rc = __blk_end_request(rq, error, blk_rq_bytes(rq));
+
+	/* clear request slot for use */
+	osdev->req[orq->tag].rq = NULL;
+
+	/* restart queue, if necessary */
+	blk_start_queue(osdev->q);
+}
+
+static void osdblk_osd_complete(struct osd_request *or, void *private)
+{
+	struct osdblk_request *orq = private;
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret)
+		ret = -EIO;
+
+	osd_end_request(or);
+	osdblk_end_request(orq->osdev, orq, ret);
+}
+
+static void osdblk_rq_fn(struct request_queue *q)
+{
+	struct osdblk_device *osdev = q->queuedata;
+	struct request *rq;
+	struct osdblk_request *orq;
+	struct osd_request *or;
+	struct bio *bio;
+	int rq_idx, do_write;
+
+	while (1) {
+		rq = elv_next_request(q);
+		if (!rq)
+			break;
+
+		do_write = (rq_data_dir(rq) == WRITE);
+
+		bio = bio_clone(rq->bio, GFP_NOIO);
+		if (!bio)
+			break;
+
+		rq_idx = osdblk_get_free_req(osdev);
+		if (rq_idx < 0) {
+			bio_put(bio);
+			blk_stop_queue(q);
+			break;
+		}
+
+		orq = &osdev->req[rq_idx];
+		orq->tag = rq_idx;
+		orq->rq = rq;
+		orq->bio = bio;
+		orq->osdev = osdev;
+
+		blkdev_dequeue_request(rq);
+
+		osd_make_credential(orq->cred, &osdev->obj);
+
+		or = osd_start_request(osdev->osd, GFP_NOIO);
+		if (!or) {
+			blk_requeue_request(q, rq);
+			bio_put(bio);
+			break;
+		}
+
+		if (do_write)
+			osd_req_write(or, &osdev->obj, bio,
+				      rq->sector * 512ULL);
+		else
+			osd_req_read(or, &osdev->obj, bio,
+				     rq->sector * 512ULL);
+
+		if (osd_async_op(or, osdblk_osd_complete, orq, orq->cred)) {
+			/* FIXME: leak OSD request 'or' ? */
+			blk_requeue_request(q, rq);
+			bio_put(bio);
+		}
+	}
+}
+
+static void osdblk_free_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk = osdev->disk;
+
+	if (!disk)
+		return;
+
+	if (disk->flags & GENHD_FL_UP)
+		del_gendisk(disk);
+	if (disk->queue)
+		blk_cleanup_queue(disk->queue);
+	put_disk(disk);
+}
+
+static int osdblk_init_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int rc;
+	u64 obj_size = 0;
+
+	rc = osdblk_get_obj_size(osdev, &obj_size);
+	if (rc)
+		return rc;
+
+	disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
+	if (!disk)
+		return -ENOMEM;
+
+	sprintf(disk->disk_name, DRV_NAME "/%d", osdev->id);
+	disk->major = osdev->major;
+	disk->first_minor = 0;
+	disk->fops = &osdblk_bd_ops;
+	disk->private_data = osdev;
+
+	q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
+	if (!q) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+
+	disk->queue = q;
+
+	q->queuedata = osdev;
+
+	osdev->disk = disk;
+	osdev->q = q;
+
+	set_capacity(disk, obj_size);
+	add_disk(disk);
+
+	return 0;
+}
+
+/********************************************************************
+  /sys/class/osdblk/
+                     add	map OSD object to blkdev
+                     remove	unmap OSD object
+                     list	show mappings
+ *******************************************************************/
+
+static void class_osdblk_release(struct class *cls)
+{
+	kfree(cls);
+}
+
+static ssize_t class_osdblk_show(struct class *c, char *data)
+{
+	int n = 0;
+	int idx;
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	for (idx = 0; idx < OSDBLK_MAX_DEVS; idx++) {
+		struct osdblk_device *osdev = osdblk_devs[idx];
+		if (!osdev)
+			continue;
+		n += sprintf(data+n, "%d %d %lu %lu %s\n",
+			osdev->id,
+			osdev->major,
+			osdev->part_id,
+			osdev->obj_id,
+			osdev->osd_path);
+	}
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+
+static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count)
+{
+	struct osdblk_device *osdev;
+	ssize_t rc;
+	int idx, irc;
+
+	osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
+	if (!osdev)
+		return -ENOMEM;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	for (idx = 0; idx < OSDBLK_MAX_DEVS; idx++) {
+		if (!osdblk_devs[idx]) {
+			osdblk_devs[idx] = osdev;
+			osdev->id = idx;
+			break;
+		}
+	}
+
+	mutex_unlock(&ctl_mutex);
+
+	if (idx == OSDBLK_MAX_DEVS) {
+		rc = -ENOSPC;
+		goto err_out;
+	}
+
+	if (sscanf(buf, "%lu %lu %s", &osdev->part_id, &osdev->obj_id,
+		   osdev->osd_path) != 3) {
+		rc = -EINVAL;
+		goto err_out_slot;
+	}
+
+	osdev->obj.partition = osdev->part_id;
+	osdev->obj.id = osdev->obj_id;
+
+	sprintf(osdev->name, DRV_NAME "%d", osdev->id);
+	spin_lock_init(&osdev->lock);
+
+	osdev->osd = osduld_path_lookup(osdev->osd_path);
+	if (IS_ERR(osdev->osd)) {
+		rc = PTR_ERR(osdev->osd);
+		goto err_out_slot;
+	}
+
+	irc = register_blkdev(0, osdev->name);
+	if (irc < 0) {
+		rc = irc;
+		goto err_out_osd;
+	}
+
+	osdev->major = irc;
+
+	rc = osdblk_init_disk(osdev);
+	if (rc)
+		goto err_out_blkdev;
+
+	return 0;
+
+err_out_blkdev:
+	unregister_blkdev(osdev->major, osdev->name);
+err_out_osd:
+	osduld_put_device(osdev->osd);
+err_out_slot:
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	osdblk_devs[osdev->id] = NULL;
+	mutex_unlock(&ctl_mutex);
+err_out:
+	kfree(osdev);
+	return rc;
+}
+
+static ssize_t class_osdblk_remove(struct class *c, const char *buf,
+					size_t count)
+{
+	struct osdblk_device *osdev;
+	int target_id;
+
+	if (sscanf(buf, "%d", &target_id) != 1)
+		return -EINVAL;
+	if (target_id < 0 || target_id >= OSDBLK_MAX_DEVS)
+		return -EINVAL;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	osdev = osdblk_devs[target_id];
+	osdblk_devs[target_id] = NULL;
+	mutex_unlock(&ctl_mutex);
+
+	if (!osdev)
+		return -ENOENT;
+
+	osdblk_free_disk(osdev);
+	unregister_blkdev(osdev->major, osdev->name);
+	osduld_put_device(osdev->osd);
+	kfree(osdev);
+
+	return 0;
+}
+
+static struct class_attribute class_osdblk_attrs[] = {
+	__ATTR(add,	0200, NULL, class_osdblk_add),
+	__ATTR(remove,	0200, NULL, class_osdblk_remove),
+	__ATTR(list,	0444, class_osdblk_show, NULL),
+	__ATTR_NULL
+};
+
+static int osdblk_sysfs_init(void)
+{
+	int ret = 0;
+
+	/*
+	 * create control files in sysfs
+	 * /sys/class/osdblk/...
+	 */
+	class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
+	if (!class_osdblk)
+		return -ENOMEM;
+
+	class_osdblk->name = DRV_NAME;
+	class_osdblk->owner = THIS_MODULE;
+	class_osdblk->class_release = class_osdblk_release;
+	class_osdblk->class_attrs = class_osdblk_attrs;
+
+	ret = class_register(class_osdblk);
+	if (ret) {
+		kfree(class_osdblk);
+		class_osdblk = NULL;
+		printk(PFX "failed to create class osdblk\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void osdblk_sysfs_cleanup(void)
+{
+	if (class_osdblk)
+		class_destroy(class_osdblk);
+	class_osdblk = NULL;
+}
+
+static int __init osdblk_init(void)
+{
+	int rc;
+
+	rc = osdblk_sysfs_init();
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static void __exit osdblk_exit(void)
+{
+	osdblk_sysfs_cleanup();
+}
+
+module_init(osdblk_init);
+module_exit(osdblk_exit);
+

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
@ 2009-04-02  2:05 ` Jeff Garzik
  2009-04-02 12:26 ` Boaz Harrosh
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Jeff Garzik @ 2009-04-02  2:05 UTC (permalink / raw)
  To: LKML, linux-scsi; +Cc: linux-fsdevel, axboe, Andrew Morton

Jeff Garzik wrote:
> As I promised in older exofs threads, here is a client for libosd
> _other_ than exofs.  This block driver exports a single OSD object
> as a Linux block device.

It should be noted that the first handful of functions are duplicates of 
fs/exofs/osd.c; an obvious consolidation is in order.

	Jeff





^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
  2009-04-02  2:05 ` Jeff Garzik
@ 2009-04-02 12:26 ` Boaz Harrosh
  2009-04-02 16:46   ` Jeff Garzik
  2009-04-03  9:38   ` Jeff Garzik
  2009-04-03  1:32 ` James Bottomley
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 24+ messages in thread
From: Boaz Harrosh @ 2009-04-02 12:26 UTC (permalink / raw)
  To: Jeff Garzik, open-osd mailing-list
  Cc: LKML, linux-scsi, linux-fsdevel, axboe, Andrew Morton

On 04/02/2009 04:54 AM, Jeff Garzik wrote:
> As I promised in older exofs threads, here is a client for libosd
> _other_ than exofs.  This block driver exports a single OSD object
> as a Linux block device.
> 
> See the comment block at the top of the driver for usage instructions.
> 
> 
> 
>  drivers/block/Kconfig  |   16 +
>  drivers/block/Makefile |    1 
>  drivers/block/osdblk.c |  563 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 580 insertions(+)
> 

Forwarding to open-osd ml

Jeff is it OK if I pick up this patch through my tree and will push it
together with the other pending patches for 2.6.31 Kernel? I intend to
send a pull request for the exofs tree directly to Linus tonight, and
if it goes well, do so every kernel.

Unless you have intended this for the current 2.6.30 merge window which
would be hard for me to help, I don't want to push my luck.

I have already put this patch on the out-of-tree git. On your
positive feedback I can put it in the osd-Linux tree and it will
so, be included in linux-next (After the current merge window ends).

> It should be noted that the first handful of functions are duplicates of 
> fs/exofs/osd.c; an obvious consolidation is in order.
> 

I have taken that to my heart and will submit patches for that, next week.
Including a complimentary patch to this driver. These changes are only
intended for 2.6.31 though.

I also want to add a small utility that can manage objects, create, size,
remove, and mount as a complimentary wrapper for this driver is "osdblk"
a good name for such utility?

Few code comments below.

> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index e7b8aa0..ff46b0e 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -298,6 +298,22 @@ config BLK_DEV_NBD
>  
>  	  If unsure, say N.
>  
> +config BLK_DEV_OSD
> +	tristate "OSD object-as-blkdev support"
> +	depends on SCSI_OSD_INITIATOR
> +	---help---
> +	  Saying Y or M here will allow the exporting of a single SCSI
> +	  OSD (object-based storage) object as a Linux block device.
> +
> +	  For example, if you create a 2G object on an OSD device,
> +	  you can then use this module to present that 2G object as
> +	  a Linux block device.
> +
> +	  To compile this driver as a module, choose M here: the
> +	  module will be called osdblk.
> +
> +	  If unsure, say N.
> +
>  config BLK_DEV_SX8
>  	tristate "Promise SATA SX8 support"
>  	depends on PCI
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 3145141..859bf5d 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
>  obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
>  obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
>  obj-$(CONFIG_SUNVDC)		+= sunvdc.o
> +obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
>  
>  obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
>  obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
> diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
> new file mode 100644
> index 0000000..d3a2fb5
> --- /dev/null
> +++ b/drivers/block/osdblk.c
> @@ -0,0 +1,563 @@
> +
> +/*
> +   osdblk.c -- Export a single SCSI OSD object as a Linux block device
> +
> +
> +   Copyright 2009 Red Hat, Inc.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published by
> +   the Free Software Foundation.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; see the file COPYING.  If not, write to
> +   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
> +
> +
> +   Instructions for use
> +   --------------------
> +
> +   1) Map a Linux block device to an existing OSD object.
> +
> +      In this example, we will use partition id 1234, object id 5678,
> +      OSD device /dev/osd1.
> +
> +      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
> +
> +
> +   2) List all active blkdev<->object mappings.
> +
> +      In this example, we have performed step #1 twice, creating two blkdevs,
> +      mapped to two separate OSD objects.
> +
> +      $ cat /sys/class/osdblk/list
> +      0 174 1234 5678 /dev/osd1
> +      1 179 1994 897123 /dev/osd0
> +
> +      The columns, in order, are:
> +      - blkdev unique id
> +      - blkdev assigned major
> +      - OSD object partition id
> +      - OSD object id
> +      - OSD device
> +
> +
> +   3) Remove an active blkdev<->object mapping.
> +
> +      $ echo 1 > /sys/class/osdblk/remove
> +
> +
> +   NOTE:  The actual creation and deletion of OSD objects is outside the scope
> +   of this driver.
> +
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/device.h>
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <scsi/osd_initiator.h>
> +#include <scsi/osd_attributes.h>
> +#include <scsi/osd_sec.h>
> +
> +#define DRV_NAME "osdblk"
> +#define PFX DRV_NAME ": "
> +
> +struct osdblk_device;
> +
> +enum {
> +	OSDBLK_MAX_DEVS		= 64,
> +	OSDBLK_MINORS_PER_MAJOR	= 256,
> +	OSDBLK_MAX_REQ		= 32,
> +	OSDBLK_OP_TIMEOUT	= 4 * 60,
> +};
> +
> +struct osdblk_request {
> +	struct request		*rq;
> +	struct bio		*bio;
> +	struct osdblk_device	*osdev;
> +	int			tag;
> +	uint8_t			cred[OSD_CAP_LEN];
> +};
> +
> +struct osdblk_device {
> +	int			id;
> +
> +	int			major;
> +	struct gendisk		*disk;
> +	struct request_queue	*q;
> +
> +	struct osd_dev		*osd;
> +
> +	char			name[32];
> +
> +	spinlock_t		lock;
> +
> +	struct osd_obj_id	obj;
> +	uint8_t			obj_cred[OSD_CAP_LEN];
> +
> +	struct osdblk_request	req[OSDBLK_MAX_REQ];
> +
> +	unsigned long		part_id;
> +	unsigned long		obj_id;
> +	char			osd_path[0];
> +};
> +
> +static struct class *class_osdblk;		/* /sys/class/osdblk */
> +static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
> +static struct osdblk_device *osdblk_devs[OSDBLK_MAX_DEVS];
> +
> +static struct block_device_operations osdblk_bd_ops = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +const struct osd_attr g_attr_logical_length = ATTR_DEF(
> +	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
> +
> +static void osd_make_credential(u8 cred_a[OSD_CAP_LEN],
> +				const struct osd_obj_id *obj)
> +{
> +	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
> +}
> +
> +/*
> + * Perform a synchronous OSD operation.
> + */
> +static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
> +{
> +	int ret;
> +
> +	or->timeout = timeout;
> +	ret = osd_finalize_request(or, 0, credential, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = osd_execute_request(or);
> +
> +	/* osd_req_decode_sense(or, ret); */
> +	return ret;
> +}
> +
> +/*
> + * Perform an asynchronous OSD operation.
> + */
> +static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
> +		   void *caller_context, u8 *cred)
> +{
> +	int ret;
> +
> +	ret = osd_finalize_request(or, 0, cred, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = osd_execute_request_async(or, async_done, caller_context);
> +
> +	return ret;
> +}
> +
> +static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
> +{
> +	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
> +	void *iter = NULL;
> +	int nelem;
> +
> +	do {
> +		nelem = 1;
> +		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
> +		if ((cur_attr.attr_page == attr->attr_page) &&
> +		    (cur_attr.attr_id == attr->attr_id)) {
> +			attr->len = cur_attr.len;
> +			attr->val_ptr = cur_attr.val_ptr;
> +			return 0;
> +		}
> +	} while (iter);
> +
> +	return -EIO;
> +}
> +
> +static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
> +{
> +	struct osd_request *or;
> +	struct osd_attr attr;
> +	int ret;
> +
> +	osd_make_credential(osdev->obj_cred, &osdev->obj);
> +

-	osd_make_credential(osdev->obj_cred, &osdev->obj);
see below

> +	or = osd_start_request(osdev->osd, GFP_KERNEL);
> +	if (!or)
> +		return -ENOMEM;
> +
> +	osd_req_get_attributes(or, &osdev->obj);
> +
> +	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
> +
> +	/* execute op synchronously */
> +	ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
> +	if (ret)
> +		goto out;
> +
> +	attr = g_attr_logical_length;
> +	ret = extract_attr_from_req(or, &attr);
> +	if (ret)
> +		goto out;
> +
> +	*size_out = get_unaligned_be64(attr.val_ptr);
> +
> +out:
> +	osd_end_request(or);
> +	return ret;
> +
> +}
> +
> +static int osdblk_get_free_req(struct osdblk_device *osdev)
> +{
> +	int i;
> +
> +	for (i = 0; i < OSDBLK_MAX_REQ; i++) {
> +		if (!osdev->req[i].rq)
> +			return i;
> +	}
> +
> +	return -1;
> +}
> +
> +static void osdblk_end_request(struct osdblk_device *osdev,
> +			       struct osdblk_request *orq,
> +			       int error)
> +{
> +	struct request *rq = orq->rq;
> +	int rc;
> +
> +	/* complete request, at block layer */
> +	rc = __blk_end_request(rq, error, blk_rq_bytes(rq));
> +
> +	/* clear request slot for use */
> +	osdev->req[orq->tag].rq = NULL;
> +
> +	/* restart queue, if necessary */
> +	blk_start_queue(osdev->q);
> +}
> +
> +static void osdblk_osd_complete(struct osd_request *or, void *private)
> +{
> +	struct osdblk_request *orq = private;
> +	struct osd_sense_info osi;
> +	int ret = osd_req_decode_sense(or, &osi);
> +
> +	if (ret)
> +		ret = -EIO;
> +
> +	osd_end_request(or);
> +	osdblk_end_request(orq->osdev, orq, ret);

should be reversed, very bad things will happen otherwise

+	osdblk_end_request(orq->osdev, orq, ret);
+	osd_end_request(or);

> +}
> +
> +static void osdblk_rq_fn(struct request_queue *q)
> +{
> +	struct osdblk_device *osdev = q->queuedata;
> +	struct request *rq;
> +	struct osdblk_request *orq;
> +	struct osd_request *or;
> +	struct bio *bio;
> +	int rq_idx, do_write;
> +
> +	while (1) {
> +		rq = elv_next_request(q);
> +		if (!rq)
> +			break;
> +
> +		do_write = (rq_data_dir(rq) == WRITE);
> +
> +		bio = bio_clone(rq->bio, GFP_NOIO);
> +		if (!bio)
> +			break;
> +
> +		rq_idx = osdblk_get_free_req(osdev);
> +		if (rq_idx < 0) {
> +			bio_put(bio);
> +			blk_stop_queue(q);
> +			break;
> +		}
> +
> +		orq = &osdev->req[rq_idx];
> +		orq->tag = rq_idx;
> +		orq->rq = rq;
> +		orq->bio = bio;
> +		orq->osdev = osdev;
> +
> +		blkdev_dequeue_request(rq);
> +
> +		osd_make_credential(orq->cred, &osdev->obj);

-		osd_make_credential(orq->cred, &osdev->obj);

Don't do this here do it once on mount. The creds, once we define
the credential-manager protocol will have to be acquired at the begging.
(See below)

At much later stage, the credential-manager API will be able to callback
credentials and clients will need to reacquire them, or on credentials error
returns from I/O.

> +
> +		or = osd_start_request(osdev->osd, GFP_NOIO);
> +		if (!or) {
> +			blk_requeue_request(q, rq);
> +			bio_put(bio);
> +			break;
> +		}
> +
> +		if (do_write)
> +			osd_req_write(or, &osdev->obj, bio,
> +				      rq->sector * 512ULL);
> +		else
> +			osd_req_read(or, &osdev->obj, bio,
> +				     rq->sector * 512ULL);
> +
> +		if (osd_async_op(or, osdblk_osd_complete, orq, orq->cred)) {
> +			/* FIXME: leak OSD request 'or' ? */

yes a leak

> +			blk_requeue_request(q, rq);
	
+			or->request = NULL;

Sorry about that, I'll need to think of it some more.
Other wise osd_end_request() below will try to destroy
the request.

+			osd_end_request()


> +			bio_put(bio);
> +		}
> +	}
> +}
> +
> +static void osdblk_free_disk(struct osdblk_device *osdev)
> +{
> +	struct gendisk *disk = osdev->disk;
> +
> +	if (!disk)
> +		return;
> +
> +	if (disk->flags & GENHD_FL_UP)
> +		del_gendisk(disk);
> +	if (disk->queue)
> +		blk_cleanup_queue(disk->queue);
> +	put_disk(disk);
> +}
> +
> +static int osdblk_init_disk(struct osdblk_device *osdev)
> +{
> +	struct gendisk *disk;
> +	struct request_queue *q;
> +	int rc;
> +	u64 obj_size = 0;
> +

+	osd_make_credential(osdev->obj_cred, &osdev->obj);

Later, when credential-manager is used, this will get expensive and sleepy
possibly going on the network and back.

> +	rc = osdblk_get_obj_size(osdev, &obj_size);
> +	if (rc)
> +		return rc;
> +
> +	disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
> +	if (!disk)
> +		return -ENOMEM;
> +
> +	sprintf(disk->disk_name, DRV_NAME "/%d", osdev->id);
> +	disk->major = osdev->major;
> +	disk->first_minor = 0;
> +	disk->fops = &osdblk_bd_ops;
> +	disk->private_data = osdev;
> +
> +	q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
> +	if (!q) {
> +		put_disk(disk);
> +		return -ENOMEM;
> +	}
> +
> +	disk->queue = q;
> +
> +	q->queuedata = osdev;
> +
> +	osdev->disk = disk;
> +	osdev->q = q;
> +
> +	set_capacity(disk, obj_size);
> +	add_disk(disk);
> +
> +	return 0;
> +}
> +
> +/********************************************************************
> +  /sys/class/osdblk/
> +                     add	map OSD object to blkdev
> +                     remove	unmap OSD object
> +                     list	show mappings
> + *******************************************************************/
> +
> +static void class_osdblk_release(struct class *cls)
> +{
> +	kfree(cls);
> +}
> +
> +static ssize_t class_osdblk_show(struct class *c, char *data)
> +{
> +	int n = 0;
> +	int idx;
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +	for (idx = 0; idx < OSDBLK_MAX_DEVS; idx++) {
> +		struct osdblk_device *osdev = osdblk_devs[idx];
> +		if (!osdev)
> +			continue;
> +		n += sprintf(data+n, "%d %d %lu %lu %s\n",
> +			osdev->id,
> +			osdev->major,
> +			osdev->part_id,
> +			osdev->obj_id,
> +			osdev->osd_path);
> +	}
> +	mutex_unlock(&ctl_mutex);
> +	return n;
> +}
> +
> +static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count)
> +{
> +	struct osdblk_device *osdev;
> +	ssize_t rc;
> +	int idx, irc;
> +
> +	osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
> +	if (!osdev)
> +		return -ENOMEM;
> +
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +
> +	for (idx = 0; idx < OSDBLK_MAX_DEVS; idx++) {
> +		if (!osdblk_devs[idx]) {
> +			osdblk_devs[idx] = osdev;
> +			osdev->id = idx;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&ctl_mutex);
> +
> +	if (idx == OSDBLK_MAX_DEVS) {
> +		rc = -ENOSPC;
> +		goto err_out;
> +	}
> +
> +	if (sscanf(buf, "%lu %lu %s", &osdev->part_id, &osdev->obj_id,

-	if (sscanf(buf, "%lu %lu %s", &osdev->part_id, &osdev->obj_id,
+	if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,

> +		   osdev->osd_path) != 3) {
> +		rc = -EINVAL;
> +		goto err_out_slot;
> +	}
> +
> +	osdev->obj.partition = osdev->part_id;
> +	osdev->obj.id = osdev->obj_id;

-	osdev->obj.partition = osdev->part_id;
-	osdev->obj.id = osdev->obj_id;

osdev->obj_id, and osdev->part_id can be removed.

> +
> +	sprintf(osdev->name, DRV_NAME "%d", osdev->id);
> +	spin_lock_init(&osdev->lock);
> +
> +	osdev->osd = osduld_path_lookup(osdev->osd_path);
> +	if (IS_ERR(osdev->osd)) {
> +		rc = PTR_ERR(osdev->osd);
> +		goto err_out_slot;
> +	}
> +
> +	irc = register_blkdev(0, osdev->name);
> +	if (irc < 0) {
> +		rc = irc;
> +		goto err_out_osd;
> +	}
> +
> +	osdev->major = irc;
> +
> +	rc = osdblk_init_disk(osdev);
> +	if (rc)
> +		goto err_out_blkdev;
> +
> +	return 0;
> +
> +err_out_blkdev:
> +	unregister_blkdev(osdev->major, osdev->name);
> +err_out_osd:
> +	osduld_put_device(osdev->osd);
> +err_out_slot:
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +	osdblk_devs[osdev->id] = NULL;
> +	mutex_unlock(&ctl_mutex);
> +err_out:
> +	kfree(osdev);
> +	return rc;
> +}
> +
> +static ssize_t class_osdblk_remove(struct class *c, const char *buf,
> +					size_t count)
> +{
> +	struct osdblk_device *osdev;
> +	int target_id;
> +
> +	if (sscanf(buf, "%d", &target_id) != 1)
> +		return -EINVAL;
> +	if (target_id < 0 || target_id >= OSDBLK_MAX_DEVS)
> +		return -EINVAL;
> +
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +	osdev = osdblk_devs[target_id];
> +	osdblk_devs[target_id] = NULL;
> +	mutex_unlock(&ctl_mutex);
> +
> +	if (!osdev)
> +		return -ENOENT;
> +
> +	osdblk_free_disk(osdev);
> +	unregister_blkdev(osdev->major, osdev->name);
> +	osduld_put_device(osdev->osd);
> +	kfree(osdev);
> +
> +	return 0;
> +}
> +
> +static struct class_attribute class_osdblk_attrs[] = {
> +	__ATTR(add,	0200, NULL, class_osdblk_add),
> +	__ATTR(remove,	0200, NULL, class_osdblk_remove),
> +	__ATTR(list,	0444, class_osdblk_show, NULL),
> +	__ATTR_NULL
> +};
> +
> +static int osdblk_sysfs_init(void)
> +{
> +	int ret = 0;
> +
> +	/*
> +	 * create control files in sysfs
> +	 * /sys/class/osdblk/...
> +	 */
> +	class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
> +	if (!class_osdblk)
> +		return -ENOMEM;
> +
> +	class_osdblk->name = DRV_NAME;
> +	class_osdblk->owner = THIS_MODULE;
> +	class_osdblk->class_release = class_osdblk_release;
> +	class_osdblk->class_attrs = class_osdblk_attrs;
> +
> +	ret = class_register(class_osdblk);
> +	if (ret) {
> +		kfree(class_osdblk);
> +		class_osdblk = NULL;
> +		printk(PFX "failed to create class osdblk\n");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void osdblk_sysfs_cleanup(void)
> +{
> +	if (class_osdblk)
> +		class_destroy(class_osdblk);
> +	class_osdblk = NULL;
> +}
> +
> +static int __init osdblk_init(void)
> +{
> +	int rc;
> +
> +	rc = osdblk_sysfs_init();
> +	if (rc)
> +		return rc;
> +
> +	return 0;
> +}
> +
> +static void __exit osdblk_exit(void)
> +{
> +	osdblk_sysfs_cleanup();
> +}
> +
> +module_init(osdblk_init);
> +module_exit(osdblk_exit);
> +

What can I say, great stuff.

OSD is a very clean API, that makes whole subsystems look trivial.

Thanks a million
Boaz

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02 12:26 ` Boaz Harrosh
@ 2009-04-02 16:46   ` Jeff Garzik
  2009-04-03  9:38   ` Jeff Garzik
  1 sibling, 0 replies; 24+ messages in thread
From: Jeff Garzik @ 2009-04-02 16:46 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: open-osd mailing-list, LKML, linux-scsi, linux-fsdevel, axboe,
	Andrew Morton

Boaz Harrosh wrote:
> On 04/02/2009 04:54 AM, Jeff Garzik wrote:
>> As I promised in older exofs threads, here is a client for libosd
>> _other_ than exofs.  This block driver exports a single OSD object
>> as a Linux block device.
>>
>> See the comment block at the top of the driver for usage instructions.
>>
>>
>>
>>  drivers/block/Kconfig  |   16 +
>>  drivers/block/Makefile |    1 
>>  drivers/block/osdblk.c |  563 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 580 insertions(+)
>>
> 
> Forwarding to open-osd ml
> 
> Jeff is it OK if I pick up this patch through my tree and will push it
> together with the other pending patches for 2.6.31 Kernel? I intend to

Please don't add it to any repo just yet.  It is still changing rapidly, 
and that would just slow me down, at the moment.

More comments later...

	Jeff




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
  2009-04-02  2:05 ` Jeff Garzik
  2009-04-02 12:26 ` Boaz Harrosh
@ 2009-04-03  1:32 ` James Bottomley
  2009-04-03 10:14   ` Jeff Garzik
  2009-04-03  9:49 ` Jens Axboe
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: James Bottomley @ 2009-04-03  1:32 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: LKML, linux-scsi, linux-fsdevel, axboe, Andrew Morton

On Wed, 2009-04-01 at 21:54 -0400, Jeff Garzik wrote:
> As I promised in older exofs threads, here is a client for libosd
> _other_ than exofs.  This block driver exports a single OSD object
> as a Linux block device.
> 
> See the comment block at the top of the driver for usage instructions.
> 
> 
> 
>  drivers/block/Kconfig  |   16 +
>  drivers/block/Makefile |    1 
>  drivers/block/osdblk.c |  563 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 580 insertions(+)
> 
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index e7b8aa0..ff46b0e 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -298,6 +298,22 @@ config BLK_DEV_NBD
>  
>  	  If unsure, say N.
>  
> +config BLK_DEV_OSD
> +	tristate "OSD object-as-blkdev support"
> +	depends on SCSI_OSD_INITIATOR
> +	---help---
> +	  Saying Y or M here will allow the exporting of a single SCSI
> +	  OSD (object-based storage) object as a Linux block device.
> +
> +	  For example, if you create a 2G object on an OSD device,
> +	  you can then use this module to present that 2G object as
> +	  a Linux block device.
> +
> +	  To compile this driver as a module, choose M here: the
> +	  module will be called osdblk.
> +
> +	  If unsure, say N.
> +
>  config BLK_DEV_SX8
>  	tristate "Promise SATA SX8 support"
>  	depends on PCI
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 3145141..859bf5d 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
>  obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
>  obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
>  obj-$(CONFIG_SUNVDC)		+= sunvdc.o
> +obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
>  
>  obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
>  obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
> diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
> new file mode 100644
> index 0000000..d3a2fb5
> --- /dev/null
> +++ b/drivers/block/osdblk.c
> @@ -0,0 +1,563 @@
> +
> +/*
> +   osdblk.c -- Export a single SCSI OSD object as a Linux block device
> +
> +
> +   Copyright 2009 Red Hat, Inc.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published by
> +   the Free Software Foundation.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; see the file COPYING.  If not, write to
> +   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
> +
> +
> +   Instructions for use
> +   --------------------
> +
> +   1) Map a Linux block device to an existing OSD object.
> +
> +      In this example, we will use partition id 1234, object id 5678,
> +      OSD device /dev/osd1.
> +
> +      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
> +
> +
> +   2) List all active blkdev<->object mappings.
> +
> +      In this example, we have performed step #1 twice, creating two blkdevs,
> +      mapped to two separate OSD objects.
> +
> +      $ cat /sys/class/osdblk/list
> +      0 174 1234 5678 /dev/osd1
> +      1 179 1994 897123 /dev/osd0

This is a slight violation of the one piece of data per sysfs file
rule ... might it not be better as a file named <partid>-<objid> linking
to the osd device location in sysfs?

> +      The columns, in order, are:
> +      - blkdev unique id
> +      - blkdev assigned major
> +      - OSD object partition id
> +      - OSD object id
> +      - OSD device
> +
> +
> +   3) Remove an active blkdev<->object mapping.
> +
> +      $ echo 1 > /sys/class/osdblk/remove
> +
> +
> +   NOTE:  The actual creation and deletion of OSD objects is outside the scope
> +   of this driver.
> +
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/device.h>
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <scsi/osd_initiator.h>
> +#include <scsi/osd_attributes.h>
> +#include <scsi/osd_sec.h>
> +
> +#define DRV_NAME "osdblk"
> +#define PFX DRV_NAME ": "
> +
> +struct osdblk_device;
> +
> +enum {
> +	OSDBLK_MAX_DEVS		= 64,
> +	OSDBLK_MINORS_PER_MAJOR	= 256,
> +	OSDBLK_MAX_REQ		= 32,
> +	OSDBLK_OP_TIMEOUT	= 4 * 60,
> +};
> +
> +struct osdblk_request {
> +	struct request		*rq;
> +	struct bio		*bio;
> +	struct osdblk_device	*osdev;
> +	int			tag;
> +	uint8_t			cred[OSD_CAP_LEN];
> +};
> +
> +struct osdblk_device {
> +	int			id;
> +
> +	int			major;
> +	struct gendisk		*disk;
> +	struct request_queue	*q;
> +
> +	struct osd_dev		*osd;
> +
> +	char			name[32];
> +
> +	spinlock_t		lock;
> +
> +	struct osd_obj_id	obj;
> +	uint8_t			obj_cred[OSD_CAP_LEN];
> +
> +	struct osdblk_request	req[OSDBLK_MAX_REQ];
> +
> +	unsigned long		part_id;
> +	unsigned long		obj_id;
> +	char			osd_path[0];
> +};
> +
> +static struct class *class_osdblk;		/* /sys/class/osdblk */
> +static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
> +static struct osdblk_device *osdblk_devs[OSDBLK_MAX_DEVS];

Might it not be better to do this as a linked list on the private dev
structure instead?  This only works if you have one entry
in /sys/class/osdblock per device because now you have a device private
pointer to hang it off

> +static struct block_device_operations osdblk_bd_ops = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +const struct osd_attr g_attr_logical_length = ATTR_DEF(
> +	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
> +
> +static void osd_make_credential(u8 cred_a[OSD_CAP_LEN],
> +				const struct osd_obj_id *obj)
> +{
> +	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
> +}
> +
> +/*
> + * Perform a synchronous OSD operation.
> + */
> +static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
> +{
> +	int ret;
> +
> +	or->timeout = timeout;
> +	ret = osd_finalize_request(or, 0, credential, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = osd_execute_request(or);
> +
> +	/* osd_req_decode_sense(or, ret); */
> +	return ret;
> +}
> +
> +/*
> + * Perform an asynchronous OSD operation.
> + */
> +static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
> +		   void *caller_context, u8 *cred)
> +{
> +	int ret;
> +
> +	ret = osd_finalize_request(or, 0, cred, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = osd_execute_request_async(or, async_done, caller_context);
> +
> +	return ret;
> +}
> +
> +static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
> +{
> +	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
> +	void *iter = NULL;
> +	int nelem;
> +
> +	do {
> +		nelem = 1;
> +		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
> +		if ((cur_attr.attr_page == attr->attr_page) &&
> +		    (cur_attr.attr_id == attr->attr_id)) {
> +			attr->len = cur_attr.len;
> +			attr->val_ptr = cur_attr.val_ptr;
> +			return 0;
> +		}
> +	} while (iter);
> +
> +	return -EIO;
> +}
> +
> +static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
> +{
> +	struct osd_request *or;
> +	struct osd_attr attr;
> +	int ret;
> +
> +	osd_make_credential(osdev->obj_cred, &osdev->obj);
> +
> +	or = osd_start_request(osdev->osd, GFP_KERNEL);
> +	if (!or)
> +		return -ENOMEM;
> +
> +	osd_req_get_attributes(or, &osdev->obj);
> +
> +	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
> +
> +	/* execute op synchronously */
> +	ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
> +	if (ret)
> +		goto out;
> +
> +	attr = g_attr_logical_length;
> +	ret = extract_attr_from_req(or, &attr);
> +	if (ret)
> +		goto out;
> +
> +	*size_out = get_unaligned_be64(attr.val_ptr);
> +
> +out:
> +	osd_end_request(or);
> +	return ret;
> +
> +}
> +
> +static int osdblk_get_free_req(struct osdblk_device *osdev)
> +{
> +	int i;
> +
> +	for (i = 0; i < OSDBLK_MAX_REQ; i++) {
> +		if (!osdev->req[i].rq)
> +			return i;
> +	}

Rather than using a static list of outstanding requests, I think you
could probably use the block tag handling infrastructure for all of this

The rest looks fine.

James



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02 12:26 ` Boaz Harrosh
  2009-04-02 16:46   ` Jeff Garzik
@ 2009-04-03  9:38   ` Jeff Garzik
  2009-04-05 10:22     ` Boaz Harrosh
  1 sibling, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-03  9:38 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: open-osd mailing-list, LKML, linux-scsi, linux-fsdevel, axboe,
	Andrew Morton

Boaz Harrosh wrote:
> I have taken that to my heart and will submit patches for that, next week.
> Including a complimentary patch to this driver. These changes are only
> intended for 2.6.31 though.

Consolidation of common code should occur after osdblk is in one of: 
open-osd.git, scsi-misc.git, or linux-2.6.git.

That way, the code movement can be consolidated into a single changeset, 
touching exofs, osdblk and libosd all at the same time...
	-exofs code
	-osdblk code
	+libosd code


> I also want to add a small utility that can manage objects, create, size,
> remove, and mount as a complimentary wrapper for this driver is "osdblk"
> a good name for such utility?

osdblk intentionally maintains -zero- metadata on its own.  Therefore, 
this utility you propose can be completely generic.  You could call it 
"osdobjutil", because it need not be tied to the osdblk driver.

The osdblk driver needs the following from the utility:

- create object of specified size

- delete object

and optionally:
- resize object to new size

There is no need for a mount operation, because this is handled through 
class_osdblk_add()


>> +static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
>> +{
>> +	struct osd_request *or;
>> +	struct osd_attr attr;
>> +	int ret;
>> +
>> +	osd_make_credential(osdev->obj_cred, &osdev->obj);
>> +
> 
> -	osd_make_credential(osdev->obj_cred, &osdev->obj);
> see below

fixed


>> +static void osdblk_osd_complete(struct osd_request *or, void *private)
>> +{
>> +	struct osdblk_request *orq = private;
>> +	struct osd_sense_info osi;
>> +	int ret = osd_req_decode_sense(or, &osi);
>> +
>> +	if (ret)
>> +		ret = -EIO;
>> +
>> +	osd_end_request(or);
>> +	osdblk_end_request(orq->osdev, orq, ret);
> 
> should be reversed, very bad things will happen otherwise
> 
> +	osdblk_end_request(orq->osdev, orq, ret);
> +	osd_end_request(or);

Perhaps you are confusing two different 'struct request' in use?

	- struct request, passed to osdblk for execution
	- struct request, used by libosd to pass commands

The object lifetime of the struct request stored in 'orq' is longer than 
the lifetime of the osd_request:

	1) block layer passes 'rq' to osdblk
	2) osdblk creates new 'or', passes 'or' to libosd
	3) libosd calls osdblk completion function
	4) osdblk completes 'or'
	5) osdblk completes 'rq'

As you can see, the object lifetime of 'or' is entirely within 'rq'.


>> +		orq = &osdev->req[rq_idx];
>> +		orq->tag = rq_idx;
>> +		orq->rq = rq;
>> +		orq->bio = bio;
>> +		orq->osdev = osdev;
>> +
>> +		blkdev_dequeue_request(rq);
>> +
>> +		osd_make_credential(orq->cred, &osdev->obj);
> 
> -		osd_make_credential(orq->cred, &osdev->obj);
> 
> Don't do this here do it once on mount. The creds, once we define
> the credential-manager protocol will have to be acquired at the begging.
> (See below)
> 
> At much later stage, the credential-manager API will be able to callback
> credentials and clients will need to reacquire them, or on credentials error
> returns from I/O.

fixed


>> +
>> +		or = osd_start_request(osdev->osd, GFP_NOIO);
>> +		if (!or) {
>> +			blk_requeue_request(q, rq);
>> +			bio_put(bio);
>> +			break;
>> +		}
>> +
>> +		if (do_write)
>> +			osd_req_write(or, &osdev->obj, bio,
>> +				      rq->sector * 512ULL);
>> +		else
>> +			osd_req_read(or, &osdev->obj, bio,
>> +				     rq->sector * 512ULL);
>> +
>> +		if (osd_async_op(or, osdblk_osd_complete, orq, orq->cred)) {
>> +			/* FIXME: leak OSD request 'or' ? */
> 
> yes a leak
> 
>> +			blk_requeue_request(q, rq);
> 	
> +			or->request = NULL;

already fixed, long before writing this email :)


>> +			bio_put(bio);
>> +		}
>> +	}
>> +}
>> +
>> +static void osdblk_free_disk(struct osdblk_device *osdev)
>> +{
>> +	struct gendisk *disk = osdev->disk;
>> +
>> +	if (!disk)
>> +		return;
>> +
>> +	if (disk->flags & GENHD_FL_UP)
>> +		del_gendisk(disk);
>> +	if (disk->queue)
>> +		blk_cleanup_queue(disk->queue);
>> +	put_disk(disk);
>> +}
>> +
>> +static int osdblk_init_disk(struct osdblk_device *osdev)
>> +{
>> +	struct gendisk *disk;
>> +	struct request_queue *q;
>> +	int rc;
>> +	u64 obj_size = 0;
>> +
> 
> +	osd_make_credential(osdev->obj_cred, &osdev->obj);
> 
> Later, when credential-manager is used, this will get expensive and sleepy
> possibly going on the network and back.

fixed


>> +	if (idx == OSDBLK_MAX_DEVS) {
>> +		rc = -ENOSPC;
>> +		goto err_out;
>> +	}
>> +
>> +	if (sscanf(buf, "%lu %lu %s", &osdev->part_id, &osdev->obj_id,
> 
> -	if (sscanf(buf, "%lu %lu %s", &osdev->part_id, &osdev->obj_id,
> +	if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
> 
>> +		   osdev->osd_path) != 3) {
>> +		rc = -EINVAL;
>> +		goto err_out_slot;
>> +	}
>> +
>> +	osdev->obj.partition = osdev->part_id;
>> +	osdev->obj.id = osdev->obj_id;
> 
> -	osdev->obj.partition = osdev->part_id;
> -	osdev->obj.id = osdev->obj_id;
> 
> osdev->obj_id, and osdev->part_id can be removed.

done


> What can I say, great stuff.
> 
> OSD is a very clean API, that makes whole subsystems look trivial.

I appreciate it, thanks for the review.

	Jeff




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
                   ` (2 preceding siblings ...)
  2009-04-03  1:32 ` James Bottomley
@ 2009-04-03  9:49 ` Jens Axboe
  2009-04-03  9:58   ` Jeff Garzik
  2009-04-07  7:26 ` Pavel Machek
  2009-04-07 22:53 ` [PATCH v2] " Jeff Garzik
  5 siblings, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2009-04-03  9:49 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: LKML, linux-scsi, linux-fsdevel, Andrew Morton

On Wed, Apr 01 2009, Jeff Garzik wrote:
> 
> As I promised in older exofs threads, here is a client for libosd
> _other_ than exofs.  This block driver exports a single OSD object
> as a Linux block device.
> 
> See the comment block at the top of the driver for usage instructions.
> 
> 
> 
> +static void osdblk_end_request(struct osdblk_device *osdev,
> +			       struct osdblk_request *orq,
> +			       int error)
> +{
> +	struct request *rq = orq->rq;
> +	int rc;
> +
> +	/* complete request, at block layer */
> +	rc = __blk_end_request(rq, error, blk_rq_bytes(rq));
> +
> +	/* clear request slot for use */
> +	osdev->req[orq->tag].rq = NULL;
> +
> +	/* restart queue, if necessary */
> +	blk_start_queue(osdev->q);
> +}
> +
> +static void osdblk_osd_complete(struct osd_request *or, void *private)
> +{
> +	struct osdblk_request *orq = private;
> +	struct osd_sense_info osi;
> +	int ret = osd_req_decode_sense(or, &osi);
> +
> +	if (ret)
> +		ret = -EIO;
> +
> +	osd_end_request(or);
> +	osdblk_end_request(orq->osdev, orq, ret);
> +}
> +
> +static void osdblk_rq_fn(struct request_queue *q)
> +{
> +	struct osdblk_device *osdev = q->queuedata;
> +	struct request *rq;
> +	struct osdblk_request *orq;
> +	struct osd_request *or;
> +	struct bio *bio;
> +	int rq_idx, do_write;
> +
> +	while (1) {
> +		rq = elv_next_request(q);
> +		if (!rq)
> +			break;
> +
> +		do_write = (rq_data_dir(rq) == WRITE);
> +
> +		bio = bio_clone(rq->bio, GFP_NOIO);
> +		if (!bio)
> +			break;

This wont work, GFP_NOIO inside the queue lock. You are also only
cloning the front bio, what happens if you have > 1 bio on the request?
You seem to dequeue the request and complete all of it, regardless of
whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
because of how the osd_req_{read,write} works, so I'd suggest storing
the byte size in your osdblk_request and only completing that in
osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
only dequeue if you manage to start an osd request for each of them,
THEN moving on to the next request.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-03  9:49 ` Jens Axboe
@ 2009-04-03  9:58   ` Jeff Garzik
  2009-04-05 10:18     ` Boaz Harrosh
  0 siblings, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-03  9:58 UTC (permalink / raw)
  To: Jens Axboe; +Cc: LKML, linux-scsi, linux-fsdevel, Andrew Morton

Jens Axboe wrote:
> This wont work, GFP_NOIO inside the queue lock. You are also only
> cloning the front bio, what happens if you have > 1 bio on the request?
> You seem to dequeue the request and complete all of it, regardless of
> whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
> because of how the osd_req_{read,write} works, so I'd suggest storing
> the byte size in your osdblk_request and only completing that in
> osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
> only dequeue if you manage to start an osd request for each of them,
> THEN moving on to the next request.

Thanks for the review.  Will fix...

	Jeff



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-03  1:32 ` James Bottomley
@ 2009-04-03 10:14   ` Jeff Garzik
  0 siblings, 0 replies; 24+ messages in thread
From: Jeff Garzik @ 2009-04-03 10:14 UTC (permalink / raw)
  To: James Bottomley; +Cc: LKML, linux-scsi, linux-fsdevel, axboe, Andrew Morton

James Bottomley wrote:
>> +   1) Map a Linux block device to an existing OSD object.
>> +
>> +      In this example, we will use partition id 1234, object id 5678,
>> +      OSD device /dev/osd1.
>> +
>> +      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
>> +
>> +
>> +   2) List all active blkdev<->object mappings.
>> +
>> +      In this example, we have performed step #1 twice, creating two blkdevs,
>> +      mapped to two separate OSD objects.
>> +
>> +      $ cat /sys/class/osdblk/list
>> +      0 174 1234 5678 /dev/osd1
>> +      1 179 1994 897123 /dev/osd0
> 
> This is a slight violation of the one piece of data per sysfs file
> rule ... might it not be better as a file named <partid>-<objid> linking
> to the osd device location in sysfs?

Yeah...   I leaned more towards a consolidated table, as it was 
elegantly implemented in just a few lines of code, including locking :)


>> +      The columns, in order, are:
>> +      - blkdev unique id
>> +      - blkdev assigned major
>> +      - OSD object partition id
>> +      - OSD object id
>> +      - OSD device
>> +
>> +
>> +   3) Remove an active blkdev<->object mapping.
>> +


>> +	unsigned long		obj_id;
>> +	char			osd_path[0];
>> +};
>> +
>> +static struct class *class_osdblk;		/* /sys/class/osdblk */
>> +static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
>> +static struct osdblk_device *osdblk_devs[OSDBLK_MAX_DEVS];
> 
> Might it not be better to do this as a linked list on the private dev
> structure instead?  This only works if you have one entry
> in /sys/class/osdblock per device because now you have a device private
> pointer to hang it off

converted to list



>> +static int osdblk_get_free_req(struct osdblk_device *osdev)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < OSDBLK_MAX_REQ; i++) {
>> +		if (!osdev->req[i].rq)
>> +			return i;
>> +	}
> 
> Rather than using a static list of outstanding requests, I think you
> could probably use the block tag handling infrastructure for all of this

converted to use blk-tag.c gadgetry

Thanks for the review!

	Jeff



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-03  9:58   ` Jeff Garzik
@ 2009-04-05 10:18     ` Boaz Harrosh
  2009-04-08  1:29       ` Jeff Garzik
  0 siblings, 1 reply; 24+ messages in thread
From: Boaz Harrosh @ 2009-04-05 10:18 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Jens Axboe, LKML, linux-scsi, linux-fsdevel, Andrew Morton

On 04/03/2009 12:58 PM, Jeff Garzik wrote:
> Jens Axboe wrote:
>> This wont work, GFP_NOIO inside the queue lock. You are also only
>> cloning the front bio, what happens if you have > 1 bio on the request?
>> You seem to dequeue the request and complete all of it, regardless of
>> whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
>> because of how the osd_req_{read,write} works, so I'd suggest storing
>> the byte size in your osdblk_request and only completing that in
>> osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
>> only dequeue if you manage to start an osd request for each of them,
>> THEN moving on to the next request.
> 

There is nothing preventing from issuing a linked bio list. The only thing
is that osd_read/write looks at the first bio for total size.
If the first bio->bi_size does not specify the full length of the chain
then we should add another parameter to osd_read/write for that.

The original idea was to specifically allow chained bios.

Please advise?

> Thanks for the review.  Will fix...
> 
> 	Jeff
> 

Thanks
Boaz

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-03  9:38   ` Jeff Garzik
@ 2009-04-05 10:22     ` Boaz Harrosh
  0 siblings, 0 replies; 24+ messages in thread
From: Boaz Harrosh @ 2009-04-05 10:22 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: open-osd mailing-list, LKML, linux-scsi, linux-fsdevel, axboe,
	Andrew Morton

On 04/03/2009 12:38 PM, Jeff Garzik wrote:
> Boaz Harrosh wrote:
>> I have taken that to my heart and will submit patches for that, next week.
>> Including a complimentary patch to this driver. These changes are only
>> intended for 2.6.31 though.
> 
> Consolidation of common code should occur after osdblk is in one of: 
> open-osd.git, scsi-misc.git, or linux-2.6.git.
> 
> That way, the code movement can be consolidated into a single changeset, 
> touching exofs, osdblk and libosd all at the same time...
> 	-exofs code
> 	-osdblk code
> 	+libosd code
> 
> 

Yes my plan exactly. I'll do it on my linux-open-osd tree and push it
through the exofs branch in the next Kernel (2.6.31)

>> I also want to add a small utility that can manage objects, create, size,
>> remove, and mount as a complimentary wrapper for this driver is "osdblk"
>> a good name for such utility?
> 
> osdblk intentionally maintains -zero- metadata on its own.  Therefore, 
> this utility you propose can be completely generic.  You could call it 
> "osdobjutil", because it need not be tied to the osdblk driver.
> 
> The osdblk driver needs the following from the utility:
> 
> - create object of specified size
> 
> - delete object
> 
> and optionally:
> - resize object to new size
> 

In that case then I have on Q, an "osd" application that will support
the full osd API set through command line, like:
usage: osd create|remove|read|write|append|flush... \
	--obj=par_id,obj_id \
	[--set_attr=page_no,attr_no,set_attr_file] \
	[--get_attr=page_no,attr_no,set_attr_file] \
	[--file=in_out_file] \
	...

I intend to fully support any functionality available by the library.
the osd_ktests should be duplicatable in bash

> There is no need for a mount operation, because this is handled through 
> class_osdblk_add()
> 
> 
<snip>
>>> +static void osdblk_osd_complete(struct osd_request *or, void *private)
>>> +{
>>> +	struct osdblk_request *orq = private;
>>> +	struct osd_sense_info osi;
>>> +	int ret = osd_req_decode_sense(or, &osi);
>>> +
>>> +	if (ret)
>>> +		ret = -EIO;
>>> +
>>> +	osd_end_request(or);
>>> +	osdblk_end_request(orq->osdev, orq, ret);
>> should be reversed, very bad things will happen otherwise
>>
>> +	osdblk_end_request(orq->osdev, orq, ret);
>> +	osd_end_request(or);
> 
> Perhaps you are confusing two different 'struct request' in use?
> 
> 	- struct request, passed to osdblk for execution
> 	- struct request, used by libosd to pass commands
> 
> The object lifetime of the struct request stored in 'orq' is longer than 
> the lifetime of the osd_request:
> 
> 	1) block layer passes 'rq' to osdblk
> 	2) osdblk creates new 'or', passes 'or' to libosd
> 	3) libosd calls osdblk completion function
> 	4) osdblk completes 'or'
> 	5) osdblk completes 'rq'
> 
> As you can see, the object lifetime of 'or' is entirely within 'rq'.
> 
> 

Yes I was confused exactly as you described, thanks grate stuff.

> 
> 
>> What can I say, great stuff.
>>
>> OSD is a very clean API, that makes whole subsystems look trivial.
> 
> I appreciate it, thanks for the review.
> 
> 	Jeff

Thank you Jeff for doing this
Boaz

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
                   ` (3 preceding siblings ...)
  2009-04-03  9:49 ` Jens Axboe
@ 2009-04-07  7:26 ` Pavel Machek
  2009-04-07 22:53 ` [PATCH v2] " Jeff Garzik
  5 siblings, 0 replies; 24+ messages in thread
From: Pavel Machek @ 2009-04-07  7:26 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: LKML, linux-scsi, linux-fsdevel, axboe, Andrew Morton

On Wed 2009-04-01 21:54:55, Jeff Garzik wrote:
> 
> As I promised in older exofs threads, here is a client for libosd
> _other_ than exofs.  This block driver exports a single OSD object
> as a Linux block device.
> 
> See the comment block at the top of the driver for usage instructions.

I see it is not yet signed off... but maybe docs should go to
documentation directory?

									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v2] osdblk: a Linux block device for OSD objects
  2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
                   ` (4 preceding siblings ...)
  2009-04-07  7:26 ` Pavel Machek
@ 2009-04-07 22:53 ` Jeff Garzik
  2009-04-10 11:48   ` [PATCH 1/3] block/blk-map.c: blk_rq_append_bio should ensure it's not appending a chain Jeff Garzik
  5 siblings, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-07 22:53 UTC (permalink / raw)
  To: LKML, linux-scsi
  Cc: linux-fsdevel, axboe, Andrew Morton, James.Bottomley, bharrosh, osd-dev

This is a client for libosd.  This block driver exports a single OSD
object as a Linux block device.

See the comment block at the top of the driver for usage instructions.

The major remaining pre-merge FIXME is its handling of bio's, as pointed
out by Jens.  This perhaps requires looking at libosd, and a bit of
discussion w/ Boaz and crew.

Not-yet-signed-off-by: me

---
Other changes since last posting:
- use strict_strtoul (akpm review)
- add barrier support, and issue OSD flush requests
- replace global array of devices with linked list (jejb review)
- fix OSD credentials (boaz rev)
- use block layer tagging (jejb rev)
- use GFP_ATOMIC rather than GFP_NOIO (axboe rev)
- improve comments
- module refcounting

 drivers/block/Kconfig  |   16 +
 drivers/block/Makefile |    1 
 drivers/block/osdblk.c |  630 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 647 insertions(+)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ddea8e4..34722c8 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -298,6 +298,22 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
+config BLK_DEV_OSD
+	tristate "OSD object-as-blkdev support"
+	depends on SCSI_OSD_INITIATOR
+	---help---
+	  Saying Y or M here will allow the exporting of a single SCSI
+	  OSD (object-based storage) object as a Linux block device.
+
+	  For example, if you create a 2G object on an OSD device,
+	  you can then use this module to present that 2G object as
+	  a Linux block device.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called osdblk.
+
+	  If unsure, say N.
+
 config BLK_DEV_SX8
 	tristate "Promise SATA SX8 support"
 	depends on PCI
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 7755a5e..cdaa3f8 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
new file mode 100644
index 0000000..645ccc6
--- /dev/null
+++ b/drivers/block/osdblk.c
@@ -0,0 +1,630 @@
+
+/*
+   osdblk.c -- Export a single SCSI OSD object as a Linux block device
+
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing OSD object.
+
+      In this example, we will use partition id 1234, object id 5678,
+      OSD device /dev/osd1.
+
+      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
+
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate OSD objects.
+
+      $ cat /sys/class/osdblk/list
+      0 174 1234 5678 /dev/osd1
+      1 179 1994 897123 /dev/osd0
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - OSD object partition id
+      - OSD object id
+      - OSD device
+
+
+   3) Remove an active blkdev<->object mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/osdblk/remove
+
+
+   NOTE:  The actual creation and deletion of OSD objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_sec.h>
+
+#define DRV_NAME "osdblk"
+#define PFX DRV_NAME ": "
+
+struct osdblk_device;
+
+enum {
+	OSDBLK_MINORS_PER_MAJOR	= 256,		/* max minors per blkdev */
+	OSDBLK_MAX_REQ		= 32,		/* max parallel requests */
+	OSDBLK_OP_TIMEOUT	= 4 * 60,	/* sync OSD req timeout */
+};
+
+struct osdblk_request {
+	struct request		*rq;		/* blk layer request */
+	struct bio		*bio;		/* cloned bio */
+	struct osdblk_device	*osdev;		/* associated blkdev */
+};
+
+struct osdblk_device {
+	int			id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+	struct request_queue	*q;
+
+	struct osd_dev		*osd;		/* associated OSD */
+
+	char			name[32];	/* blkdev name, e.g. osdblk34 */
+
+	spinlock_t		lock;		/* queue lock */
+
+	struct osd_obj_id	obj;		/* OSD partition, obj id */
+	uint8_t			obj_cred[OSD_CAP_LEN]; /* OSD cred */
+
+	struct osdblk_request	req[OSDBLK_MAX_REQ]; /* request table */
+
+	struct list_head	node;
+
+	char			osd_path[0];	/* OSD device path */
+};
+
+static struct class *class_osdblk;		/* /sys/class/osdblk */
+static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
+static LIST_HEAD(osdblkdev_list);
+
+static struct block_device_operations osdblk_bd_ops = {
+	.owner		= THIS_MODULE,
+};
+
+static const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+/* copied from exofs; move to libosd? */
+static void osd_make_credential(u8 cred_a[OSD_CAP_LEN],
+				const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.  copied from exofs; move to libosd?
+ */
+static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret)
+		return ret;
+
+	ret = osd_execute_request(or);
+
+	/* osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.  copied from exofs; move to libosd?
+ */
+static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, u8 *cred)
+{
+	int ret;
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (ret)
+		return ret;
+
+	ret = osd_execute_request_async(or, async_done, caller_context);
+
+	return ret;
+}
+
+/* copied from exofs; move to libosd? */
+static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
+{
+	struct osd_request *or;
+	struct osd_attr attr;
+	int ret;
+
+	/* start request */
+	or = osd_start_request(osdev->osd, GFP_KERNEL);
+	if (!or)
+		return -ENOMEM;
+
+	/* create a get-attributes(length) request */
+	osd_req_get_attributes(or, &osdev->obj);
+
+	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+
+	/* execute op synchronously */
+	ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
+	if (ret)
+		goto out;
+
+	/* extract length from returned attribute info */
+	attr = g_attr_logical_length;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret)
+		goto out;
+
+	*size_out = get_unaligned_be64(attr.val_ptr);
+
+out:
+	osd_end_request(or);
+	return ret;
+
+}
+
+static void osdblk_end_request(struct osdblk_device *osdev,
+			       struct osdblk_request *orq,
+			       int error)
+{
+	struct request *rq = orq->rq;
+	int rc;
+
+	/* complete request, at block layer */
+	rc = __blk_end_request(rq, error, blk_rq_bytes(rq));
+}
+
+static void osdblk_osd_complete(struct osd_request *or, void *private)
+{
+	struct osdblk_request *orq = private;
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret)
+		ret = -EIO;
+
+	/* complete OSD request */
+	osd_end_request(or);
+
+	/* complete request passed to osdblk by block layer */
+	osdblk_end_request(orq->osdev, orq, ret);
+}
+
+static void osdblk_rq_fn(struct request_queue *q)
+{
+	struct osdblk_device *osdev = q->queuedata;
+	struct request *rq;
+	struct osdblk_request *orq;
+	struct osd_request *or;
+	struct bio *bio;
+	int do_write, do_flush;
+
+	while (1) {
+		/* peek at request from block layer */
+		rq = elv_next_request(q);
+		if (!rq)
+			break;
+
+		/* filter out block requests we don't understand */
+		if (!blk_fs_request(rq) && !blk_barrier_rq(rq)) {
+			end_request(rq, 0);
+			continue;
+		}
+
+		/* deduce our operation (read, write, flush) */
+		/* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
+		 * into a clearly defined set of RPC commands:
+		 * read, write, flush, scsi command, power mgmt req,
+		 * driver-specific, etc.
+		 */
+
+		do_flush = (rq->special == (void *) 0xdeadbeefUL);
+		do_write = (rq_data_dir(rq) == WRITE);
+
+		/* a bio clone to be passed down to OSD request */
+		bio = bio_clone(rq->bio, GFP_ATOMIC);
+		if (!bio)
+			break;
+
+		/* alloc internal OSD request, for OSD command execution */
+		or = osd_start_request(osdev->osd, GFP_ATOMIC);
+		if (!or) {
+			bio_put(bio);
+			break;
+		}
+
+		orq = &osdev->req[rq->tag];
+		orq->rq = rq;
+		orq->bio = bio;
+		orq->osdev = osdev;
+
+		/* init OSD command: flush, write or read */
+		if (do_flush)
+			osd_req_flush_object(or, &osdev->obj,
+					     OSD_CDB_FLUSH_ALL, 0, 0);
+		else if (do_write)
+			osd_req_write(or, &osdev->obj, bio,
+				      rq->sector * 512ULL);
+		else
+			osd_req_read(or, &osdev->obj, bio,
+				     rq->sector * 512ULL);
+
+		/* begin OSD command execution */
+		if (osd_async_op(or, osdblk_osd_complete, orq,
+				 osdev->obj_cred)) {
+			osd_end_request(or);
+			blk_requeue_request(q, rq);
+			bio_put(bio);
+		}
+
+		/* remove the special 'flush' marker, now that the command
+		 * is executing
+		 */
+		rq->special = NULL;
+	}
+}
+
+static void osdblk_prepare_flush(struct request_queue *q, struct request *rq)
+{
+	/* add driver-specific marker, to indicate that this request
+	 * is a flush command
+	 */
+	rq->special = (void *) 0xdeadbeefUL;
+}
+
+static void osdblk_free_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk = osdev->disk;
+
+	if (!disk)
+		return;
+
+	if (disk->flags & GENHD_FL_UP)
+		del_gendisk(disk);
+	if (disk->queue)
+		blk_cleanup_queue(disk->queue);
+	put_disk(disk);
+}
+
+static int osdblk_init_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int rc;
+	u64 obj_size = 0;
+
+	/* contact OSD, request size info about the object being mapped */
+	rc = osdblk_get_obj_size(osdev, &obj_size);
+	if (rc)
+		return rc;
+
+	/* create gendisk info */
+	disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
+	if (!disk)
+		return -ENOMEM;
+
+	sprintf(disk->disk_name, DRV_NAME "/%d", osdev->id);
+	disk->major = osdev->major;
+	disk->first_minor = 0;
+	disk->fops = &osdblk_bd_ops;
+	disk->private_data = osdev;
+
+	/* init rq */
+	q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
+	if (!q) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+
+	/* switch queue to TCQ mode; allocate tag map */
+	rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+	if (rc) {
+		blk_cleanup_queue(q);
+		put_disk(disk);
+		return rc;
+	}
+
+	blk_queue_prep_rq(q, blk_queue_start_tag);
+	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, osdblk_prepare_flush);
+
+	disk->queue = q;
+
+	q->queuedata = osdev;
+
+	osdev->disk = disk;
+	osdev->q = q;
+
+	/* finally, announce the disk to the world */
+	set_capacity(disk, obj_size);
+	add_disk(disk);
+
+	return 0;
+}
+
+/********************************************************************
+  /sys/class/osdblk/
+                     add	map OSD object to blkdev
+                     remove	unmap OSD object
+                     list	show mappings
+ *******************************************************************/
+
+static void class_osdblk_release(struct class *cls)
+{
+	kfree(cls);
+}
+
+static ssize_t class_osdblk_list(struct class *c, char *data)
+{
+	int n = 0;
+	struct list_head *tmp;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &osdblkdev_list) {
+		struct osdblk_device *osdev;
+
+		osdev = list_entry(tmp, struct osdblk_device, node);
+
+		n += sprintf(data+n, "%d %d %llu %llu %s\n",
+			osdev->id,
+			osdev->major,
+			osdev->obj.partition,
+			osdev->obj.id,
+			osdev->osd_path);
+	}
+
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+
+static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count)
+{
+	struct osdblk_device *osdev;
+	ssize_t rc;
+	int irc, new_id = 0;
+	struct list_head *tmp;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	/* new osdblk_device object */
+	osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
+	if (!osdev) {
+		rc = -ENOMEM;
+		goto err_out_mod;
+	}
+
+	/* static osdblk_device initialization */
+	spin_lock_init(&osdev->lock);
+	INIT_LIST_HEAD(&osdev->node);
+
+	/* generate unique id: find highest unique id, add one */
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &osdblkdev_list) {
+		struct osdblk_device *osdev;
+
+		osdev = list_entry(tmp, struct osdblk_device, node);
+		if (osdev->id > new_id)
+			new_id = osdev->id + 1;
+	}
+
+	osdev->id = new_id;
+
+	/* add to global list */
+	list_add_tail(&osdev->node, &osdblkdev_list);
+
+	mutex_unlock(&ctl_mutex);
+
+	/* parse add command */
+	if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
+		   osdev->osd_path) != 3) {
+		rc = -EINVAL;
+		goto err_out_slot;
+	}
+
+	/* initialize rest of new object */
+	sprintf(osdev->name, DRV_NAME "%d", osdev->id);
+
+	/* contact requested OSD */
+	osdev->osd = osduld_path_lookup(osdev->osd_path);
+	if (IS_ERR(osdev->osd)) {
+		rc = PTR_ERR(osdev->osd);
+		goto err_out_slot;
+	}
+
+	/* build OSD credential */
+	osd_make_credential(osdev->obj_cred, &osdev->obj);
+
+	/* register our block device */
+	irc = register_blkdev(0, osdev->name);
+	if (irc < 0) {
+		rc = irc;
+		goto err_out_osd;
+	}
+
+	osdev->major = irc;
+
+	/* set up and announce blkdev mapping */
+	rc = osdblk_init_disk(osdev);
+	if (rc)
+		goto err_out_blkdev;
+
+	return 0;
+
+err_out_blkdev:
+	unregister_blkdev(osdev->major, osdev->name);
+err_out_osd:
+	osduld_put_device(osdev->osd);
+err_out_slot:
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	list_del_init(&osdev->node);
+	mutex_unlock(&ctl_mutex);
+
+	kfree(osdev);
+err_out_mod:
+	module_put(THIS_MODULE);
+	return rc;
+}
+
+static ssize_t class_osdblk_remove(struct class *c, const char *buf,
+					size_t count)
+{
+	struct osdblk_device *osdev = NULL;
+	int target_id, rc;
+	unsigned long ul;
+	struct list_head *tmp;
+
+	rc = strict_strtoul(buf, 10, &ul);
+	if (rc)
+		return rc;
+
+	/* convert to int; abort if we lost anything in the conversion */
+	target_id = (int) ul;
+	if (target_id != ul)
+		return -EINVAL;
+
+	/* remove object from list immediately */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &osdblkdev_list) {
+		osdev = list_entry(tmp, struct osdblk_device, node);
+		if (osdev->id == target_id) {
+			list_del_init(&osdev->node);
+			break;
+		}
+		osdev = NULL;
+	}
+
+	mutex_unlock(&ctl_mutex);
+
+	if (!osdev)
+		return -ENOENT;
+
+	/* clean up and free blkdev and associated OSD connection */
+	osdblk_free_disk(osdev);
+	unregister_blkdev(osdev->major, osdev->name);
+	osduld_put_device(osdev->osd);
+	kfree(osdev);
+
+	/* release module ref */
+	module_put(THIS_MODULE);
+
+	return 0;
+}
+
+static struct class_attribute class_osdblk_attrs[] = {
+	__ATTR(add,	0200, NULL, class_osdblk_add),
+	__ATTR(remove,	0200, NULL, class_osdblk_remove),
+	__ATTR(list,	0444, class_osdblk_list, NULL),
+	__ATTR_NULL
+};
+
+static int osdblk_sysfs_init(void)
+{
+	int ret = 0;
+
+	/*
+	 * create control files in sysfs
+	 * /sys/class/osdblk/...
+	 */
+	class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
+	if (!class_osdblk)
+		return -ENOMEM;
+
+	class_osdblk->name = DRV_NAME;
+	class_osdblk->owner = THIS_MODULE;
+	class_osdblk->class_release = class_osdblk_release;
+	class_osdblk->class_attrs = class_osdblk_attrs;
+
+	ret = class_register(class_osdblk);
+	if (ret) {
+		kfree(class_osdblk);
+		class_osdblk = NULL;
+		printk(PFX "failed to create class osdblk\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void osdblk_sysfs_cleanup(void)
+{
+	if (class_osdblk)
+		class_destroy(class_osdblk);
+	class_osdblk = NULL;
+}
+
+static int __init osdblk_init(void)
+{
+	int rc;
+
+	rc = osdblk_sysfs_init();
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static void __exit osdblk_exit(void)
+{
+	osdblk_sysfs_cleanup();
+}
+
+module_init(osdblk_init);
+module_exit(osdblk_exit);
+

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-05 10:18     ` Boaz Harrosh
@ 2009-04-08  1:29       ` Jeff Garzik
  2009-04-08  5:45         ` Jens Axboe
  0 siblings, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-08  1:29 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: Jens Axboe, LKML, linux-scsi, linux-fsdevel, Andrew Morton

Boaz Harrosh wrote:
> On 04/03/2009 12:58 PM, Jeff Garzik wrote:
>> Jens Axboe wrote:
>>> This wont work, GFP_NOIO inside the queue lock. You are also only
>>> cloning the front bio, what happens if you have > 1 bio on the request?
>>> You seem to dequeue the request and complete all of it, regardless of
>>> whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
>>> because of how the osd_req_{read,write} works, so I'd suggest storing
>>> the byte size in your osdblk_request and only completing that in
>>> osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
>>> only dequeue if you manage to start an osd request for each of them,
>>> THEN moving on to the next request.
> 
> There is nothing preventing from issuing a linked bio list. The only thing
> is that osd_read/write looks at the first bio for total size.
> If the first bio->bi_size does not specify the full length of the chain
> then we should add another parameter to osd_read/write for that.
> 
> The original idea was to specifically allow chained bios.
> 
> Please advise?

As passed to us from the block layer, there is nothing special about the 
size of the first bio, AFAIK.

This seems like a libosd bug?  If you want to support chained bio's, I 
would presume you would either walk the list and sum all sizes, or in 
some other way input the total request size?

	Jeff




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-08  1:29       ` Jeff Garzik
@ 2009-04-08  5:45         ` Jens Axboe
  2009-04-08  6:02           ` Jeff Garzik
  0 siblings, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2009-04-08  5:45 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Boaz Harrosh, LKML, linux-scsi, linux-fsdevel, Andrew Morton

On Tue, Apr 07 2009, Jeff Garzik wrote:
> Boaz Harrosh wrote:
>> On 04/03/2009 12:58 PM, Jeff Garzik wrote:
>>> Jens Axboe wrote:
>>>> This wont work, GFP_NOIO inside the queue lock. You are also only
>>>> cloning the front bio, what happens if you have > 1 bio on the request?
>>>> You seem to dequeue the request and complete all of it, regardless of
>>>> whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
>>>> because of how the osd_req_{read,write} works, so I'd suggest storing
>>>> the byte size in your osdblk_request and only completing that in
>>>> osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
>>>> only dequeue if you manage to start an osd request for each of them,
>>>> THEN moving on to the next request.
>>
>> There is nothing preventing from issuing a linked bio list. The only thing
>> is that osd_read/write looks at the first bio for total size.
>> If the first bio->bi_size does not specify the full length of the chain
>> then we should add another parameter to osd_read/write for that.
>>
>> The original idea was to specifically allow chained bios.
>>
>> Please advise?
>
> As passed to us from the block layer, there is nothing special about the  
> size of the first bio, AFAIK.
>
> This seems like a libosd bug?  If you want to support chained bio's, I  
> would presume you would either walk the list and sum all sizes, or in  
> some other way input the total request size?

Completely agree, if you want to support passing in a chain, you better
make the first bio just part of the chain (not some header bio).

And Jeff, you still have that bio_clone() bug in your v2 posting.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-08  5:45         ` Jens Axboe
@ 2009-04-08  6:02           ` Jeff Garzik
  2009-04-08  6:08             ` Jens Axboe
  0 siblings, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-08  6:02 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Boaz Harrosh, LKML, linux-scsi, linux-fsdevel, Andrew Morton

Jens Axboe wrote:
> On Tue, Apr 07 2009, Jeff Garzik wrote:
>> Boaz Harrosh wrote:
>>> On 04/03/2009 12:58 PM, Jeff Garzik wrote:
>>>> Jens Axboe wrote:
>>>>> This wont work, GFP_NOIO inside the queue lock. You are also only
>>>>> cloning the front bio, what happens if you have > 1 bio on the request?
>>>>> You seem to dequeue the request and complete all of it, regardless of
>>>>> whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
>>>>> because of how the osd_req_{read,write} works, so I'd suggest storing
>>>>> the byte size in your osdblk_request and only completing that in
>>>>> osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
>>>>> only dequeue if you manage to start an osd request for each of them,
>>>>> THEN moving on to the next request.
>>> There is nothing preventing from issuing a linked bio list. The only thing
>>> is that osd_read/write looks at the first bio for total size.
>>> If the first bio->bi_size does not specify the full length of the chain
>>> then we should add another parameter to osd_read/write for that.
>>>
>>> The original idea was to specifically allow chained bios.
>>>
>>> Please advise?
>> As passed to us from the block layer, there is nothing special about the  
>> size of the first bio, AFAIK.
>>
>> This seems like a libosd bug?  If you want to support chained bio's, I  
>> would presume you would either walk the list and sum all sizes, or in  
>> some other way input the total request size?
> 
> Completely agree, if you want to support passing in a chain, you better
> make the first bio just part of the chain (not some header bio).
> 
> And Jeff, you still have that bio_clone() bug in your v2 posting.

Yep, that was noted in the patch description as "The major remaining 
pre-merge FIXME" :)

	Jeff



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] osdblk: a Linux block device for OSD objects
  2009-04-08  6:02           ` Jeff Garzik
@ 2009-04-08  6:08             ` Jens Axboe
  0 siblings, 0 replies; 24+ messages in thread
From: Jens Axboe @ 2009-04-08  6:08 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Boaz Harrosh, LKML, linux-scsi, linux-fsdevel, Andrew Morton

On Wed, Apr 08 2009, Jeff Garzik wrote:
> Jens Axboe wrote:
>> On Tue, Apr 07 2009, Jeff Garzik wrote:
>>> Boaz Harrosh wrote:
>>>> On 04/03/2009 12:58 PM, Jeff Garzik wrote:
>>>>> Jens Axboe wrote:
>>>>>> This wont work, GFP_NOIO inside the queue lock. You are also only
>>>>>> cloning the front bio, what happens if you have > 1 bio on the request?
>>>>>> You seem to dequeue the request and complete all of it, regardless of
>>>>>> whether bio->bi_size == blk_rq_bytes(rq). I'm assuming you have to clone
>>>>>> because of how the osd_req_{read,write} works, so I'd suggest storing
>>>>>> the byte size in your osdblk_request and only completing that in
>>>>>> osdblk_end_request(). Then do a rq_for_each_bio() look in there, and
>>>>>> only dequeue if you manage to start an osd request for each of them,
>>>>>> THEN moving on to the next request.
>>>> There is nothing preventing from issuing a linked bio list. The only thing
>>>> is that osd_read/write looks at the first bio for total size.
>>>> If the first bio->bi_size does not specify the full length of the chain
>>>> then we should add another parameter to osd_read/write for that.
>>>>
>>>> The original idea was to specifically allow chained bios.
>>>>
>>>> Please advise?
>>> As passed to us from the block layer, there is nothing special about 
>>> the  size of the first bio, AFAIK.
>>>
>>> This seems like a libosd bug?  If you want to support chained bio's, 
>>> I  would presume you would either walk the list and sum all sizes, or 
>>> in  some other way input the total request size?
>>
>> Completely agree, if you want to support passing in a chain, you better
>> make the first bio just part of the chain (not some header bio).
>>
>> And Jeff, you still have that bio_clone() bug in your v2 posting.
>
> Yep, that was noted in the patch description as "The major remaining  
> pre-merge FIXME" :)

Indeed it is, sorry for missing that :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 1/3] block/blk-map.c: blk_rq_append_bio should ensure it's not appending a chain
  2009-04-07 22:53 ` [PATCH v2] " Jeff Garzik
@ 2009-04-10 11:48   ` Jeff Garzik
  2009-04-10 11:49     ` [PATCH 2/3] osd_initiator: support bio chains Jeff Garzik
  0 siblings, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-10 11:48 UTC (permalink / raw)
  To: LKML, linux-scsi
  Cc: linux-fsdevel, axboe, Andrew Morton, James.Bottomley, bharrosh, osd-dev


Add a standard bit of list safety.  The other functions called in
blk_rq_append_bio() are written to assume only a single bio is passed to
them.  Therefore, it is an error to _ever_ pass a list to this function.

Enforce this error by ensuring the bio list is guaranteed to only append
a single list item with each blk_rq_append_bio() call.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/block/blk-map.c b/block/blk-map.c
index f103729..a7eeac4 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -19,6 +19,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 	else {
 		rq->biotail->bi_next = bio;
 		rq->biotail = bio;
+		bio->bi_next = NULL;
 
 		rq->data_len += bio->bi_size;
 	}

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/3] osd_initiator: support bio chains
  2009-04-10 11:48   ` [PATCH 1/3] block/blk-map.c: blk_rq_append_bio should ensure it's not appending a chain Jeff Garzik
@ 2009-04-10 11:49     ` Jeff Garzik
  2009-04-10 11:50       ` [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects Jeff Garzik
  2009-04-27 16:02       ` [PATCH 2/3] osd_initiator: support bio chains Boaz Harrosh
  0 siblings, 2 replies; 24+ messages in thread
From: Jeff Garzik @ 2009-04-10 11:49 UTC (permalink / raw)
  To: LKML, linux-scsi
  Cc: linux-fsdevel, axboe, Andrew Morton, James.Bottomley, bharrosh, osd-dev


Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/scsi/osd/osd_initiator.c |   83 +++++++++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 16 deletions(-)

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 2a5f077..2d35d65 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -709,14 +709,36 @@ EXPORT_SYMBOL(osd_req_remove_object);
 	struct osd_obj_id *first, struct osd_obj_id_list *list, unsigned nelem);
 */
 
+static unsigned int bio_chain_size(struct bio *bio)
+{
+	unsigned int chain_size = 0;
+
+	while (bio) {
+		chain_size += bio->bi_size;
+		bio = bio->bi_next;
+	}
+
+	return chain_size;
+}
+
 void osd_req_write(struct osd_request *or,
 	const struct osd_obj_id *obj, struct bio *bio, u64 offset)
 {
-	_osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, bio->bi_size);
+	unsigned int chain_size = 0;
+	struct bio *tmp_bio = bio;
+
+	while (tmp_bio) {
+		chain_size += tmp_bio->bi_size;
+		tmp_bio->bi_rw |= (1 << BIO_RW);
+
+		tmp_bio = tmp_bio->bi_next;
+	}
+
+	_osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, chain_size);
 	WARN_ON(or->out.bio || or->out.total_bytes);
-	bio->bi_rw |= (1 << BIO_RW);
+
 	or->out.bio = bio;
-	or->out.total_bytes = bio->bi_size;
+	or->out.total_bytes = chain_size;
 }
 EXPORT_SYMBOL(osd_req_write);
 
@@ -747,11 +769,21 @@ EXPORT_SYMBOL(osd_req_flush_object);
 void osd_req_read(struct osd_request *or,
 	const struct osd_obj_id *obj, struct bio *bio, u64 offset)
 {
-	_osd_req_encode_common(or, OSD_ACT_READ, obj, offset, bio->bi_size);
+	unsigned int chain_size = 0;
+	struct bio *tmp_bio = bio;
+
+	while (tmp_bio) {
+		chain_size += tmp_bio->bi_size;
+		tmp_bio->bi_rw &= ~(1 << BIO_RW);
+
+		tmp_bio = tmp_bio->bi_next;
+	}
+
+	_osd_req_encode_common(or, OSD_ACT_READ, obj, offset, chain_size);
 	WARN_ON(or->in.bio || or->in.total_bytes);
-	bio->bi_rw &= ~(1 << BIO_RW);
+
 	or->in.bio = bio;
-	or->in.total_bytes = bio->bi_size;
+	or->in.total_bytes = chain_size;
 }
 EXPORT_SYMBOL(osd_req_read);
 
@@ -1176,7 +1208,7 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
 		unsigned pad;
 
 		or->out_data_integ.data_bytes = cpu_to_be64(
-			or->out.bio ? or->out.bio->bi_size : 0);
+			or->out.bio ? bio_chain_size(or->out.bio) : 0);
 		or->out_data_integ.set_attributes_bytes = cpu_to_be64(
 			or->set_attr.total_bytes);
 		or->out_data_integ.get_attributes_bytes = cpu_to_be64(
@@ -1269,6 +1301,7 @@ int osd_finalize_request(struct osd_request *or,
 	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
 	bool has_in, has_out;
 	int ret;
+	struct bio *bio, *tmp;
 
 	if (options & OSD_REQ_FUA)
 		cdbh->options |= OSD_CDB_FUA;
@@ -1292,23 +1325,40 @@ int osd_finalize_request(struct osd_request *or,
 	}
 
 	if (or->out.bio) {
-		ret = blk_rq_append_bio(or->request->q, or->out.req,
-					or->out.bio);
-		if (ret) {
-			OSD_DEBUG("blk_rq_append_bio out failed\n");
-			return ret;
+		bio = or->out.bio;
+		while (bio) {
+			tmp = bio;
+			bio = bio->bi_next;
+
+			ret = blk_rq_append_bio(or->request->q, or->out.req,
+						tmp);
+			if (ret) {
+				OSD_DEBUG("blk_rq_append_bio out failed\n");
+				return ret;
+			}
 		}
 		OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n",
 			_LLU(or->out.total_bytes), or->out.req->data_len);
+
+		or->out.bio = NULL;
 	}
 	if (or->in.bio) {
-		ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio);
-		if (ret) {
-			OSD_DEBUG("blk_rq_append_bio in failed\n");
-			return ret;
+		bio = or->in.bio;
+		while (bio) {
+			tmp = bio;
+			bio = bio->bi_next;
+
+			ret = blk_rq_append_bio(or->request->q, or->in.req,
+						tmp);
+			if (ret) {
+				OSD_DEBUG("blk_rq_append_bio in failed\n");
+				return ret;
+			}
 		}
 		OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n",
 			_LLU(or->in.total_bytes), or->in.req->data_len);
+
+		or->in.bio = NULL;
 	}
 
 	or->out.pad_buff = sg_out_pad_buffer;
@@ -1618,6 +1668,7 @@ void osd_sec_sign_cdb(struct osd_cdb *ocdb __unused, const u8 *cap_key __unused)
 void osd_sec_sign_data(void *data_integ __unused,
 		       struct bio *bio __unused, const u8 *cap_key __unused)
 {
+	/* NOTE: bio is a linked list */
 }
 
 /*

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects
  2009-04-10 11:49     ` [PATCH 2/3] osd_initiator: support bio chains Jeff Garzik
@ 2009-04-10 11:50       ` Jeff Garzik
  2009-04-27 15:59         ` Boaz Harrosh
  2009-04-27 16:02       ` [PATCH 2/3] osd_initiator: support bio chains Boaz Harrosh
  1 sibling, 1 reply; 24+ messages in thread
From: Jeff Garzik @ 2009-04-10 11:50 UTC (permalink / raw)
  To: LKML, linux-scsi
  Cc: linux-fsdevel, axboe, Andrew Morton, James.Bottomley, bharrosh, osd-dev


NOT-Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
Changes since last version:
- support bio chains

Still needs testing with some real OSD devices...


 drivers/block/Kconfig            |   16 
 drivers/block/Makefile           |    1 
 drivers/block/osdblk.c           |  669 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 754 insertions(+), 16 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ddea8e4..34722c8 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -298,6 +298,22 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
+config BLK_DEV_OSD
+	tristate "OSD object-as-blkdev support"
+	depends on SCSI_OSD_INITIATOR
+	---help---
+	  Saying Y or M here will allow the exporting of a single SCSI
+	  OSD (object-based storage) object as a Linux block device.
+
+	  For example, if you create a 2G object on an OSD device,
+	  you can then use this module to present that 2G object as
+	  a Linux block device.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called osdblk.
+
+	  If unsure, say N.
+
 config BLK_DEV_SX8
 	tristate "Promise SATA SX8 support"
 	depends on PCI
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 7755a5e..cdaa3f8 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
new file mode 100644
index 0000000..c7a1bb7
--- /dev/null
+++ b/drivers/block/osdblk.c
@@ -0,0 +1,669 @@
+
+/*
+   osdblk.c -- Export a single SCSI OSD object as a Linux block device
+
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing OSD object.
+
+      In this example, we will use partition id 1234, object id 5678,
+      OSD device /dev/osd1.
+
+      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
+
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate OSD objects.
+
+      $ cat /sys/class/osdblk/list
+      0 174 1234 5678 /dev/osd1
+      1 179 1994 897123 /dev/osd0
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - OSD object partition id
+      - OSD object id
+      - OSD device
+
+
+   3) Remove an active blkdev<->object mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/osdblk/remove
+
+
+   NOTE:  The actual creation and deletion of OSD objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_sec.h>
+
+#define DRV_NAME "osdblk"
+#define PFX DRV_NAME ": "
+
+struct osdblk_device;
+
+enum {
+	OSDBLK_MINORS_PER_MAJOR	= 256,		/* max minors per blkdev */
+	OSDBLK_MAX_REQ		= 32,		/* max parallel requests */
+	OSDBLK_OP_TIMEOUT	= 4 * 60,	/* sync OSD req timeout */
+};
+
+struct osdblk_request {
+	struct request		*rq;		/* blk layer request */
+	struct bio		*bio;		/* cloned bio */
+	struct osdblk_device	*osdev;		/* associated blkdev */
+};
+
+struct osdblk_device {
+	int			id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+	struct request_queue	*q;
+
+	struct osd_dev		*osd;		/* associated OSD */
+
+	char			name[32];	/* blkdev name, e.g. osdblk34 */
+
+	spinlock_t		lock;		/* queue lock */
+
+	struct osd_obj_id	obj;		/* OSD partition, obj id */
+	uint8_t			obj_cred[OSD_CAP_LEN]; /* OSD cred */
+
+	struct osdblk_request	req[OSDBLK_MAX_REQ]; /* request table */
+
+	struct list_head	node;
+
+	char			osd_path[0];	/* OSD device path */
+};
+
+static struct class *class_osdblk;		/* /sys/class/osdblk */
+static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
+static LIST_HEAD(osdblkdev_list);
+
+static struct block_device_operations osdblk_bd_ops = {
+	.owner		= THIS_MODULE,
+};
+
+static const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+/* copied from exofs; move to libosd? */
+static void osd_make_credential(u8 cred_a[OSD_CAP_LEN],
+				const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.  copied from exofs; move to libosd?
+ */
+static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret)
+		return ret;
+
+	ret = osd_execute_request(or);
+
+	/* osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.  copied from exofs; move to libosd?
+ */
+static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, u8 *cred)
+{
+	int ret;
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (ret)
+		return ret;
+
+	ret = osd_execute_request_async(or, async_done, caller_context);
+
+	return ret;
+}
+
+/* copied from exofs; move to libosd? */
+static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
+{
+	struct osd_request *or;
+	struct osd_attr attr;
+	int ret;
+
+	/* start request */
+	or = osd_start_request(osdev->osd, GFP_KERNEL);
+	if (!or)
+		return -ENOMEM;
+
+	/* create a get-attributes(length) request */
+	osd_req_get_attributes(or, &osdev->obj);
+
+	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+
+	/* execute op synchronously */
+	ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
+	if (ret)
+		goto out;
+
+	/* extract length from returned attribute info */
+	attr = g_attr_logical_length;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret)
+		goto out;
+
+	*size_out = get_unaligned_be64(attr.val_ptr);
+
+out:
+	osd_end_request(or);
+	return ret;
+
+}
+
+static void osdblk_end_request(struct osdblk_device *osdev,
+			       struct osdblk_request *orq,
+			       int error)
+{
+	struct request *rq = orq->rq;
+	int rc;
+
+	/* complete request, at block layer */
+	rc = __blk_end_request(rq, error, blk_rq_bytes(rq));
+}
+
+static void osdblk_osd_complete(struct osd_request *or, void *private)
+{
+	struct osdblk_request *orq = private;
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret)
+		ret = -EIO;
+
+	/* complete OSD request */
+	osd_end_request(or);
+
+	/* complete request passed to osdblk by block layer */
+	osdblk_end_request(orq->osdev, orq, ret);
+}
+
+static void bio_chain_put(struct bio *chain)
+{
+	struct bio *tmp;
+
+	while (chain) {
+		tmp = chain;
+		chain = chain->bi_next;
+
+		bio_put(tmp);
+	}
+}
+
+static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
+{
+	struct bio *tmp, *new_chain = NULL, *tail = NULL;
+
+	while (old_chain) {
+		tmp = bio_clone(old_chain, gfpmask);
+		if (!tmp)
+			goto err_out;
+
+		tmp->bi_next = NULL;
+		if (!new_chain)
+			new_chain = tail = tmp;
+		else {
+			tail->bi_next = tmp;
+			tail = tmp;
+		}
+
+		old_chain = old_chain->bi_next;
+	}
+
+	return new_chain;
+
+err_out:
+	bio_chain_put(new_chain);
+	return NULL;
+}
+
+static void osdblk_rq_fn(struct request_queue *q)
+{
+	struct osdblk_device *osdev = q->queuedata;
+	struct request *rq;
+	struct osdblk_request *orq;
+	struct osd_request *or;
+	struct bio *bio;
+	int do_write, do_flush;
+
+	while (1) {
+		/* peek at request from block layer */
+		rq = elv_next_request(q);
+		if (!rq)
+			break;
+
+		/* filter out block requests we don't understand */
+		if (!blk_fs_request(rq) && !blk_barrier_rq(rq)) {
+			end_request(rq, 0);
+			continue;
+		}
+
+		/* deduce our operation (read, write, flush) */
+		/* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
+		 * into a clearly defined set of RPC commands:
+		 * read, write, flush, scsi command, power mgmt req,
+		 * driver-specific, etc.
+		 */
+
+		do_flush = (rq->special == (void *) 0xdeadbeefUL);
+		do_write = (rq_data_dir(rq) == WRITE);
+
+		/* a bio clone to be passed down to OSD request */
+		bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
+		if (!bio)
+			break;
+
+		/* alloc internal OSD request, for OSD command execution */
+		or = osd_start_request(osdev->osd, GFP_ATOMIC);
+		if (!or) {
+			bio_chain_put(bio);
+			break;
+		}
+
+		orq = &osdev->req[rq->tag];
+		orq->rq = rq;
+		orq->bio = bio;
+		orq->osdev = osdev;
+
+		/* init OSD command: flush, write or read */
+		if (do_flush)
+			osd_req_flush_object(or, &osdev->obj,
+					     OSD_CDB_FLUSH_ALL, 0, 0);
+		else if (do_write)
+			osd_req_write(or, &osdev->obj, bio,
+				      rq->sector * 512ULL);
+		else
+			osd_req_read(or, &osdev->obj, bio,
+				     rq->sector * 512ULL);
+
+		/* begin OSD command execution */
+		if (osd_async_op(or, osdblk_osd_complete, orq,
+				 osdev->obj_cred)) {
+			osd_end_request(or);
+			blk_requeue_request(q, rq);
+			bio_chain_put(bio);
+		}
+
+		/* remove the special 'flush' marker, now that the command
+		 * is executing
+		 */
+		rq->special = NULL;
+	}
+}
+
+static void osdblk_prepare_flush(struct request_queue *q, struct request *rq)
+{
+	/* add driver-specific marker, to indicate that this request
+	 * is a flush command
+	 */
+	rq->special = (void *) 0xdeadbeefUL;
+}
+
+static void osdblk_free_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk = osdev->disk;
+
+	if (!disk)
+		return;
+
+	if (disk->flags & GENHD_FL_UP)
+		del_gendisk(disk);
+	if (disk->queue)
+		blk_cleanup_queue(disk->queue);
+	put_disk(disk);
+}
+
+static int osdblk_init_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int rc;
+	u64 obj_size = 0;
+
+	/* contact OSD, request size info about the object being mapped */
+	rc = osdblk_get_obj_size(osdev, &obj_size);
+	if (rc)
+		return rc;
+
+	/* create gendisk info */
+	disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
+	if (!disk)
+		return -ENOMEM;
+
+	sprintf(disk->disk_name, DRV_NAME "/%d", osdev->id);
+	disk->major = osdev->major;
+	disk->first_minor = 0;
+	disk->fops = &osdblk_bd_ops;
+	disk->private_data = osdev;
+
+	/* init rq */
+	q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
+	if (!q) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+
+	/* switch queue to TCQ mode; allocate tag map */
+	rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+	if (rc) {
+		blk_cleanup_queue(q);
+		put_disk(disk);
+		return rc;
+	}
+
+	blk_queue_prep_rq(q, blk_queue_start_tag);
+	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, osdblk_prepare_flush);
+
+	disk->queue = q;
+
+	q->queuedata = osdev;
+
+	osdev->disk = disk;
+	osdev->q = q;
+
+	/* finally, announce the disk to the world */
+	set_capacity(disk, obj_size);
+	add_disk(disk);
+
+	return 0;
+}
+
+/********************************************************************
+  /sys/class/osdblk/
+                     add	map OSD object to blkdev
+                     remove	unmap OSD object
+                     list	show mappings
+ *******************************************************************/
+
+static void class_osdblk_release(struct class *cls)
+{
+	kfree(cls);
+}
+
+static ssize_t class_osdblk_list(struct class *c, char *data)
+{
+	int n = 0;
+	struct list_head *tmp;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &osdblkdev_list) {
+		struct osdblk_device *osdev;
+
+		osdev = list_entry(tmp, struct osdblk_device, node);
+
+		n += sprintf(data+n, "%d %d %llu %llu %s\n",
+			osdev->id,
+			osdev->major,
+			osdev->obj.partition,
+			osdev->obj.id,
+			osdev->osd_path);
+	}
+
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+
+static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count)
+{
+	struct osdblk_device *osdev;
+	ssize_t rc;
+	int irc, new_id = 0;
+	struct list_head *tmp;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	/* new osdblk_device object */
+	osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
+	if (!osdev) {
+		rc = -ENOMEM;
+		goto err_out_mod;
+	}
+
+	/* static osdblk_device initialization */
+	spin_lock_init(&osdev->lock);
+	INIT_LIST_HEAD(&osdev->node);
+
+	/* generate unique id: find highest unique id, add one */
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &osdblkdev_list) {
+		struct osdblk_device *osdev;
+
+		osdev = list_entry(tmp, struct osdblk_device, node);
+		if (osdev->id > new_id)
+			new_id = osdev->id + 1;
+	}
+
+	osdev->id = new_id;
+
+	/* add to global list */
+	list_add_tail(&osdev->node, &osdblkdev_list);
+
+	mutex_unlock(&ctl_mutex);
+
+	/* parse add command */
+	if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
+		   osdev->osd_path) != 3) {
+		rc = -EINVAL;
+		goto err_out_slot;
+	}
+
+	/* initialize rest of new object */
+	sprintf(osdev->name, DRV_NAME "%d", osdev->id);
+
+	/* contact requested OSD */
+	osdev->osd = osduld_path_lookup(osdev->osd_path);
+	if (IS_ERR(osdev->osd)) {
+		rc = PTR_ERR(osdev->osd);
+		goto err_out_slot;
+	}
+
+	/* build OSD credential */
+	osd_make_credential(osdev->obj_cred, &osdev->obj);
+
+	/* register our block device */
+	irc = register_blkdev(0, osdev->name);
+	if (irc < 0) {
+		rc = irc;
+		goto err_out_osd;
+	}
+
+	osdev->major = irc;
+
+	/* set up and announce blkdev mapping */
+	rc = osdblk_init_disk(osdev);
+	if (rc)
+		goto err_out_blkdev;
+
+	return 0;
+
+err_out_blkdev:
+	unregister_blkdev(osdev->major, osdev->name);
+err_out_osd:
+	osduld_put_device(osdev->osd);
+err_out_slot:
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	list_del_init(&osdev->node);
+	mutex_unlock(&ctl_mutex);
+
+	kfree(osdev);
+err_out_mod:
+	module_put(THIS_MODULE);
+	return rc;
+}
+
+static ssize_t class_osdblk_remove(struct class *c, const char *buf,
+					size_t count)
+{
+	struct osdblk_device *osdev = NULL;
+	int target_id, rc;
+	unsigned long ul;
+	struct list_head *tmp;
+
+	rc = strict_strtoul(buf, 10, &ul);
+	if (rc)
+		return rc;
+
+	/* convert to int; abort if we lost anything in the conversion */
+	target_id = (int) ul;
+	if (target_id != ul)
+		return -EINVAL;
+
+	/* remove object from list immediately */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &osdblkdev_list) {
+		osdev = list_entry(tmp, struct osdblk_device, node);
+		if (osdev->id == target_id) {
+			list_del_init(&osdev->node);
+			break;
+		}
+		osdev = NULL;
+	}
+
+	mutex_unlock(&ctl_mutex);
+
+	if (!osdev)
+		return -ENOENT;
+
+	/* clean up and free blkdev and associated OSD connection */
+	osdblk_free_disk(osdev);
+	unregister_blkdev(osdev->major, osdev->name);
+	osduld_put_device(osdev->osd);
+	kfree(osdev);
+
+	/* release module ref */
+	module_put(THIS_MODULE);
+
+	return 0;
+}
+
+static struct class_attribute class_osdblk_attrs[] = {
+	__ATTR(add,	0200, NULL, class_osdblk_add),
+	__ATTR(remove,	0200, NULL, class_osdblk_remove),
+	__ATTR(list,	0444, class_osdblk_list, NULL),
+	__ATTR_NULL
+};
+
+static int osdblk_sysfs_init(void)
+{
+	int ret = 0;
+
+	/*
+	 * create control files in sysfs
+	 * /sys/class/osdblk/...
+	 */
+	class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
+	if (!class_osdblk)
+		return -ENOMEM;
+
+	class_osdblk->name = DRV_NAME;
+	class_osdblk->owner = THIS_MODULE;
+	class_osdblk->class_release = class_osdblk_release;
+	class_osdblk->class_attrs = class_osdblk_attrs;
+
+	ret = class_register(class_osdblk);
+	if (ret) {
+		kfree(class_osdblk);
+		class_osdblk = NULL;
+		printk(PFX "failed to create class osdblk\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void osdblk_sysfs_cleanup(void)
+{
+	if (class_osdblk)
+		class_destroy(class_osdblk);
+	class_osdblk = NULL;
+}
+
+static int __init osdblk_init(void)
+{
+	int rc;
+
+	rc = osdblk_sysfs_init();
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static void __exit osdblk_exit(void)
+{
+	osdblk_sysfs_cleanup();
+}
+
+module_init(osdblk_init);
+module_exit(osdblk_exit);
+

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects
  2009-04-10 11:50       ` [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects Jeff Garzik
@ 2009-04-27 15:59         ` Boaz Harrosh
  2009-04-27 18:24           ` Jens Axboe
  0 siblings, 1 reply; 24+ messages in thread
From: Boaz Harrosh @ 2009-04-27 15:59 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: LKML, linux-scsi, linux-fsdevel, axboe, Andrew Morton,
	James.Bottomley, osd-dev, Tejun Heo

On 04/10/2009 02:50 PM, Jeff Garzik wrote:
> NOT-Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
> ---
> Changes since last version:
> - support bio chains
> 
> Still needs testing with some real OSD devices...
> 
> 
>  drivers/block/Kconfig            |   16 
>  drivers/block/Makefile           |    1 
>  drivers/block/osdblk.c           |  669 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 754 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index ddea8e4..34722c8 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -298,6 +298,22 @@ config BLK_DEV_NBD
>  
>  	  If unsure, say N.
>  
> +config BLK_DEV_OSD
> +	tristate "OSD object-as-blkdev support"
> +	depends on SCSI_OSD_INITIATOR
> +	---help---
> +	  Saying Y or M here will allow the exporting of a single SCSI
> +	  OSD (object-based storage) object as a Linux block device.
> +
> +	  For example, if you create a 2G object on an OSD device,
> +	  you can then use this module to present that 2G object as
> +	  a Linux block device.
> +
> +	  To compile this driver as a module, choose M here: the
> +	  module will be called osdblk.
> +
> +	  If unsure, say N.
> +
>  config BLK_DEV_SX8
>  	tristate "Promise SATA SX8 support"
>  	depends on PCI
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 7755a5e..cdaa3f8 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
>  obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
>  obj-$(CONFIG_MG_DISK)		+= mg_disk.o
>  obj-$(CONFIG_SUNVDC)		+= sunvdc.o
> +obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
>  
>  obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
>  obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
> diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
> new file mode 100644
> index 0000000..c7a1bb7
> --- /dev/null
> +++ b/drivers/block/osdblk.c
> @@ -0,0 +1,669 @@
> +
> +/*
> +   osdblk.c -- Export a single SCSI OSD object as a Linux block device
> +
> +
> +   Copyright 2009 Red Hat, Inc.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published by
> +   the Free Software Foundation.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; see the file COPYING.  If not, write to
> +   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
> +
> +
> +   Instructions for use
> +   --------------------
> +
> +   1) Map a Linux block device to an existing OSD object.
> +
> +      In this example, we will use partition id 1234, object id 5678,
> +      OSD device /dev/osd1.
> +
> +      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
> +
> +
> +   2) List all active blkdev<->object mappings.
> +
> +      In this example, we have performed step #1 twice, creating two blkdevs,
> +      mapped to two separate OSD objects.
> +
> +      $ cat /sys/class/osdblk/list
> +      0 174 1234 5678 /dev/osd1
> +      1 179 1994 897123 /dev/osd0
> +
> +      The columns, in order, are:
> +      - blkdev unique id
> +      - blkdev assigned major
> +      - OSD object partition id
> +      - OSD object id
> +      - OSD device
> +
> +
> +   3) Remove an active blkdev<->object mapping.
> +
> +      In this example, we remove the mapping with blkdev unique id 1.
> +
> +      $ echo 1 > /sys/class/osdblk/remove
> +
> +
> +   NOTE:  The actual creation and deletion of OSD objects is outside the scope
> +   of this driver.
> +
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/device.h>
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <scsi/osd_initiator.h>
> +#include <scsi/osd_attributes.h>
> +#include <scsi/osd_sec.h>
> +
> +#define DRV_NAME "osdblk"
> +#define PFX DRV_NAME ": "
> +
> +struct osdblk_device;
> +
> +enum {
> +	OSDBLK_MINORS_PER_MAJOR	= 256,		/* max minors per blkdev */
> +	OSDBLK_MAX_REQ		= 32,		/* max parallel requests */
> +	OSDBLK_OP_TIMEOUT	= 4 * 60,	/* sync OSD req timeout */
> +};
> +
> +struct osdblk_request {
> +	struct request		*rq;		/* blk layer request */
> +	struct bio		*bio;		/* cloned bio */
> +	struct osdblk_device	*osdev;		/* associated blkdev */
> +};
> +
> +struct osdblk_device {
> +	int			id;		/* blkdev unique id */
> +
> +	int			major;		/* blkdev assigned major */
> +	struct gendisk		*disk;		/* blkdev's gendisk and rq */
> +	struct request_queue	*q;
> +
> +	struct osd_dev		*osd;		/* associated OSD */
> +
> +	char			name[32];	/* blkdev name, e.g. osdblk34 */
> +
> +	spinlock_t		lock;		/* queue lock */
> +
> +	struct osd_obj_id	obj;		/* OSD partition, obj id */
> +	uint8_t			obj_cred[OSD_CAP_LEN]; /* OSD cred */
> +
> +	struct osdblk_request	req[OSDBLK_MAX_REQ]; /* request table */
> +
> +	struct list_head	node;
> +
> +	char			osd_path[0];	/* OSD device path */
> +};
> +
> +static struct class *class_osdblk;		/* /sys/class/osdblk */
> +static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
> +static LIST_HEAD(osdblkdev_list);
> +
> +static struct block_device_operations osdblk_bd_ops = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static const struct osd_attr g_attr_logical_length = ATTR_DEF(
> +	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
> +
> +/* copied from exofs; move to libosd? */

- /* copied from exofs; move to libosd? */

> +static void osd_make_credential(u8 cred_a[OSD_CAP_LEN],
> +				const struct osd_obj_id *obj)
> +{
> +	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
> +}
> +

this member should stay. It will be enhanced in the future with calls
to a network security manager. For now it issues it's own NO_SEC caps.
Perhaps osd_make_credential => osdblk_make_credential?

+ /* copied from exofs; move to libosd? */

> +/*
> + * Perform a synchronous OSD operation.  copied from exofs; move to libosd?
> + */
> +static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
> +{
> +	int ret;
> +
> +	or->timeout = timeout;
> +	ret = osd_finalize_request(or, 0, credential, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = osd_execute_request(or);
> +
> +	/* osd_req_decode_sense(or, ret); */
> +	return ret;
> +}
> +
> +/*
> + * Perform an asynchronous OSD operation.  copied from exofs; move to libosd?
> + */
> +static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
> +		   void *caller_context, u8 *cred)
> +{
> +	int ret;
> +
> +	ret = osd_finalize_request(or, 0, cred, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = osd_execute_request_async(or, async_done, caller_context);
> +
> +	return ret;
> +}
> +

NOTE-TO-ME:
  these two will be changed to plain osd_execute_* where osd_finalize_request
  will be called internally.

> +/* copied from exofs; move to libosd? */
> +static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
> +{
> +	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
> +	void *iter = NULL;
> +	int nelem;
> +
> +	do {
> +		nelem = 1;
> +		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
> +		if ((cur_attr.attr_page == attr->attr_page) &&
> +		    (cur_attr.attr_id == attr->attr_id)) {
> +			attr->len = cur_attr.len;
> +			attr->val_ptr = cur_attr.val_ptr;
> +			return 0;
> +		}
> +	} while (iter);
> +
> +	return -EIO;
> +}
> +

NOTE-TO-ME:
  This is needed because of a bug (short-comings) in osd_req_decode_get_attr_list().
  Once fixed it can be just removed.

> +static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
> +{
> +	struct osd_request *or;
> +	struct osd_attr attr;
> +	int ret;
> +
> +	/* start request */
> +	or = osd_start_request(osdev->osd, GFP_KERNEL);
> +	if (!or)
> +		return -ENOMEM;
> +
> +	/* create a get-attributes(length) request */
> +	osd_req_get_attributes(or, &osdev->obj);
> +
> +	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
> +
> +	/* execute op synchronously */
> +	ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
> +	if (ret)
> +		goto out;
> +
> +	/* extract length from returned attribute info */
> +	attr = g_attr_logical_length;
> +	ret = extract_attr_from_req(or, &attr);
> +	if (ret)
> +		goto out;
> +
> +	*size_out = get_unaligned_be64(attr.val_ptr);
> +
> +out:
> +	osd_end_request(or);
> +	return ret;
> +
> +}
> +
> +static void osdblk_end_request(struct osdblk_device *osdev,
> +			       struct osdblk_request *orq,
> +			       int error)
> +{
> +	struct request *rq = orq->rq;
> +	int rc;
> +
> +	/* complete request, at block layer */
> +	rc = __blk_end_request(rq, error, blk_rq_bytes(rq));
> +}
> +
> +static void osdblk_osd_complete(struct osd_request *or, void *private)
> +{
> +	struct osdblk_request *orq = private;
> +	struct osd_sense_info osi;
> +	int ret = osd_req_decode_sense(or, &osi);
> +
> +	if (ret)
> +		ret = -EIO;
> +
> +	/* complete OSD request */
> +	osd_end_request(or);
> +
> +	/* complete request passed to osdblk by block layer */
> +	osdblk_end_request(orq->osdev, orq, ret);

Only a single user left for osdblk_end_request(). Perhaps it would be
more clear to open code it?

> +}
> +
> +static void bio_chain_put(struct bio *chain)
> +{
> +	struct bio *tmp;
> +
> +	while (chain) {
> +		tmp = chain;
> +		chain = chain->bi_next;
> +
> +		bio_put(tmp);
> +	}
> +}
> +

NOTE-TO-ME:
  blk_bio_put()

> +static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
> +{
> +	struct bio *tmp, *new_chain = NULL, *tail = NULL;
> +
> +	while (old_chain) {
> +		tmp = bio_clone(old_chain, gfpmask);
> +		if (!tmp)
> +			goto err_out;
> +
> +		tmp->bi_next = NULL;
> +		if (!new_chain)
> +			new_chain = tail = tmp;
> +		else {
> +			tail->bi_next = tmp;
> +			tail = tmp;
> +		}
> +
> +		old_chain = old_chain->bi_next;
> +	}
> +
> +	return new_chain;
> +
> +err_out:
> +	bio_chain_put(new_chain);
> +	return NULL;
> +}
> +

NOTE-TO-ME:
  blk_bio_clone()

> +static void osdblk_rq_fn(struct request_queue *q)
> +{
> +	struct osdblk_device *osdev = q->queuedata;
> +	struct request *rq;
> +	struct osdblk_request *orq;
> +	struct osd_request *or;
> +	struct bio *bio;
> +	int do_write, do_flush;
> +
> +	while (1) {
> +		/* peek at request from block layer */
> +		rq = elv_next_request(q);
> +		if (!rq)
> +			break;
> +
> +		/* filter out block requests we don't understand */
> +		if (!blk_fs_request(rq) && !blk_barrier_rq(rq)) {
> +			end_request(rq, 0);
> +			continue;
> +		}
> +
> +		/* deduce our operation (read, write, flush) */
> +		/* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
> +		 * into a clearly defined set of RPC commands:
> +		 * read, write, flush, scsi command, power mgmt req,
> +		 * driver-specific, etc.
> +		 */
> +
> +		do_flush = (rq->special == (void *) 0xdeadbeefUL);

That's for real? or it needs a "FIXME" next to it?

> +		do_write = (rq_data_dir(rq) == WRITE);
> +
> +		/* a bio clone to be passed down to OSD request */
> +		bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
> +		if (!bio)
> +			break;

does blk_barrier_rq() have a rq->bio?

> +
> +		/* alloc internal OSD request, for OSD command execution */
> +		or = osd_start_request(osdev->osd, GFP_ATOMIC);
> +		if (!or) {
> +			bio_chain_put(bio);
> +			break;
> +		}
> +
> +		orq = &osdev->req[rq->tag];
> +		orq->rq = rq;
> +		orq->bio = bio;
> +		orq->osdev = osdev;
> +
> +		/* init OSD command: flush, write or read */
> +		if (do_flush)
> +			osd_req_flush_object(or, &osdev->obj,
> +					     OSD_CDB_FLUSH_ALL, 0, 0);

Is this enough? I mean, osd_req_flush_object+osd_async_op will insure osd-target
writes all it's caches to stable storage. But what about in-flight requests.
Do we need to wait for all requests to finish then issue the flush, or does
the block layer do that for us? (is queue empty at this point?)

> +		else if (do_write)
> +			osd_req_write(or, &osdev->obj, bio,
> +				      rq->sector * 512ULL);
> +		else
> +			osd_req_read(or, &osdev->obj, bio,
> +				     rq->sector * 512ULL);
> +

OK, this is my original mistake, question remains:

Are we guaranteed that blk_rq_bytes(rq) (see osdblk_end_request) is the same
as chained_size(bio). I think that there are corner cases when it might be a bit
shorter (bio-size was rounded up)

I wish we just pass total_bytes to osd_req_read/write and let blk layer sort this
out. Will also save the trouble inside osd_req_read/write too.

so for example:

+			osd_req_read(or, &osdev->obj, bio, blk_rq_bytes(rq),
+				     rq->sector * 512ULL);

(It should be no trouble to change all users)

> +		/* begin OSD command execution */
> +		if (osd_async_op(or, osdblk_osd_complete, orq,
> +				 osdev->obj_cred)) {
> +			osd_end_request(or);
> +			blk_requeue_request(q, rq);
> +			bio_chain_put(bio);
> +		}
> +
> +		/* remove the special 'flush' marker, now that the command
> +		 * is executing
> +		 */
> +		rq->special = NULL;
> +	}
> +}
> +
> +static void osdblk_prepare_flush(struct request_queue *q, struct request *rq)
> +{
> +	/* add driver-specific marker, to indicate that this request
> +	 * is a flush command
> +	 */
> +	rq->special = (void *) 0xdeadbeefUL;

OK now I get it, disregard the comment above

> +}
> +
> +static void osdblk_free_disk(struct osdblk_device *osdev)
> +{
> +	struct gendisk *disk = osdev->disk;
> +
> +	if (!disk)
> +		return;
> +
> +	if (disk->flags & GENHD_FL_UP)
> +		del_gendisk(disk);
> +	if (disk->queue)
> +		blk_cleanup_queue(disk->queue);
> +	put_disk(disk);
> +}
> +
> +static int osdblk_init_disk(struct osdblk_device *osdev)
> +{
> +	struct gendisk *disk;
> +	struct request_queue *q;
> +	int rc;
> +	u64 obj_size = 0;
> +
> +	/* contact OSD, request size info about the object being mapped */
> +	rc = osdblk_get_obj_size(osdev, &obj_size);
> +	if (rc)
> +		return rc;
> +
> +	/* create gendisk info */
> +	disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
> +	if (!disk)
> +		return -ENOMEM;
> +
> +	sprintf(disk->disk_name, DRV_NAME "/%d", osdev->id);
> +	disk->major = osdev->major;
> +	disk->first_minor = 0;
> +	disk->fops = &osdblk_bd_ops;
> +	disk->private_data = osdev;
> +
> +	/* init rq */
> +	q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
> +	if (!q) {
> +		put_disk(disk);
> +		return -ENOMEM;
> +	}
> +
> +	/* switch queue to TCQ mode; allocate tag map */
> +	rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
> +	if (rc) {
> +		blk_cleanup_queue(q);
> +		put_disk(disk);
> +		return rc;
> +	}
> +
> +	blk_queue_prep_rq(q, blk_queue_start_tag);
> +	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, osdblk_prepare_flush);
> +
> +	disk->queue = q;
> +
> +	q->queuedata = osdev;
> +
> +	osdev->disk = disk;
> +	osdev->q = q;
> +
> +	/* finally, announce the disk to the world */
> +	set_capacity(disk, obj_size);
> +	add_disk(disk);
> +
> +	return 0;
> +}
> +
> +/********************************************************************
> +  /sys/class/osdblk/
> +                     add	map OSD object to blkdev
> +                     remove	unmap OSD object
> +                     list	show mappings
> + *******************************************************************/
> +
> +static void class_osdblk_release(struct class *cls)
> +{
> +	kfree(cls);
> +}
> +
> +static ssize_t class_osdblk_list(struct class *c, char *data)
> +{
> +	int n = 0;
> +	struct list_head *tmp;
> +
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +
> +	list_for_each(tmp, &osdblkdev_list) {
> +		struct osdblk_device *osdev;
> +
> +		osdev = list_entry(tmp, struct osdblk_device, node);
> +
> +		n += sprintf(data+n, "%d %d %llu %llu %s\n",
> +			osdev->id,
> +			osdev->major,
> +			osdev->obj.partition,
> +			osdev->obj.id,
> +			osdev->osd_path);
> +	}
> +
> +	mutex_unlock(&ctl_mutex);
> +	return n;
> +}
> +
> +static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count)
> +{
> +	struct osdblk_device *osdev;
> +	ssize_t rc;
> +	int irc, new_id = 0;
> +	struct list_head *tmp;
> +
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	/* new osdblk_device object */
> +	osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
> +	if (!osdev) {
> +		rc = -ENOMEM;
> +		goto err_out_mod;
> +	}
> +
> +	/* static osdblk_device initialization */
> +	spin_lock_init(&osdev->lock);
> +	INIT_LIST_HEAD(&osdev->node);
> +
> +	/* generate unique id: find highest unique id, add one */
> +
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +
> +	list_for_each(tmp, &osdblkdev_list) {
> +		struct osdblk_device *osdev;
> +
> +		osdev = list_entry(tmp, struct osdblk_device, node);
> +		if (osdev->id > new_id)
> +			new_id = osdev->id + 1;
> +	}
> +
> +	osdev->id = new_id;
> +
> +	/* add to global list */
> +	list_add_tail(&osdev->node, &osdblkdev_list);
> +
> +	mutex_unlock(&ctl_mutex);
> +
> +	/* parse add command */
> +	if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
> +		   osdev->osd_path) != 3) {
> +		rc = -EINVAL;
> +		goto err_out_slot;
> +	}
> +
> +	/* initialize rest of new object */
> +	sprintf(osdev->name, DRV_NAME "%d", osdev->id);
> +
> +	/* contact requested OSD */
> +	osdev->osd = osduld_path_lookup(osdev->osd_path);
> +	if (IS_ERR(osdev->osd)) {
> +		rc = PTR_ERR(osdev->osd);
> +		goto err_out_slot;
> +	}
> +
> +	/* build OSD credential */
> +	osd_make_credential(osdev->obj_cred, &osdev->obj);
> +
> +	/* register our block device */
> +	irc = register_blkdev(0, osdev->name);
> +	if (irc < 0) {
> +		rc = irc;
> +		goto err_out_osd;
> +	}
> +
> +	osdev->major = irc;
> +
> +	/* set up and announce blkdev mapping */
> +	rc = osdblk_init_disk(osdev);
> +	if (rc)
> +		goto err_out_blkdev;
> +
> +	return 0;
> +
> +err_out_blkdev:
> +	unregister_blkdev(osdev->major, osdev->name);
> +err_out_osd:
> +	osduld_put_device(osdev->osd);
> +err_out_slot:
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +	list_del_init(&osdev->node);
> +	mutex_unlock(&ctl_mutex);
> +
> +	kfree(osdev);
> +err_out_mod:
> +	module_put(THIS_MODULE);
> +	return rc;
> +}
> +
> +static ssize_t class_osdblk_remove(struct class *c, const char *buf,
> +					size_t count)
> +{
> +	struct osdblk_device *osdev = NULL;
> +	int target_id, rc;
> +	unsigned long ul;
> +	struct list_head *tmp;
> +
> +	rc = strict_strtoul(buf, 10, &ul);
> +	if (rc)
> +		return rc;
> +
> +	/* convert to int; abort if we lost anything in the conversion */
> +	target_id = (int) ul;
> +	if (target_id != ul)
> +		return -EINVAL;
> +
> +	/* remove object from list immediately */
> +	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> +
> +	list_for_each(tmp, &osdblkdev_list) {
> +		osdev = list_entry(tmp, struct osdblk_device, node);
> +		if (osdev->id == target_id) {
> +			list_del_init(&osdev->node);
> +			break;
> +		}
> +		osdev = NULL;
> +	}
> +
> +	mutex_unlock(&ctl_mutex);
> +
> +	if (!osdev)
> +		return -ENOENT;
> +
> +	/* clean up and free blkdev and associated OSD connection */
> +	osdblk_free_disk(osdev);
> +	unregister_blkdev(osdev->major, osdev->name);
> +	osduld_put_device(osdev->osd);
> +	kfree(osdev);
> +
> +	/* release module ref */
> +	module_put(THIS_MODULE);
> +
> +	return 0;
> +}
> +
> +static struct class_attribute class_osdblk_attrs[] = {
> +	__ATTR(add,	0200, NULL, class_osdblk_add),
> +	__ATTR(remove,	0200, NULL, class_osdblk_remove),
> +	__ATTR(list,	0444, class_osdblk_list, NULL),
> +	__ATTR_NULL
> +};
> +
> +static int osdblk_sysfs_init(void)
> +{
> +	int ret = 0;
> +
> +	/*
> +	 * create control files in sysfs
> +	 * /sys/class/osdblk/...
> +	 */
> +	class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
> +	if (!class_osdblk)
> +		return -ENOMEM;
> +
> +	class_osdblk->name = DRV_NAME;
> +	class_osdblk->owner = THIS_MODULE;
> +	class_osdblk->class_release = class_osdblk_release;
> +	class_osdblk->class_attrs = class_osdblk_attrs;
> +
> +	ret = class_register(class_osdblk);
> +	if (ret) {
> +		kfree(class_osdblk);
> +		class_osdblk = NULL;
> +		printk(PFX "failed to create class osdblk\n");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void osdblk_sysfs_cleanup(void)
> +{
> +	if (class_osdblk)
> +		class_destroy(class_osdblk);
> +	class_osdblk = NULL;
> +}
> +
> +static int __init osdblk_init(void)
> +{
> +	int rc;
> +
> +	rc = osdblk_sysfs_init();
> +	if (rc)
> +		return rc;
> +
> +	return 0;
> +}
> +
> +static void __exit osdblk_exit(void)
> +{
> +	osdblk_sysfs_cleanup();
> +}
> +
> +module_init(osdblk_init);
> +module_exit(osdblk_exit);
> +

I think we should push for a couple of new block-layer APIs:

blk_bio_put() - takes care of chained bio's
blk_bio_clone() - clones a chain. Set all directions and sizes properly

and also for osd_initiator we will need something like:
  blk_make_request(bio, total_bytes, ...) 
  That can deal with a chained-bio and sets all sizes and members accordingly.

Perhaps there will be many other users for these APIs?

Also one more comment:
  I'm not sure, but maybe you would want to set all upper request_queue
  alignments, masks and limits to be byte-aligned, maximum aloud, this will
  avoid any kind of un-wanted bouncing at the upper level. If any,
  (not iscsi osd), proper bouncing will be done at the lower-level request_queue
  where the right limitations are known. (and are done once).

Thanks
Boaz

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/3] osd_initiator: support bio chains
  2009-04-10 11:49     ` [PATCH 2/3] osd_initiator: support bio chains Jeff Garzik
  2009-04-10 11:50       ` [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects Jeff Garzik
@ 2009-04-27 16:02       ` Boaz Harrosh
  1 sibling, 0 replies; 24+ messages in thread
From: Boaz Harrosh @ 2009-04-27 16:02 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: LKML, linux-scsi, linux-fsdevel, axboe, Andrew Morton,
	James.Bottomley, osd-dev, Tejun Heo

On 04/10/2009 02:49 PM, Jeff Garzik wrote:
> Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
> ---
>  drivers/scsi/osd/osd_initiator.c |   83 +++++++++++++++++++++++++++++++--------
>  1 file changed, 67 insertions(+), 16 deletions(-)
> 

Some of my comments below will be more clear after reading the 
comments to the next patch

> diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
> index 2a5f077..2d35d65 100644
> --- a/drivers/scsi/osd/osd_initiator.c
> +++ b/drivers/scsi/osd/osd_initiator.c
> @@ -709,14 +709,36 @@ EXPORT_SYMBOL(osd_req_remove_object);
>  	struct osd_obj_id *first, struct osd_obj_id_list *list, unsigned nelem);
>  */
>  
> +static unsigned int bio_chain_size(struct bio *bio)
> +{
> +	unsigned int chain_size = 0;
> +
> +	while (bio) {
> +		chain_size += bio->bi_size;
> +		bio = bio->bi_next;
> +	}
> +
> +	return chain_size;
> +}
> +

This will not be needed see below.

>  void osd_req_write(struct osd_request *or,
>  	const struct osd_obj_id *obj, struct bio *bio, u64 offset)

-	const struct osd_obj_id *obj, struct bio *bio, u64 offset)
+	const struct osd_obj_id *obj, struct bio *bio, u64 total_bytes, u64 offset)

>  {
> -	_osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, bio->bi_size);
> +	unsigned int chain_size = 0;
> +	struct bio *tmp_bio = bio;
> +
> +	while (tmp_bio) {
> +		chain_size += tmp_bio->bi_size;
> +		tmp_bio->bi_rw |= (1 << BIO_RW);
> +
> +		tmp_bio = tmp_bio->bi_next;
> +	}
> +

Is not needed bio->bi_rw should be taken care of, in bio_clone()
and chain_size is total_bytes received.

> +	_osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, chain_size);
>  	WARN_ON(or->out.bio || or->out.total_bytes);
> -	bio->bi_rw |= (1 << BIO_RW);
> +
>  	or->out.bio = bio;
> -	or->out.total_bytes = bio->bi_size;
> +	or->out.total_bytes = chain_size;
>  }
>  EXPORT_SYMBOL(osd_req_write);
>  
> @@ -747,11 +769,21 @@ EXPORT_SYMBOL(osd_req_flush_object);
>  void osd_req_read(struct osd_request *or,
>  	const struct osd_obj_id *obj, struct bio *bio, u64 offset)

same here

>  {
> -	_osd_req_encode_common(or, OSD_ACT_READ, obj, offset, bio->bi_size);
> +	unsigned int chain_size = 0;
> +	struct bio *tmp_bio = bio;
> +
> +	while (tmp_bio) {
> +		chain_size += tmp_bio->bi_size;
> +		tmp_bio->bi_rw &= ~(1 << BIO_RW);
> +
> +		tmp_bio = tmp_bio->bi_next;
> +	}
> +

and here

> +	_osd_req_encode_common(or, OSD_ACT_READ, obj, offset, chain_size);
>  	WARN_ON(or->in.bio || or->in.total_bytes);
> -	bio->bi_rw &= ~(1 << BIO_RW);
> +
>  	or->in.bio = bio;
> -	or->in.total_bytes = bio->bi_size;
> +	or->in.total_bytes = chain_size;
>  }
>  EXPORT_SYMBOL(osd_req_read);
>  
> @@ -1176,7 +1208,7 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
>  		unsigned pad;
>  
>  		or->out_data_integ.data_bytes = cpu_to_be64(
> -			or->out.bio ? or->out.bio->bi_size : 0);
> +			or->out.bio ? bio_chain_size(or->out.bio) : 0);

here we have or->out.total_bytes set properly at this stage
(This will also work well when in the future we will support
 CDB-CONTINUATION which is yet another segment in out buffer)

>  		or->out_data_integ.set_attributes_bytes = cpu_to_be64(
>  			or->set_attr.total_bytes);
>  		or->out_data_integ.get_attributes_bytes = cpu_to_be64(
> @@ -1269,6 +1301,7 @@ int osd_finalize_request(struct osd_request *or,
>  	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
>  	bool has_in, has_out;
>  	int ret;
> +	struct bio *bio, *tmp;
>  
>  	if (options & OSD_REQ_FUA)
>  		cdbh->options |= OSD_CDB_FUA;
> @@ -1292,23 +1325,40 @@ int osd_finalize_request(struct osd_request *or,
>  	}
>  
>  	if (or->out.bio) {
> -		ret = blk_rq_append_bio(or->request->q, or->out.req,
> -					or->out.bio);
> -		if (ret) {
> -			OSD_DEBUG("blk_rq_append_bio out failed\n");
> -			return ret;
> +		bio = or->out.bio;
> +		while (bio) {
> +			tmp = bio;
> +			bio = bio->bi_next;
> +
> +			ret = blk_rq_append_bio(or->request->q, or->out.req,
> +						tmp);
> +			if (ret) {
> +				OSD_DEBUG("blk_rq_append_bio out failed\n");
> +				return ret;
> +			}
>  		}
>  		OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n",
>  			_LLU(or->out.total_bytes), or->out.req->data_len);
> +
> +		or->out.bio = NULL;
>  	}
>  	if (or->in.bio) {
> -		ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio);
> -		if (ret) {
> -			OSD_DEBUG("blk_rq_append_bio in failed\n");
> -			return ret;
> +		bio = or->in.bio;
> +		while (bio) {
> +			tmp = bio;
> +			bio = bio->bi_next;
> +
> +			ret = blk_rq_append_bio(or->request->q, or->in.req,
> +						tmp);
> +			if (ret) {
> +				OSD_DEBUG("blk_rq_append_bio in failed\n");
> +				return ret;
> +			}

OK a loop here is plain hack, but two loops is a blasphemy. Specially over
"wants to die" API. If at all, then it needs an helper that will be later
pushed to block-layer.

I have an RFC in place that makes use of a new blk_make_request(bio, total_bytes, ...)
It should be enhanced to properly support chained-bios.

I will send this patch rebased on top of that RFC so it will be ready for
the osdblk patch.

>  		}
>  		OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n",
>  			_LLU(or->in.total_bytes), or->in.req->data_len);
> +
> +		or->in.bio = NULL;
>  	}
>  
>  	or->out.pad_buff = sg_out_pad_buffer;
> @@ -1618,6 +1668,7 @@ void osd_sec_sign_cdb(struct osd_cdb *ocdb __unused, const u8 *cap_key __unused)
>  void osd_sec_sign_data(void *data_integ __unused,
>  		       struct bio *bio __unused, const u8 *cap_key __unused)
>  {
> +	/* NOTE: bio is a linked list */

Yep, that was the original idea. Please note that even with a single bio submitted at osd_req_read/write
today, at this point they are already chained with up to 4 additional segments.

>  }
>  
>  /*


Thanks Jeff. I now understand what is needed to support chained-bios
Please let me refresh my RFC to block-layer and add this patch to the patchset.

Then your osdblk patch can apply on top of that. That will also create some
needed pressure for this work to be done.

Sincerely yours
Boaz

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects
  2009-04-27 15:59         ` Boaz Harrosh
@ 2009-04-27 18:24           ` Jens Axboe
  2009-04-28  9:40             ` Boaz Harrosh
  0 siblings, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2009-04-27 18:24 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Jeff Garzik, LKML, linux-scsi, linux-fsdevel, Andrew Morton,
	James.Bottomley, osd-dev, Tejun Heo

On Mon, Apr 27 2009, Boaz Harrosh wrote:
> > +static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
> > +{
> > +	struct bio *tmp, *new_chain = NULL, *tail = NULL;
> > +
> > +	while (old_chain) {
> > +		tmp = bio_clone(old_chain, gfpmask);
> > +		if (!tmp)
> > +			goto err_out;
> > +
> > +		tmp->bi_next = NULL;
> > +		if (!new_chain)
> > +			new_chain = tail = tmp;
> > +		else {
> > +			tail->bi_next = tmp;
> > +			tail = tmp;
> > +		}
> > +
> > +		old_chain = old_chain->bi_next;
> > +	}
> > +
> > +	return new_chain;
> > +
> > +err_out:
> > +	bio_chain_put(new_chain);
> > +	return NULL;
> > +}
> > +
> 
> NOTE-TO-ME:
>   blk_bio_clone()

Note to Boaz - this is illegal, unless gfp_mask is GFP_ATOMIC (in which
case you should not pass it in). The only way to make this work is to:

1) Have a private bio pool, and
2) Make sure it has enough reserved entries to populate the chain, and
3) Ensure only a single caller at the time, or entries enough for the N
   users that are allowed. It has to be controlled either way, whether N
   is 1 or larger.

> > +static void osdblk_rq_fn(struct request_queue *q)
> > +{
> > +	struct osdblk_device *osdev = q->queuedata;
> > +	struct request *rq;
> > +	struct osdblk_request *orq;
> > +	struct osd_request *or;
> > +	struct bio *bio;
> > +	int do_write, do_flush;
> > +
> > +	while (1) {
> > +		/* peek at request from block layer */
> > +		rq = elv_next_request(q);
> > +		if (!rq)
> > +			break;
> > +
> > +		/* filter out block requests we don't understand */
> > +		if (!blk_fs_request(rq) && !blk_barrier_rq(rq)) {
> > +			end_request(rq, 0);
> > +			continue;
> > +		}
> > +
> > +		/* deduce our operation (read, write, flush) */
> > +		/* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
> > +		 * into a clearly defined set of RPC commands:
> > +		 * read, write, flush, scsi command, power mgmt req,
> > +		 * driver-specific, etc.
> > +		 */
> > +
> > +		do_flush = (rq->special == (void *) 0xdeadbeefUL);
> 
> That's for real? or it needs a "FIXME" next to it?
> 
> > +		do_write = (rq_data_dir(rq) == WRITE);
> > +
> > +		/* a bio clone to be passed down to OSD request */
> > +		bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
> > +		if (!bio)
> > +			break;
> 
> does blk_barrier_rq() have a rq->bio?

It may or may not.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects
  2009-04-27 18:24           ` Jens Axboe
@ 2009-04-28  9:40             ` Boaz Harrosh
  0 siblings, 0 replies; 24+ messages in thread
From: Boaz Harrosh @ 2009-04-28  9:40 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jeff Garzik, LKML, linux-scsi, linux-fsdevel, Andrew Morton,
	James.Bottomley, osd-dev, Tejun Heo

On 04/27/2009 09:24 PM, Jens Axboe wrote:
> On Mon, Apr 27 2009, Boaz Harrosh wrote:

Jens Hi, Thanks for your reply. Please I need some help with
this.

>>> +static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
>>> +{
>>> +	struct bio *tmp, *new_chain = NULL, *tail = NULL;
>>> +
>>> +	while (old_chain) {
>>> +		tmp = bio_clone(old_chain, gfpmask);
>>> +		if (!tmp)
>>> +			goto err_out;
>>> +
>>> +		tmp->bi_next = NULL;
>>> +		if (!new_chain)
>>> +			new_chain = tail = tmp;
>>> +		else {
>>> +			tail->bi_next = tmp;
>>> +			tail = tmp;
>>> +		}
>>> +
>>> +		old_chain = old_chain->bi_next;
>>> +	}
>>> +
>>> +	return new_chain;
>>> +
>>> +err_out:
>>> +	bio_chain_put(new_chain);
>>> +	return NULL;
>>> +}
>>> +
>> NOTE-TO-ME:
>>   blk_bio_clone()
> 
> Note to Boaz - this is illegal, unless gfp_mask is GFP_ATOMIC (in which
> case you should not pass it in). 

OK it is only used with GFP_ATOMIC.
Theoretically, what if I use a thread and am aloud to sleep,
(and executing requests in serial). Can I then wait on the first bio_clone.

> The only way to make this work is to:
> 

Are you saying that, in the case I'm a swap device? or do you mean that if I
use the above code I can break it for other devices that are swap-devices
and cause a live lock for them.

Currently the OSD library is not swap safe. And even if it was, iSCSI is not.
So there is no option of this ever been swap.

> 1) Have a private bio pool, and

Do I need to manage such a pool internally and use __bio_clone() or is there
an easy way to associate a private bio-pool to a request_queue?

> 2) Make sure it has enough reserved entries to populate the chain

Is there a system limit to that, that I can be sure of? Or just choose
a magic number and if it is exceeded just split the io up in parts?

> 3) Ensure only a single caller at the time, or entries enough for the N
>    users that are allowed. It has to be controlled either way, whether N
>    is 1 or larger.

osdblk_rq_fn (the queue fn) is it serialized to one thread? If not
could I use the queue lock for the duration of the clone?

> 

Thanks for your help
Boaz

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2009-04-28  9:40 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-02  1:54 [PATCH] osdblk: a Linux block device for OSD objects Jeff Garzik
2009-04-02  2:05 ` Jeff Garzik
2009-04-02 12:26 ` Boaz Harrosh
2009-04-02 16:46   ` Jeff Garzik
2009-04-03  9:38   ` Jeff Garzik
2009-04-05 10:22     ` Boaz Harrosh
2009-04-03  1:32 ` James Bottomley
2009-04-03 10:14   ` Jeff Garzik
2009-04-03  9:49 ` Jens Axboe
2009-04-03  9:58   ` Jeff Garzik
2009-04-05 10:18     ` Boaz Harrosh
2009-04-08  1:29       ` Jeff Garzik
2009-04-08  5:45         ` Jens Axboe
2009-04-08  6:02           ` Jeff Garzik
2009-04-08  6:08             ` Jens Axboe
2009-04-07  7:26 ` Pavel Machek
2009-04-07 22:53 ` [PATCH v2] " Jeff Garzik
2009-04-10 11:48   ` [PATCH 1/3] block/blk-map.c: blk_rq_append_bio should ensure it's not appending a chain Jeff Garzik
2009-04-10 11:49     ` [PATCH 2/3] osd_initiator: support bio chains Jeff Garzik
2009-04-10 11:50       ` [PATCH 3/3 v3] osdblk: a Linux block device for OSD objects Jeff Garzik
2009-04-27 15:59         ` Boaz Harrosh
2009-04-27 18:24           ` Jens Axboe
2009-04-28  9:40             ` Boaz Harrosh
2009-04-27 16:02       ` [PATCH 2/3] osd_initiator: support bio chains Boaz Harrosh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.