All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] bcache: Use bcache without formatting existing device
@ 2022-07-04 15:13 Andrea Tomassetti
  2022-07-04 15:29 ` Coly Li
  0 siblings, 1 reply; 17+ messages in thread
From: Andrea Tomassetti @ 2022-07-04 15:13 UTC (permalink / raw)
  To: linux-bcache; +Cc: Coly Li, Kent Overstreet, Zhang Zhen, Andrea Tomassetti

Introducing a bcache control device (/dev/bcache_ctrl)
that allows communicating to the driver from user space
via IOCTL. The only IOCTL command currently implemented
receives a struct cache_sb and uses it to register the
backing device without any need to format it.

Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
---
Hi all,
At Devo we started to think of using bcache in our production systems
to boost performance. But, at the very beginning, we were faced with
one annoying issue, at least for our use-case: bcache needs the backing
devices to be formatted before being able to use them. What's really
needed is just to wipe any FS signature out of them. This is definitely
not an option we will consider in our production systems because we would
need to move hundreds of terabytes of data.

To circumvent the "formatting" problem, in the past weeks I worked on some
modifications to the actual bcache module. In particular, I added a bcache
control device (exported to /dev/bcache_ctrl) that allows communicating to
the driver from userspace via IOCTL. One of the IOCTL commands that I
implemented receives a struct cache_sb and uses it to register the backing
device. The modifications are really small and backward compatible. To then
re-create the same configuration after every boot (because the backing
devices now do not present the bcache super block anymore) I created a
udev rule that invokes a python script that will re-create the same
scenario based on a yaml configuration file.

I'm re-sending this patch without any confidentiality footer, sorry for that.

 drivers/md/bcache/Makefile      |   2 +-
 drivers/md/bcache/control.c     | 117 ++++++++++++++++++++++++++++++++
 drivers/md/bcache/control.h     |  12 ++++
 drivers/md/bcache/ioctl_codes.h |  19 ++++++
 drivers/md/bcache/super.c       |  50 +++++++++++---
 5 files changed, 189 insertions(+), 11 deletions(-)
 create mode 100644 drivers/md/bcache/control.c
 create mode 100644 drivers/md/bcache/control.h
 create mode 100644 drivers/md/bcache/ioctl_codes.h

diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..46ed41baed7a 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)	+= bcache.o

 bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
-	util.o writeback.o features.o
+	util.o writeback.o features.o control.o
diff --git a/drivers/md/bcache/control.c b/drivers/md/bcache/control.c
new file mode 100644
index 000000000000..69b5e554d093
--- /dev/null
+++ b/drivers/md/bcache/control.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+
+#include "control.h"
+
+struct bch_ctrl_device {
+	struct cdev cdev;
+	struct class *class;
+	dev_t dev;
+};
+
+static struct bch_ctrl_device _control_device;
+
+/* this handles IOCTL for /dev/bcache_ctrl */
+/*********************************************/
+static long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
+		unsigned long arg)
+{
+	int retval = 0;
+
+	if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		/* Must be root to issue ioctls */
+		return -EPERM;
+	}
+
+	switch (cmd) {
+	case BCH_IOCTL_REGISTER_DEVICE: {
+		struct bch_register_device *cmd_info;
+
+		cmd_info = vmalloc(sizeof(struct bch_register_device));
+		if (!cmd_info)
+			return -ENOMEM;
+
+		if (copy_from_user(cmd_info, (void __user *)arg,
+				sizeof(struct bch_register_device))) {
+			pr_err("Cannot copy cmd info from user space\n");
+			vfree(cmd_info);
+			return -EINVAL;
+		}
+
+		retval = register_bcache_ioctl(cmd_info);
+
+		vfree(cmd_info);
+		return retval;
+	}
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations _ctrl_dev_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = bch_service_ioctl_ctrl
+};
+
+int __init bch_ctrl_device_init(void)
+{
+	struct bch_ctrl_device *ctrl = &_control_device;
+	struct device *device;
+	int result = 0;
+
+	result = alloc_chrdev_region(&ctrl->dev, 0, 1, "bcache");
+	if (result) {
+		pr_err("Cannot allocate control chrdev number.\n");
+		goto error_alloc_chrdev_region;
+	}
+
+	cdev_init(&ctrl->cdev, &_ctrl_dev_fops);
+
+	result = cdev_add(&ctrl->cdev, ctrl->dev, 1);
+	if (result) {
+		pr_err("Cannot add control chrdev.\n");
+		goto error_cdev_add;
+	}
+
+	ctrl->class = class_create(THIS_MODULE, "bcache");
+	if (IS_ERR(ctrl->class)) {
+		pr_err("Cannot create control chrdev class.\n");
+		result = PTR_ERR(ctrl->class);
+		goto error_class_create;
+	}
+
+	device = device_create(ctrl->class, NULL, ctrl->dev, NULL,
+			"bcache_ctrl");
+	if (IS_ERR(device)) {
+		pr_err("Cannot create control chrdev.\n");
+		result = PTR_ERR(device);
+		goto error_device_create;
+	}
+
+	return result;
+
+error_device_create:
+	class_destroy(ctrl->class);
+error_class_create:
+	cdev_del(&ctrl->cdev);
+error_cdev_add:
+	unregister_chrdev_region(ctrl->dev, 1);
+error_alloc_chrdev_region:
+	return result;
+}
+
+void bch_ctrl_device_deinit(void)
+{
+	struct bch_ctrl_device *ctrl = &_control_device;
+
+	device_destroy(ctrl->class, ctrl->dev);
+	class_destroy(ctrl->class);
+	cdev_del(&ctrl->cdev);
+	unregister_chrdev_region(ctrl->dev, 1);
+}
diff --git a/drivers/md/bcache/control.h b/drivers/md/bcache/control.h
new file mode 100644
index 000000000000..3e4273db02a3
--- /dev/null
+++ b/drivers/md/bcache/control.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCACHE_CONTROL_H__
+#define __BCACHE_CONTROL_H__
+
+#include "ioctl_codes.h"
+
+int __init bch_ctrl_device_init(void);
+void bch_ctrl_device_deinit(void);
+
+ssize_t register_bcache_ioctl(struct bch_register_device *brd);
+
+#endif
diff --git a/drivers/md/bcache/ioctl_codes.h b/drivers/md/bcache/ioctl_codes.h
new file mode 100644
index 000000000000..b004d60c29ff
--- /dev/null
+++ b/drivers/md/bcache/ioctl_codes.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCACHE_IOCTL_CODES_H__
+#define __BCACHE_IOCTL_CODES_H__
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+struct bch_register_device {
+	const char *dev_name;
+	size_t size;
+	struct cache_sb *sb;
+};
+
+#define BCH_IOCTL_MAGIC (0xBC)
+
+/* Register a new backing device */
+#define BCH_IOCTL_REGISTER_DEVICE	_IOWR(BCH_IOCTL_MAGIC, 1, struct bch_register_device)
+
+#endif
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 140f35dc0c45..339a11d00fef 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -14,6 +14,7 @@
 #include "request.h"
 #include "writeback.h"
 #include "features.h"
+#include "control.h"

 #include <linux/blkdev.h>
 #include <linux/pagemap.h>
@@ -340,6 +341,9 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 	struct closure *cl = &dc->sb_write;
 	struct bio *bio = &dc->sb_bio;

+	if (!dc->sb_disk)
+		return;
+
 	down(&dc->sb_write_mutex);
 	closure_init(cl, parent);

@@ -2403,14 +2407,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,

 /* Global interfaces/init */

-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size);
 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
 					 struct kobj_attribute *attr,
 					 const char *buffer, size_t size);

-kobj_attribute_write(register,		register_bcache);
-kobj_attribute_write(register_quiet,	register_bcache);
+kobj_attribute_write(register,		register_bcache_sysfs);
+kobj_attribute_write(register_quiet,	register_bcache_sysfs);
 kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);

 static bool bch_is_open_backing(dev_t dev)
@@ -2465,7 +2469,8 @@ static void register_bdev_worker(struct work_struct *work)
 	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
 	if (!dc) {
 		fail = true;
-		put_page(virt_to_page(args->sb_disk));
+		if (args->sb_disk)
+			put_page(virt_to_page(args->sb_disk));
 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		goto out;
 	}
@@ -2495,7 +2500,8 @@ static void register_cache_worker(struct work_struct *work)
 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 	if (!ca) {
 		fail = true;
-		put_page(virt_to_page(args->sb_disk));
+		if (args->sb_disk)
+			put_page(virt_to_page(args->sb_disk));
 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		goto out;
 	}
@@ -2525,7 +2531,7 @@ static void register_device_async(struct async_reg_args *args)
 	queue_delayed_work(system_wq, &args->reg_work, 10);
 }

-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+static ssize_t register_bcache_common(void *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size)
 {
 	const char *err;
@@ -2587,9 +2593,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (set_blocksize(bdev, 4096))
 		goto out_blkdev_put;

-	err = read_super(sb, bdev, &sb_disk);
-	if (err)
-		goto out_blkdev_put;
+	if (!k) {
+		err = read_super(sb, bdev, &sb_disk);
+		if (err)
+			goto out_blkdev_put;
+	} else {
+		sb_disk =  NULL;
+		memcpy(sb, (struct cache_sb *)k, sizeof(struct cache_sb));
+	}

 	err = "failed to register device";

@@ -2651,7 +2662,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	return size;

 out_put_sb_page:
-	put_page(virt_to_page(sb_disk));
+	if (!k)
+		put_page(virt_to_page(sb_disk));
 out_blkdev_put:
 	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 out_free_sb:
@@ -2666,6 +2678,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	return ret;
 }

+static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
+			       const char *buffer, size_t size)
+{
+	return register_bcache_common(NULL, attr, buffer, size);
+}
+
+ssize_t register_bcache_ioctl(struct bch_register_device *brd)
+{
+	return register_bcache_common((void *)brd->sb, NULL, brd->dev_name, brd->size);
+}
+
+

 struct pdev {
 	struct list_head list;
@@ -2819,6 +2843,7 @@ static void bcache_exit(void)
 {
 	bch_debug_exit();
 	bch_request_exit();
+	bch_ctrl_device_deinit();
 	if (bcache_kobj)
 		kobject_put(bcache_kobj);
 	if (bcache_wq)
@@ -2918,6 +2943,11 @@ static int __init bcache_init(void)
 	bch_debug_init();
 	closure_debug_init();

+	if (bch_ctrl_device_init()) {
+		pr_err("Cannot initialize control device\n");
+		goto err;
+	}
+
 	bcache_is_reboot = false;

 	return 0;
--
2.37.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-04 15:13 [PATCH] bcache: Use bcache without formatting existing device Andrea Tomassetti
@ 2022-07-04 15:29 ` Coly Li
  2022-07-05  8:48   ` Andrea Tomassetti
  0 siblings, 1 reply; 17+ messages in thread
From: Coly Li @ 2022-07-04 15:29 UTC (permalink / raw)
  To: Andrea Tomassetti; +Cc: linux-bcache, Kent Overstreet, Zhang Zhen



> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> 
> Introducing a bcache control device (/dev/bcache_ctrl)
> that allows communicating to the driver from user space
> via IOCTL. The only IOCTL commands currently implemented,
> receives a struct cache_sb and uses it to register the
> backing device without any need of formatting them.
> 
> Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
> ---
> Hi all,
> At Devo we started to think of using bcache in our production systems
> to boost performance. But, at the very beginning, we were faced with
> one annoying issue, at least for our use-case: bcache needs the backing
> devices to be formatted before being able to use them. What's really
> needed is just to wipe any FS signature out of them. This is definitely
> not an option we will consider in our production systems because we would
> need to move hundreds of terabytes of data.
> 
> To circumvent the "formatting" problem, in the past weeks I worked on some
> modifications to the actual bcache module. In particular, I added a bcache
> control device (exported to /dev/bcache_ctrl) that allows communicating to
> the driver from userspace via IOCTL. One of the IOCTL commands that I
> implemented receives a struct cache_sb and uses it to register the backing
> device. The modifications are really small and retro compatible. To then
> re-create the same configuration after every boot (because the backing
> devices now do not present the bcache super block anymore) I created an
> udev rule that invokes a python script that will re-create the same
> scenario based on a yaml configuration file.
> 
> I'm re-sending this patch without any confidentiality footer, sorry for that.

Thanks for removing that confidential and legal statement, that’s the reason I was not able to reply your email.

Back to the patch, I don’t support this idea. For the problem you are solving, people have indeed been using the device mapper linear target for many years, and it works perfectly without any code modification.

That is, create an 8K image and set it up as a loop device, then write a dm table to combine it with the existing hard drive. Then, when you run “bcache make -B <combined dm target>”, you will get a bcache device whose first 8K is in the loop device and whose existing super block on the hard drive is located at the expected offset.

It is unnecessary to create a new IOCTL interface, and I feel the way it works is really unconvincing because of the potential security risk.

Thanks.

Coly Li

> 
> drivers/md/bcache/Makefile      |   2 +-
> drivers/md/bcache/control.c     | 117 ++++++++++++++++++++++++++++++++
> drivers/md/bcache/control.h     |  12 ++++
> drivers/md/bcache/ioctl_codes.h |  19 ++++++
> drivers/md/bcache/super.c       |  50 +++++++++++---
> 5 files changed, 189 insertions(+), 11 deletions(-)
> create mode 100644 drivers/md/bcache/control.c
> create mode 100644 drivers/md/bcache/control.h
> create mode 100644 drivers/md/bcache/ioctl_codes.h
> 
> diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
> index 5b87e59676b8..46ed41baed7a 100644
> --- a/drivers/md/bcache/Makefile
> +++ b/drivers/md/bcache/Makefile
> @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)	+= bcache.o
> 
> bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
> 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
> -	util.o writeback.o features.o
> +	util.o writeback.o features.o control.o
> diff --git a/drivers/md/bcache/control.c b/drivers/md/bcache/control.c
> new file mode 100644
> index 000000000000..69b5e554d093
> --- /dev/null
> +++ b/drivers/md/bcache/control.c
> @@ -0,0 +1,117 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cdev.h>
> +#include <linux/fs.h>
> +#include <linux/vmalloc.h>
> +
> +#include "control.h"
> +
> +struct bch_ctrl_device {
> +	struct cdev cdev;
> +	struct class *class;
> +	dev_t dev;
> +};
> +
> +static struct bch_ctrl_device _control_device;
> +
> +/* this handles IOCTL for /dev/bcache_ctrl */
> +/*********************************************/
> +static long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
> +		unsigned long arg)
> +{
> +	int retval = 0;
> +
> +	if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
> +		return -EINVAL;
> +
> +	if (!capable(CAP_SYS_ADMIN)) {
> +		/* Must be root to issue ioctls */
> +		return -EPERM;
> +	}
> +
> +	switch (cmd) {
> +	case BCH_IOCTL_REGISTER_DEVICE: {
> +		struct bch_register_device *cmd_info;
> +
> +		cmd_info = vmalloc(sizeof(struct bch_register_device));
> +		if (!cmd_info)
> +			return -ENOMEM;
> +
> +		if (copy_from_user(cmd_info, (void __user *)arg,
> +				sizeof(struct bch_register_device))) {
> +			pr_err("Cannot copy cmd info from user space\n");
> +			vfree(cmd_info);
> +			return -EINVAL;
> +		}
> +
> +		retval = register_bcache_ioctl(cmd_info);
> +
> +		vfree(cmd_info);
> +		return retval;
> +	}
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static const struct file_operations _ctrl_dev_fops = {
> +	.owner = THIS_MODULE,
> +	.unlocked_ioctl = bch_service_ioctl_ctrl
> +};
> +
> +int __init bch_ctrl_device_init(void)
> +{
> +	struct bch_ctrl_device *ctrl = &_control_device;
> +	struct device *device;
> +	int result = 0;
> +
> +	result = alloc_chrdev_region(&ctrl->dev, 0, 1, "bcache");
> +	if (result) {
> +		pr_err("Cannot allocate control chrdev number.\n");
> +		goto error_alloc_chrdev_region;
> +	}
> +
> +	cdev_init(&ctrl->cdev, &_ctrl_dev_fops);
> +
> +	result = cdev_add(&ctrl->cdev, ctrl->dev, 1);
> +	if (result) {
> +		pr_err("Cannot add control chrdev.\n");
> +		goto error_cdev_add;
> +	}
> +
> +	ctrl->class = class_create(THIS_MODULE, "bcache");
> +	if (IS_ERR(ctrl->class)) {
> +		pr_err("Cannot create control chrdev class.\n");
> +		result = PTR_ERR(ctrl->class);
> +		goto error_class_create;
> +	}
> +
> +	device = device_create(ctrl->class, NULL, ctrl->dev, NULL,
> +			"bcache_ctrl");
> +	if (IS_ERR(device)) {
> +		pr_err("Cannot create control chrdev.\n");
> +		result = PTR_ERR(device);
> +		goto error_device_create;
> +	}
> +
> +	return result;
> +
> +error_device_create:
> +	class_destroy(ctrl->class);
> +error_class_create:
> +	cdev_del(&ctrl->cdev);
> +error_cdev_add:
> +	unregister_chrdev_region(ctrl->dev, 1);
> +error_alloc_chrdev_region:
> +	return result;
> +}
> +
> +void bch_ctrl_device_deinit(void)
> +{
> +	struct bch_ctrl_device *ctrl = &_control_device;
> +
> +	device_destroy(ctrl->class, ctrl->dev);
> +	class_destroy(ctrl->class);
> +	cdev_del(&ctrl->cdev);
> +	unregister_chrdev_region(ctrl->dev, 1);
> +}
> diff --git a/drivers/md/bcache/control.h b/drivers/md/bcache/control.h
> new file mode 100644
> index 000000000000..3e4273db02a3
> --- /dev/null
> +++ b/drivers/md/bcache/control.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BCACHE_CONTROL_H__
> +#define __BCACHE_CONTROL_H__
> +
> +#include "ioctl_codes.h"
> +
> +int __init bch_ctrl_device_init(void);
> +void bch_ctrl_device_deinit(void);
> +
> +ssize_t register_bcache_ioctl(struct bch_register_device *brd);
> +
> +#endif
> diff --git a/drivers/md/bcache/ioctl_codes.h b/drivers/md/bcache/ioctl_codes.h
> new file mode 100644
> index 000000000000..b004d60c29ff
> --- /dev/null
> +++ b/drivers/md/bcache/ioctl_codes.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BCACHE_IOCTL_CODES_H__
> +#define __BCACHE_IOCTL_CODES_H__
> +
> +#include <linux/ioctl.h>
> +#include <linux/types.h>
> +
> +struct bch_register_device {
> +	const char *dev_name;
> +	size_t size;
> +	struct cache_sb *sb;
> +};
> +
> +#define BCH_IOCTL_MAGIC (0xBC)
> +
> +/* Register a new backing device */
> +#define BCH_IOCTL_REGISTER_DEVICE	_IOWR(BCH_IOCTL_MAGIC, 1, struct bch_register_device)
> +
> +#endif
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 140f35dc0c45..339a11d00fef 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -14,6 +14,7 @@
> #include "request.h"
> #include "writeback.h"
> #include "features.h"
> +#include "control.h"
> 
> #include <linux/blkdev.h>
> #include <linux/pagemap.h>
> @@ -340,6 +341,9 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
> 	struct closure *cl = &dc->sb_write;
> 	struct bio *bio = &dc->sb_bio;
> 
> +	if (!dc->sb_disk)
> +		return;
> +
> 	down(&dc->sb_write_mutex);
> 	closure_init(cl, parent);
> 
> @@ -2403,14 +2407,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
> 
> /* Global interfaces/init */
> 
> -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> +static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
> 			       const char *buffer, size_t size);
> static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
> 					 struct kobj_attribute *attr,
> 					 const char *buffer, size_t size);
> 
> -kobj_attribute_write(register,		register_bcache);
> -kobj_attribute_write(register_quiet,	register_bcache);
> +kobj_attribute_write(register,		register_bcache_sysfs);
> +kobj_attribute_write(register_quiet,	register_bcache_sysfs);
> kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);
> 
> static bool bch_is_open_backing(dev_t dev)
> @@ -2465,7 +2469,8 @@ static void register_bdev_worker(struct work_struct *work)
> 	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
> 	if (!dc) {
> 		fail = true;
> -		put_page(virt_to_page(args->sb_disk));
> +		if (args->sb_disk)
> +			put_page(virt_to_page(args->sb_disk));
> 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
> 		goto out;
> 	}
> @@ -2495,7 +2500,8 @@ static void register_cache_worker(struct work_struct *work)
> 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> 	if (!ca) {
> 		fail = true;
> -		put_page(virt_to_page(args->sb_disk));
> +		if (args->sb_disk)
> +			put_page(virt_to_page(args->sb_disk));
> 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
> 		goto out;
> 	}
> @@ -2525,7 +2531,7 @@ static void register_device_async(struct async_reg_args *args)
> 	queue_delayed_work(system_wq, &args->reg_work, 10);
> }
> 
> -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> +static ssize_t register_bcache_common(void *k, struct kobj_attribute *attr,
> 			       const char *buffer, size_t size)
> {
> 	const char *err;
> @@ -2587,9 +2593,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> 	if (set_blocksize(bdev, 4096))
> 		goto out_blkdev_put;
> 
> -	err = read_super(sb, bdev, &sb_disk);
> -	if (err)
> -		goto out_blkdev_put;
> +	if (!k) {
> +		err = read_super(sb, bdev, &sb_disk);
> +		if (err)
> +			goto out_blkdev_put;
> +	} else {
> +		sb_disk =  NULL;
> +		memcpy(sb, (struct cache_sb *)k, sizeof(struct cache_sb));
> +	}
> 
> 	err = "failed to register device";
> 
> @@ -2651,7 +2662,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> 	return size;
> 
> out_put_sb_page:
> -	put_page(virt_to_page(sb_disk));
> +	if (!k)
> +		put_page(virt_to_page(sb_disk));
> out_blkdev_put:
> 	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
> out_free_sb:
> @@ -2666,6 +2678,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> 	return ret;
> }
> 
> +static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
> +			       const char *buffer, size_t size)
> +{
> +	return register_bcache_common(NULL, attr, buffer, size);
> +}
> +
> +ssize_t register_bcache_ioctl(struct bch_register_device *brd)
> +{
> +	return register_bcache_common((void *)brd->sb, NULL, brd->dev_name, brd->size);
> +}
> +
> +
> 
> struct pdev {
> 	struct list_head list;
> @@ -2819,6 +2843,7 @@ static void bcache_exit(void)
> {
> 	bch_debug_exit();
> 	bch_request_exit();
> +	bch_ctrl_device_deinit();
> 	if (bcache_kobj)
> 		kobject_put(bcache_kobj);
> 	if (bcache_wq)
> @@ -2918,6 +2943,11 @@ static int __init bcache_init(void)
> 	bch_debug_init();
> 	closure_debug_init();
> 
> +	if (bch_ctrl_device_init()) {
> +		pr_err("Cannot initialize control device\n");
> +		goto err;
> +	}
> +
> 	bcache_is_reboot = false;
> 
> 	return 0;
> --
> 2.37.0
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-04 15:29 ` Coly Li
@ 2022-07-05  8:48   ` Andrea Tomassetti
  2022-07-05 14:04     ` Coly Li
  0 siblings, 1 reply; 17+ messages in thread
From: Andrea Tomassetti @ 2022-07-05  8:48 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-bcache, Kent Overstreet, Zhang Zhen

On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
>
>
>
> > 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> >
> > Introducing a bcache control device (/dev/bcache_ctrl)
> > that allows communicating to the driver from user space
> > via IOCTL. The only IOCTL commands currently implemented,
> > receives a struct cache_sb and uses it to register the
> > backing device without any need of formatting them.
> >
> > Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
> > ---
> > Hi all,
> > At Devo we started to think of using bcache in our production systems
> > to boost performance. But, at the very beginning, we were faced with
> > one annoying issue, at least for our use-case: bcache needs the backing
> > devices to be formatted before being able to use them. What's really
> > needed is just to wipe any FS signature out of them. This is definitely
> > not an option we will consider in our production systems because we would
> > need to move hundreds of terabytes of data.
> >
> > To circumvent the "formatting" problem, in the past weeks I worked on some
> > modifications to the actual bcache module. In particular, I added a bcache
> > control device (exported to /dev/bcache_ctrl) that allows communicating to
> > the driver from userspace via IOCTL. One of the IOCTL commands that I
> > implemented receives a struct cache_sb and uses it to register the backing
> > device. The modifications are really small and retro compatible. To then
> > re-create the same configuration after every boot (because the backing
> > devices now do not present the bcache super block anymore) I created an
> > udev rule that invokes a python script that will re-create the same
> > scenario based on a yaml configuration file.
> >
> > I'm re-sending this patch without any confidentiality footer, sorry for that.
>
> Thanks for removing that confidential and legal statement, that’s the reason I was not able to reply your email.
>
Thank you for your patience and sorry for the newbie mistake.
> Back to the patch, I don’t support this idea. For the problem you are solving, indeed people uses device mapper linear target for many years, and it works perfectly without any code modification.
>
> That is, create a 8K image and set it as a loop device, then write a dm table to combine it with the existing hard drive. Then you run “bcache make -B <combined dm target>”, you will get a bcache device whose first 8K in the loop device and existing super block of the hard driver located at expected offset.
>
We evaluated this option but we weren't satisfied by the outcomes for,
at least, two main reasons: complexity and operational drawbacks.
On the complexity side: in production we use more than 20 HDs that
need to be cached. That means we need to create 20+ headers for them, 20+
loop devices and 20+ dm-linear devices. So, basically, there's a 3x
factor for each HD that we want to cache. Moreover, we're currently
using ephemeral disks as cache devices. In case of a machine reboot,
ephemeral devices can get lost; at this point, there will be some trouble
mounting the dm-linear bcache backing device because there will be no
cache device.

For the operational drawbacks: from time to time, we exploit the
online filesystem resize capability of XFS to increase the volume
size. This would be at least tedious, if not nearly impossible, if the
volume is mapped inside a dm-linear.
> It is unnecessary to create a new IOCTL interface, and I feel the way how it works is really unconvinced for potential security risk.
>
Which potential security risks concern you?

Thank you,
Andrea
> Thanks.
>
> Coly Li
>
> >
> > drivers/md/bcache/Makefile      |   2 +-
> > drivers/md/bcache/control.c     | 117 ++++++++++++++++++++++++++++++++
> > drivers/md/bcache/control.h     |  12 ++++
> > drivers/md/bcache/ioctl_codes.h |  19 ++++++
> > drivers/md/bcache/super.c       |  50 +++++++++++---
> > 5 files changed, 189 insertions(+), 11 deletions(-)
> > create mode 100644 drivers/md/bcache/control.c
> > create mode 100644 drivers/md/bcache/control.h
> > create mode 100644 drivers/md/bcache/ioctl_codes.h
> >
> > diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
> > index 5b87e59676b8..46ed41baed7a 100644
> > --- a/drivers/md/bcache/Makefile
> > +++ b/drivers/md/bcache/Makefile
> > @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)  += bcache.o
> >
> > bcache-y              := alloc.o bset.o btree.o closure.o debug.o extents.o\
> >       io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
> > -     util.o writeback.o features.o
> > +     util.o writeback.o features.o control.o
> > diff --git a/drivers/md/bcache/control.c b/drivers/md/bcache/control.c
> > new file mode 100644
> > index 000000000000..69b5e554d093
> > --- /dev/null
> > +++ b/drivers/md/bcache/control.c
> > @@ -0,0 +1,117 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/cdev.h>
> > +#include <linux/fs.h>
> > +#include <linux/vmalloc.h>
> > +
> > +#include "control.h"
> > +
> > +struct bch_ctrl_device {
> > +     struct cdev cdev;
> > +     struct class *class;
> > +     dev_t dev;
> > +};
> > +
> > +static struct bch_ctrl_device _control_device;
> > +
> > +/* this handles IOCTL for /dev/bcache_ctrl */
> > +/*********************************************/
> > +static long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
> > +             unsigned long arg)
> > +{
> > +     int retval = 0;
> > +
> > +     if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
> > +             return -EINVAL;
> > +
> > +     if (!capable(CAP_SYS_ADMIN)) {
> > +             /* Must be root to issue ioctls */
> > +             return -EPERM;
> > +     }
> > +
> > +     switch (cmd) {
> > +     case BCH_IOCTL_REGISTER_DEVICE: {
> > +             struct bch_register_device *cmd_info;
> > +
> > +             cmd_info = vmalloc(sizeof(struct bch_register_device));
> > +             if (!cmd_info)
> > +                     return -ENOMEM;
> > +
> > +             if (copy_from_user(cmd_info, (void __user *)arg,
> > +                             sizeof(struct bch_register_device))) {
> > +                     pr_err("Cannot copy cmd info from user space\n");
> > +                     vfree(cmd_info);
> > +                     return -EINVAL;
> > +             }
> > +
> > +             retval = register_bcache_ioctl(cmd_info);
> > +
> > +             vfree(cmd_info);
> > +             return retval;
> > +     }
> > +
> > +     default:
> > +             return -EINVAL;
> > +     }
> > +}
> > +
> > +static const struct file_operations _ctrl_dev_fops = {
> > +     .owner = THIS_MODULE,
> > +     .unlocked_ioctl = bch_service_ioctl_ctrl
> > +};
> > +
> > +int __init bch_ctrl_device_init(void)
> > +{
> > +     struct bch_ctrl_device *ctrl = &_control_device;
> > +     struct device *device;
> > +     int result = 0;
> > +
> > +     result = alloc_chrdev_region(&ctrl->dev, 0, 1, "bcache");
> > +     if (result) {
> > +             pr_err("Cannot allocate control chrdev number.\n");
> > +             goto error_alloc_chrdev_region;
> > +     }
> > +
> > +     cdev_init(&ctrl->cdev, &_ctrl_dev_fops);
> > +
> > +     result = cdev_add(&ctrl->cdev, ctrl->dev, 1);
> > +     if (result) {
> > +             pr_err("Cannot add control chrdev.\n");
> > +             goto error_cdev_add;
> > +     }
> > +
> > +     ctrl->class = class_create(THIS_MODULE, "bcache");
> > +     if (IS_ERR(ctrl->class)) {
> > +             pr_err("Cannot create control chrdev class.\n");
> > +             result = PTR_ERR(ctrl->class);
> > +             goto error_class_create;
> > +     }
> > +
> > +     device = device_create(ctrl->class, NULL, ctrl->dev, NULL,
> > +                     "bcache_ctrl");
> > +     if (IS_ERR(device)) {
> > +             pr_err("Cannot create control chrdev.\n");
> > +             result = PTR_ERR(device);
> > +             goto error_device_create;
> > +     }
> > +
> > +     return result;
> > +
> > +error_device_create:
> > +     class_destroy(ctrl->class);
> > +error_class_create:
> > +     cdev_del(&ctrl->cdev);
> > +error_cdev_add:
> > +     unregister_chrdev_region(ctrl->dev, 1);
> > +error_alloc_chrdev_region:
> > +     return result;
> > +}
> > +
> > +void bch_ctrl_device_deinit(void)
> > +{
> > +     struct bch_ctrl_device *ctrl = &_control_device;
> > +
> > +     device_destroy(ctrl->class, ctrl->dev);
> > +     class_destroy(ctrl->class);
> > +     cdev_del(&ctrl->cdev);
> > +     unregister_chrdev_region(ctrl->dev, 1);
> > +}
> > diff --git a/drivers/md/bcache/control.h b/drivers/md/bcache/control.h
> > new file mode 100644
> > index 000000000000..3e4273db02a3
> > --- /dev/null
> > +++ b/drivers/md/bcache/control.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __BCACHE_CONTROL_H__
> > +#define __BCACHE_CONTROL_H__
> > +
> > +#include "ioctl_codes.h"
> > +
> > +int __init bch_ctrl_device_init(void);
> > +void bch_ctrl_device_deinit(void);
> > +
> > +ssize_t register_bcache_ioctl(struct bch_register_device *brd);
> > +
> > +#endif
> > diff --git a/drivers/md/bcache/ioctl_codes.h b/drivers/md/bcache/ioctl_codes.h
> > new file mode 100644
> > index 000000000000..b004d60c29ff
> > --- /dev/null
> > +++ b/drivers/md/bcache/ioctl_codes.h
> > @@ -0,0 +1,19 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __BCACHE_IOCTL_CODES_H__
> > +#define __BCACHE_IOCTL_CODES_H__
> > +
> > +#include <linux/ioctl.h>
> > +#include <linux/types.h>
> > +
> > +struct bch_register_device {
> > +     const char *dev_name;
> > +     size_t size;
> > +     struct cache_sb *sb;
> > +};
> > +
> > +#define BCH_IOCTL_MAGIC (0xBC)
> > +
> > +/* Register a new backing device */
> > +#define BCH_IOCTL_REGISTER_DEVICE    _IOWR(BCH_IOCTL_MAGIC, 1, struct bch_register_device)
> > +
> > +#endif
> > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> > index 140f35dc0c45..339a11d00fef 100644
> > --- a/drivers/md/bcache/super.c
> > +++ b/drivers/md/bcache/super.c
> > @@ -14,6 +14,7 @@
> > #include "request.h"
> > #include "writeback.h"
> > #include "features.h"
> > +#include "control.h"
> >
> > #include <linux/blkdev.h>
> > #include <linux/pagemap.h>
> > @@ -340,6 +341,9 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
> >       struct closure *cl = &dc->sb_write;
> >       struct bio *bio = &dc->sb_bio;
> >
> > +     if (!dc->sb_disk)
> > +             return;
> > +
> >       down(&dc->sb_write_mutex);
> >       closure_init(cl, parent);
> >
> > @@ -2403,14 +2407,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
> >
> > /* Global interfaces/init */
> >
> > -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> > +static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
> >                              const char *buffer, size_t size);
> > static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
> >                                        struct kobj_attribute *attr,
> >                                        const char *buffer, size_t size);
> >
> > -kobj_attribute_write(register,               register_bcache);
> > -kobj_attribute_write(register_quiet, register_bcache);
> > +kobj_attribute_write(register,               register_bcache_sysfs);
> > +kobj_attribute_write(register_quiet, register_bcache_sysfs);
> > kobj_attribute_write(pendings_cleanup,        bch_pending_bdevs_cleanup);
> >
> > static bool bch_is_open_backing(dev_t dev)
> > @@ -2465,7 +2469,8 @@ static void register_bdev_worker(struct work_struct *work)
> >       dc = kzalloc(sizeof(*dc), GFP_KERNEL);
> >       if (!dc) {
> >               fail = true;
> > -             put_page(virt_to_page(args->sb_disk));
> > +             if (args->sb_disk)
> > +                     put_page(virt_to_page(args->sb_disk));
> >               blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
> >               goto out;
> >       }
> > @@ -2495,7 +2500,8 @@ static void register_cache_worker(struct work_struct *work)
> >       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> >       if (!ca) {
> >               fail = true;
> > -             put_page(virt_to_page(args->sb_disk));
> > +             if (args->sb_disk)
> > +                     put_page(virt_to_page(args->sb_disk));
> >               blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
> >               goto out;
> >       }
> > @@ -2525,7 +2531,7 @@ static void register_device_async(struct async_reg_args *args)
> >       queue_delayed_work(system_wq, &args->reg_work, 10);
> > }
> >
> > -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> > +static ssize_t register_bcache_common(void *k, struct kobj_attribute *attr,
> >                              const char *buffer, size_t size)
> > {
> >       const char *err;
> > @@ -2587,9 +2593,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> >       if (set_blocksize(bdev, 4096))
> >               goto out_blkdev_put;
> >
> > -     err = read_super(sb, bdev, &sb_disk);
> > -     if (err)
> > -             goto out_blkdev_put;
> > +     if (!k) {
> > +             err = read_super(sb, bdev, &sb_disk);
> > +             if (err)
> > +                     goto out_blkdev_put;
> > +     } else {
> > +             sb_disk =  NULL;
> > +             memcpy(sb, (struct cache_sb *)k, sizeof(struct cache_sb));
> > +     }
> >
> >       err = "failed to register device";
> >
> > @@ -2651,7 +2662,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> >       return size;
> >
> > out_put_sb_page:
> > -     put_page(virt_to_page(sb_disk));
> > +     if (!k)
> > +             put_page(virt_to_page(sb_disk));
> > out_blkdev_put:
> >       blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
> > out_free_sb:
> > @@ -2666,6 +2678,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> >       return ret;
> > }
> >
> > +static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
> > +                            const char *buffer, size_t size)
> > +{
> > +     return register_bcache_common(NULL, attr, buffer, size);
> > +}
> > +
> > +ssize_t register_bcache_ioctl(struct bch_register_device *brd)
> > +{
> > +     return register_bcache_common((void *)brd->sb, NULL, brd->dev_name, brd->size);
> > +}
> > +
> > +
> >
> > struct pdev {
> >       struct list_head list;
> > @@ -2819,6 +2843,7 @@ static void bcache_exit(void)
> > {
> >       bch_debug_exit();
> >       bch_request_exit();
> > +     bch_ctrl_device_deinit();
> >       if (bcache_kobj)
> >               kobject_put(bcache_kobj);
> >       if (bcache_wq)
> > @@ -2918,6 +2943,11 @@ static int __init bcache_init(void)
> >       bch_debug_init();
> >       closure_debug_init();
> >
> > +     if (bch_ctrl_device_init()) {
> > +             pr_err("Cannot initialize control device\n");
> > +             goto err;
> > +     }
> > +
> >       bcache_is_reboot = false;
> >
> >       return 0;
> > --
> > 2.37.0
> >
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-05  8:48   ` Andrea Tomassetti
@ 2022-07-05 14:04     ` Coly Li
  2022-07-05 22:03       ` Eric Wheeler
  0 siblings, 1 reply; 17+ messages in thread
From: Coly Li @ 2022-07-05 14:04 UTC (permalink / raw)
  To: Andrea Tomassetti; +Cc: linux-bcache, Kent Overstreet, Zhang Zhen



> 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> 
> On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
>> 
>> 
>> 
>>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>> 
>>> Introducing a bcache control device (/dev/bcache_ctrl)
>>> that allows communicating to the driver from user space
>>> via IOCTL. The only IOCTL commands currently implemented,
>>> receives a struct cache_sb and uses it to register the
>>> backing device without any need of formatting them.
>>> 
>>> Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
>>> ---
>>> Hi all,
>>> At Devo we started to think of using bcache in our production systems
>>> to boost performance. But, at the very beginning, we were faced with
>>> one annoying issue, at least for our use-case: bcache needs the backing
>>> devices to be formatted before being able to use them. What's really
>>> needed is just to wipe any FS signature out of them. This is definitely
>>> not an option we will consider in our production systems because we would
>>> need to move hundreds of terabytes of data.
>>> 
>>> To circumvent the "formatting" problem, in the past weeks I worked on some
>>> modifications to the actual bcache module. In particular, I added a bcache
>>> control device (exported to /dev/bcache_ctrl) that allows communicating to
>>> the driver from userspace via IOCTL. One of the IOCTL commands that I
>>> implemented receives a struct cache_sb and uses it to register the backing
>>> device. The modifications are really small and retro compatible. To then
>>> re-create the same configuration after every boot (because the backing
>>> devices now do not present the bcache super block anymore) I created an
>>> udev rule that invokes a python script that will re-create the same
>>> scenario based on a yaml configuration file.
>>> 
>>> I'm re-sending this patch without any confidentiality footer, sorry for that.
>> 
>> Thanks for removing that confidential and legal statement, that’s the reason I was not able to reply your email.
>> 
> Thank you for your patience and sorry for the newbie mistake.
>> Back to the patch, I don’t support this idea. For the problem you are solving, indeed people uses device mapper linear target for many years, and it works perfectly without any code modification.
>> 
>> That is, create a 8K image and set it as a loop device, then write a dm table to combine it with the existing hard drive. Then you run “bcache make -B <combined dm target>”, you will get a bcache device whose first 8K in the loop device and existing super block of the hard driver located at expected offset.
>> 
> We evaluated this option but we weren't satisfied by the outcomes for,
> at least, two main reasons: complexity and operational drawbacks.
> For the complexity side: in production we use more than 20 HD that
> need to be cached. It means we need to create 20+ header for them, 20+
> loop devices and 20+ dm linear devices. So, basically, there's a 3x
> factor for each HD that we want to cache. Moreover, we're currently
> using ephemeral disks as cache devices. In case of a machine reboot,
> ephemeral devices can get lost; at this point, there will be some trouble
> to mount the dm-linear bcache backing device because there will be no
> cache device.

OK, I get your point. It might make sense for your situation, although I know some other people have used a linear dm-table to solve a similar situation; that approach may not be perfect for you.
This patch may work in your procedure, but there are following things make me uncomfortable,
- Copying user space memory and directly using it as a complicated in-kernel memory object.
- A new IOCTL interface is added, even though all the rest of the interfaces are sysfs based.
- Doing things in kernel space while there is a solution in user space.

All the above opinions are quite personal to myself; I don’t say you are wrong or I am correct. If the patch works, that’s cool and you can use it as you want, but I don’t support merging it upstream.

> 
> For the operational drawbacks: from time to time, we exploit the
> online filesystem resize capability of XFS to increase the volume
> size. This would be at least tedious, if not nearly impossible, if the
> volume is mapped inside a dm-linear.

Currently bcache doesn’t support cache or backing device resize. I don’t see the connection between the above statement and feeding an in-memory super block via an IOCTL command.


>> It is unnecessary to create a new IOCTL interface, and I feel the way how it works is really unconvinced for potential security risk.
>> 
> Which potential security risks concern you?
> 

Currently we don’t check all members of struct cache_sb_disk; what we do is simply trust bcache-tools. Creating a new IOCTL interface to send user space memory into kernel space as a superblock needs quite a lot of checking code to make sure it won’t panic the kernel. It is possible, but it is not worth adding so much code for this purpose, especially adding it into the upstream code.

I am not able to provide an explicit example of a security risk; it is just that the newly added interface makes me not so confident.

Thanks.

Coly Li




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-05 14:04     ` Coly Li
@ 2022-07-05 22:03       ` Eric Wheeler
  2022-07-07  9:56         ` Andrea Tomassetti
  0 siblings, 1 reply; 17+ messages in thread
From: Eric Wheeler @ 2022-07-05 22:03 UTC (permalink / raw)
  To: Coly Li; +Cc: Andrea Tomassetti, linux-bcache, Kent Overstreet, Zhang Zhen

[-- Attachment #1: Type: text/plain, Size: 9230 bytes --]

On Tue, 5 Jul 2022, Coly Li wrote:
> > 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
> >>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> >>> Introducing a bcache control device (/dev/bcache_ctrl) that allows 
> >>> communicating to the driver from user space via IOCTL. The only 
> >>> IOCTL commands currently implemented, receives a struct cache_sb and 
> >>> uses it to register the backing device without any need of 
> >>> formatting them.
> >>>
> >> Back to the patch, I don’t support this idea. For the problem you are 
> >> solving, indeed people uses device mapper linear target for many 
> >> years, and it works perfectly without any code modification.
> >> 
> >> That is, create a 8K image and set it as a loop device, then write a 
> >> dm table to combine it with the existing hard drive. Then you run 
> >> “bcache make -B <combined dm target>”, you will get a bcache device 
> >> whose first 8K in the loop device and existing super block of the 
> >> hard driver located at expected offset.
> >> 
> > We evaluated this option but we weren't satisfied by the outcomes for, 
> > at least, two main reasons: complexity and operational drawbacks. For 
> > the complexity side: in production we use more than 20 HD that need to 
> > be cached. It means we need to create 20+ header for them, 20+ loop 
> > devices and 20+ dm linear devices. So, basically, there's a 3x factor 
> > for each HD that we want to cache. Moreover, we're currently using 
> > ephemeral disks as cache devices. In case of a machine reboot, 
> > ephemeral devices can get lost; at this point, there will be some 
> > trouble to mount the dm-linear bcache backing device because there 
> > will be no cache device. 
>
> OK, I get your point. It might make sense for your situation, although I 
> know some other people use the linear dm-table to solve similar 
> situation but may be not perfect for you. This patch may work in your 
> procedure, but there are following things make me uncomfortable,

Coly is right: there are some issues to address.  

Coly, I do not wish to contradict you, only to add that we have had times 
where this would be useful. I like the idea, particularly avoiding placing 
dm-linear atop of the bdev which reduces the IO codepath.  Maybe there is 
an implementation that would fit everyone's needs.

For us, such a superblock-less registration could resolve two issues:

1. Most commonly we wish to add bcache to an existing device without
   re-formatting and without adding the dm-linear complexity.

2. Relatedly, IMHO, I've never liked the default location at 8k because we 
   like to align our bdev's to the RAID stride width and rarely is the 
   bdev array aligned at 8k (usually 64k or 256k for hardware RAID).  If 
   we forget to pass the --data-offset at make-bcache time then we are 
   stuck with an 8k-misalignment for the life of the device.

In #2 we usually reformat the volume to avoid dm-linear complexity (and in 
both cases we have wanted writeback cache support).  This process can take 
a while to `dd` ~30TBs of bdev on spinning disks and we have to find 
temporary disk space to move that data out and back in.

> - Copying a user space memory and directly using it as a complicated in-kernel memory object.

In the ioctl changes for bch_write_bdev_super() there does not appear to 
be a way to handle attaching caches to the running bcache.  For example:

1. Add a cacheless bcache0 volume via ioctl
2. Attach a writeback cache, write some data.
3. Unregister everything
4. Re-register the _dirty_ bdev from #1

In this scenario bcache will start "running" immediately because it 
cannot update its cset.uuid (as reported by bcache-super-show) which I 
believe is stored in cache_sb.set_uuid.  

I suppose in step #2 you could update your userspace state with the 
cset.uuid so your userspace ioctl register tool would put the cset.uuid in 
place, but this is fragile without good userspace integration.

I could imagine something like /etc/bcachetab (like crypttab or 
mdadm.conf) that maps cset UUID's to bdev UUIDs.  Then your userspace 
ioctl tool could be included in a udev script to auto-load volumes as they 
become available.

You want to make sure that when a dirty writeback-cached bdev is 
registered that it does not activate until either:

  1. The cache is attached
  2. echo 1 > /sys/block/bcache0/bcache/running

> - A new IOCTL interface added, even all rested interface are sysfs based.

True.  Perhaps something like this would be better, and it would avoid 
sharing a userspace page for blindly populating `struct cache_sb`, too:

  echo '/dev/bdev [bdev_uuid] [/dev/cachedev|cset_uuid] [options]' > \
          /sys/fs/bcache/register_raw

Because of the writeback issue above, the cache and bdev either need to be 
registered simultaneously or the cset uuid need to be specified.

> - Do things in kernel space while there is solution in user space.
> 
> All the above opinions are quite personal to myself, I don’t say you are 
> wrong or I am correct. If the patch works, that’s cool and you can use 
> it as you want, but I don’t support it to be in upstream.

An alternate implementation might be to create a dm-bcache target.  The 
core bcache code could be left alone except a few EXPORT_SYMBOL()'s so 
dm-bcache can reach the bcache registration bits.  

This would:
  * Minimally impact the bcache code base
  * Solve the blind-populating of `struct cache_sb` issue in the same way 
    as `register_raw` could work above.
  * Create a nicely segmented codebase (dm target) to upstream separately 
    through the DM team.
  * Could be maintained cleanly out-of-tree because the bcache 
    registration interfaces change very infrequently.
  * bdev resize could be done with a `dmsetup replace` but you'll need to 
    add resize support as below.

> > For the operational drawbacks: from time to time, we exploit the
> > online filesystem resize capability of XFS to increase the volume
> > size. This would be at least tedious, if not nearly impossible, if the
> > volume is mapped inside a dm-linear.
>
> Currently bcache doesn’t support cache or backing device resize. I don’t 
> see the connection between above statement and feeding an in-memory 
> super block via IOCTL command.

Resize is perhaps unrelated, so if you decide to tackle bdev or cachedev 
resize then please start a new list thread.  Briefly: supporting bdev 
resize is probably easy.  I've looked through the code a few times with 
this in mind but haven't had time.

Here's the summary, not sure if there are any other 
considerations:

  1. Create a sysfs file like /sys/block/bcache0/bcache/resize to trigger 
     resize on echo 1 >.
  2. Refactor the set_capacity() bits from  bcache_device_init() into its 
     own function.
  3. Put locks around bcache_device.full_dirty_stripes and 
     bcache_device.stripe_sectors_dirty.  Re-alloc+copy+free and zero the 
     new bytes at the end.

The cachedev's don't know anything about the bdev size, so (according to 
Kent) they will "just work" by referencing new offsets that didn't exist 
before when IOs come through.  (This is basically the same as resizing the 
bdev and then unregister/re-register which is how we resize bdevs now.)

As for resizing a cachedev, I've not looked at all---not sure about that 
one.  We always detach, resize, make-bcache and re-attach the new cache.  
Maybe it is similarly simple, but haven't looked.
 
> >> It is unnecessary to create a new IOCTL interface, and I feel the way 
> >> how it works is really unconvinced for potential security risk.
> >> 
> > Which potential security risks concern you?
> > 
> 
> Currently we don’t check all members of struct cache_sb_disk, what we do 
> is to simply trust bcache-tools. Create a new IOCTL interface to send a 
> user space memory into kernel space as superblock, that needs quite a 
> lot of checking code to make sure it won’t panic the kernel. It is 
> possible, but it is not worthy to add so many code for the purpose, 
> especially adding them into upstream code.
> 
> I am not able to provide an explicit example for security risk, just the 
> new adding interface makes me not so confident.

Maintaining a blind structure population from a userspace page would be 
difficult as well because even if the kernel validates _everything_ in 
cache_sb today, anyone extending `struct cache_sb` needs to remember to 
add checks for that. Failing to enforce validation inside the kernel could 
lead to kernel crashes or data corruption from userspace which is of 
course never good.

We always assume that, somehow, someone could leverage an insecure IOCTL 
call and crash the OS when they shouldn't be allowed to (even if they are 
not root, like from sudo).  This is a security issue from an assurance and 
integrity point of view, even if there might be neither an obvious 
privilege escalation nor privacy concern.

-Eric

 
> Thanks.
> 
> Coly Li
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-05 22:03       ` Eric Wheeler
@ 2022-07-07  9:56         ` Andrea Tomassetti
  2022-07-12 20:29           ` Eric Wheeler
  0 siblings, 1 reply; 17+ messages in thread
From: Andrea Tomassetti @ 2022-07-07  9:56 UTC (permalink / raw)
  To: Eric Wheeler; +Cc: Coly Li, linux-bcache, Kent Overstreet, Zhang Zhen

Hi Eric and Coly,
thank you very much for the interesting discussion.

Let me give you a little bit more context that maybe will help you
better understand the specific use case that this patch is trying to
tackle.
As I stated in a previous email, we're currently using ephemeral disks
as cache devices so, for us, using bcache in writeback mode is out of
discussion because it could lead us to data loss.
Moreover, I fully understand the dangerous implications of registering
a backing device without formatting it and using it in wb mode. That's
why this patch is meant to register backing devices *only* (no need to
register a cache device without formatting it) and in wt mode *only*.
I'm not saying it cannot be used to register backing devices in wb
mode, I'm just saying that it needs further and deeper analysis on the
implications.
Sorry if this context wasn't clear enough.

On Wed, Jul 6, 2022 at 12:03 AM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
>
> On Tue, 5 Jul 2022, Coly Li wrote:
> > > 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > > On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
> > >>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > >>> Introducing a bcache control device (/dev/bcache_ctrl) that allows
> > >>> communicating to the driver from user space via IOCTL. The only
> > >>> IOCTL commands currently implemented, receives a struct cache_sb and
> > >>> uses it to register the backing device without any need of
> > >>> formatting them.
> > >>>
> > >> Back to the patch, I don’t support this idea. For the problem you are
> > >> solving, indeed people uses device mapper linear target for many
> > >> years, and it works perfectly without any code modification.
> > >>
> > >> That is, create a 8K image and set it as a loop device, then write a
> > >> dm table to combine it with the existing hard drive. Then you run
> > >> “bcache make -B <combined dm target>”, you will get a bcache device
> > >> whose first 8K in the loop device and existing super block of the
> > >> hard driver located at expected offset.
> > >>
> > > We evaluated this option but we weren't satisfied by the outcomes for,
> > > at least, two main reasons: complexity and operational drawbacks. For
> > > the complexity side: in production we use more than 20 HD that need to
> > > be cached. It means we need to create 20+ header for them, 20+ loop
> > > devices and 20+ dm linear devices. So, basically, there's a 3x factor
> > > for each HD that we want to cache. Moreover, we're currently using
> > > ephemeral disks as cache devices. In case of a machine reboot,
> > > ephemeral devices can get lost; at this point, there will be some
> > > trouble to mount the dm-linear bcache backing device because there
> > > will be no cache device.
> >
> > OK, I get your point. It might make sense for your situation, although I
> > know some other people use the linear dm-table to solve similar
> > situation but may be not perfect for you. This patch may work in your
> > procedure, but there are following things make me uncomfortable,
>
> Coly is right: there are some issues to address.
>
> Coly, I do not wish to contradict you, only to add that we have had times
> where this would be useful. I like the idea, particularly avoiding placing
> dm-linear atop of the bdev which reduces the IO codepath.  Maybe there is
> an implementation that would fit everyone's needs.
>
> For us, such a superblock-less registration could resolve two issues:
>
> 1. Most commonly we wish to add bcache to an existing device without
>    re-formatting and without adding the dm-linear complexity.
>
That's exactly what was preventing us from using bcache in production
prior to this patch.

> 2. Relatedly, IMHO, I've never liked the default location at 8k because we
>    like to align our bdev's to the RAID stride width and rarely is the
>    bdev array aligned at 8k (usually 64k or 256k for hardware RAID).  If
>    we forget to pass the --data-offset at make-bcache time then we are
>    stuck with an 8k-misalignment for the life of the device.
>
> In #2 we usually reformat the volume to avoid dm-linear complexity (and in
> both cases we have wanted writeback cache support).  This process can take
> a while to `dd` ~30TBs of bdev on spinning disks and we have to find
> temporary disk space to move that data out and back in.
>
> > - Copying a user space memory and directly using it as a complicated in-kernel memory object.
>
> In the ioctl changes for bch_write_bdev_super() there does not appear to
> be a way to handle attaching caches to the running bcache.  For example:
>
> 1. Add a cacheless bcache0 volume via ioctl
> 2. Attach a writeback cache, write some data.
> 3. Unregister everything
> 4. Re-register the _dirty_ bdev from #1
>
> In this scenario bcache will start "running" immediately because it
> cannot update its cset.uuid (as reported by bcache-super-show) which I
> believe is stored in cache_sb.set_uuid.
>
> I suppose in step #2 you could update your userspace state with the
> cset.uuid so your userspace ioctl register tool would put the cset.uuid in
> place, but this is fragile without good userspace integration.
>
> I could imagine something like /etc/bcachetab (like crypttab or
> mdadm.conf) that maps cset UUID's to bdev UUIDs.  Then your userspace
> ioctl tool could be included in a udev script to auto-load volumes as they
> become available.
Yes, in conjunction with this patch, I developed a python udev script
that manages device registration based on a YAML configuration file. I
even patched bcache-tools to support the new IOCTL registration. You
will find a link to the Github project at the end of this message.

>
> You want to make sure that when a dirty writeback-cached bdev is
> registered that it does not activate until either:
>
>   1. The cache is attached
>   2. echo 1 > /sys/block/bcache0/bcache/running
>
> > - A new IOCTL interface added, even all rested interface are sysfs based.
>
> True.  Perhaps something like this would be better, and it would avoid
> sharing a userspace page for blindly populating `struct cache_sb`, too:
>
>   echo '/dev/bdev [bdev_uuid] [/dev/cachedev|cset_uuid] [options]' > \
>           /sys/fs/bcache/register_raw
>
I thought that introducing an IOCTL interface could be a stopper for
this patch, so I'm more than welcome to refactor it using sysfs
entries only. But, I will do so only if there's even the slightest
chance that this patch can be merged.
> Because of the writeback issue above, the cache and bdev either need to be
> registered simultaneously or the cset uuid need to be specified.
>
> > - Do things in kernel space while there is solution in user space.
> >
> > All the above opinions are quite personal to myself, I don’t say you are
> > wrong or I am correct. If the patch works, that’s cool and you can use
> > it as you want, but I don’t support it to be in upstream.
>
> An alternate implementation might be to create a dm-bcache target.  The
> core bcache code could be left alone except a few EXPORT_SYMBOL()'s so
> dm-bcache can reach the bcache registration bits.
>
> This would:
>   * Minimally impact the bcache code base
>   * Solve the blind-populating of `struct cache_sb` issue in the same way
>     as `register_raw` could work above.
>   * Create a nicely segmented codebase (dm target) to upstream separately
>     through the DM team.
>   * Could be maintained cleanly out-of-tree because the bcache
>     registration interfaces change very infrequently.
>   * bdev resize could be done with a `dmsetup replace` but you'll need to
>     add resize support as below.
>
> > > For the operational drawbacks: from time to time, we exploit the
> > > online filesystem resize capability of XFS to increase the volume
> > > size. This would be at least tedious, if not nearly impossible, if the
> > > volume is mapped inside a dm-linear.
> >
> > Currently bcache doesn’t support cache or backing device resize. I don’t
> > see the connection between above statement and feeding an in-memory
> > super block via IOCTL command.
>
> Resize is perhaps unrelated, so if you decide to tackle bdev or cachedev
> resize then please start a new list thread.  Briefly: supporting bdev
> resize is probably easy.  I've looked through the code a few times with
> this in mind but haven't had time.
>
> Here's the summary, not sure if there are any other
> considerations:
>
>   1. Create a sysfs file like /sys/block/bcache0/bcache/resize to trigger
>      resize on echo 1 >.
>   2. Refactor the set_capacity() bits from  bcache_device_init() into its
>      own function.
>   3. Put locks around bcache_device.full_dirty_stripes and
>      bcache_device.stripe_sectors_dirty.  Re-alloc+copy+free and zero the
>      new bytes at the end.
>
> The cachedev's don't know anything about the bdev size, so (according to
> Kent) they will "just work" by referencing new offsets that didn't exist
> before when IOs come through.  (This is basically the same as resizing the
> bdev and then unregister/re-register which is how we resize bdevs now.)
>
> As for resizing a cachedev, I've not looked at all---not sure about that
> one.  We always detach, resize, make-bcache and re-attach the new cache.
> Maybe it is similarly simple, but haven't looked.
>
> > >> It is unnecessary to create a new IOCTL interface, and I feel the way
> > >> how it works is really unconvinced for potential security risk.
> > >>
> > > Which potential security risks concern you?
> > >
> >
> > Currently we don’t check all members of struct cache_sb_disk, what we do
> > is to simply trust bcache-tools. Create a new IOCTL interface to send a
> > user space memory into kernel space as superblock, that needs quite a
> > lot of checking code to make sure it won’t panic the kernel. It is
> > possible, but it is not worthy to add so many code for the purpose,
> > especially adding them into upstream code.
> >
> > I am not able to provide an explicit example for security risk, just the
> > new adding interface makes me not so confident.
>
> Maintaining a blind structure population from a userspace page would be
> difficult as well because even if the kernel validates _everything_ in
> cache_sb today, anyone extending `struct cache_sb` needs to remember to
> add checks for that. Failing to enforce validation inside the kernel could
> lead to kernel crashes or data corruption from userspace which is of
> course never good.
>
Can I solve this by refactoring the patch and using sysfs based
registration instead?

For anyone that is interested to try this solution, and as a
reference, I leave here below the links to my public Github
repositories:
- bcache-tools: https://github.com/andreatomassetti/bcache-tools
- bcache (standalone, patched): https://github.com/andreatomassetti/bcache

Thank you very much,
Andrea

> We always assume that, somehow, someone could leverage an insecure IOCTL
> call and crash the OS when they shouldn't be allowed to (even if they are
> not root, like from sudo).  This is a security issue from an assurance and
> integrity point of view, even if there might be neither an obvious
> privilege escalation nor privacy concern.
>
> -Eric
>
>
> > Thanks.
> >
> > Coly Li
> >
> >
> >
> >

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-07  9:56         ` Andrea Tomassetti
@ 2022-07-12 20:29           ` Eric Wheeler
  2022-07-20  8:06             ` Andrea Tomassetti
  0 siblings, 1 reply; 17+ messages in thread
From: Eric Wheeler @ 2022-07-12 20:29 UTC (permalink / raw)
  To: Andrea Tomassetti; +Cc: Coly Li, linux-bcache, Kent Overstreet, Zhang Zhen

[-- Attachment #1: Type: text/plain, Size: 12033 bytes --]

On Thu, 7 Jul 2022, Andrea Tomassetti wrote:
> On Wed, Jul 6, 2022 at 12:03 AM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> > On Tue, 5 Jul 2022, Coly Li wrote:
> > > > 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > > > On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
> > > >>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > > >>> Introducing a bcache control device (/dev/bcache_ctrl) that allows
> > > >>> communicating to the driver from user space via IOCTL. The only
> > > >>> IOCTL commands currently implemented, receives a struct cache_sb and
> > > >>> uses it to register the backing device without any need of
> > > >>> formatting them.
> > > >>>
> > > >> Back to the patch, I don’t support this idea. For the problem you are
> > > >> solving, indeed people uses device mapper linear target for many
> > > >> years, and it works perfectly without any code modification.
> > > >>
> > > >> That is, create a 8K image and set it as a loop device, then write a
> > > >> dm table to combine it with the existing hard drive. Then you run
> > > >> “bcache make -B <combined dm target>”, you will get a bcache device
> > > >> whose first 8K in the loop device and existing super block of the
> > > >> hard driver located at expected offset.
> > > >>
> > > > We evaluated this option but we weren't satisfied by the outcomes for,
> > > > at least, two main reasons: complexity and operational drawbacks. For
> > > > the complexity side: in production we use more than 20 HD that need to
> > > > be cached. It means we need to create 20+ header for them, 20+ loop
> > > > devices and 20+ dm linear devices. So, basically, there's a 3x factor
> > > > for each HD that we want to cache. Moreover, we're currently using
> > > > ephemeral disks as cache devices. In case of a machine reboot,
> > > > ephemeral devices can get lost; at this point, there will be some
> > > > trouble to mount the dm-linear bcache backing device because there
> > > > will be no cache device.
> > >
> > > OK, I get your point. It might make sense for your situation, although I
> > > know some other people use the linear dm-table to solve similar
> > > situation but may be not perfect for you. This patch may work in your
> > > procedure, but there are following things make me uncomfortable,
> >
> > Coly is right: there are some issues to address.
> >
> > Coly, I do not wish to contradict you, only to add that we have had times
> > where this would be useful. I like the idea, particularly avoiding placing
> > dm-linear atop of the bdev which reduces the IO codepath.  Maybe there is
> > an implementation that would fit everyone's needs.
> >
> > For us, such a superblock-less registration could resolve two issues:
> >
> > 1. Most commonly we wish to add bcache to an existing device without
> >    re-formatting and without adding the dm-linear complexity.
>
> That's exactly what was preventing us from using bcache in production
> prior to this patch.

Ok, but we always use writeback...and others may wish to, too.  I think 
any patch that introduces a feature needs to support existing features 
without introducing limitations on the behavior.
 
> > 2. Relatedly, IMHO, I've never liked the default location at 8k because we
> >    like to align our bdev's to the RAID stride width and rarely is the
> >    bdev array aligned at 8k (usually 64k or 256k for hardware RAID).  If
> >    we forget to pass the --data-offset at make-bcache time then we are
> >    stuck with an 8k-misalignment for the life of the device.
> >
> > In #2 we usually reformat the volume to avoid dm-linear complexity (and in
> > both cases we have wanted writeback cache support).  This process can take
> > a while to `dd` ~30TBs of bdev on spinning disks and we have to find
> > temporary disk space to move that data out and back in.
> >
> > > - Copying a user space memory and directly using it as a complicated in-kernel memory object.
> >
> > In the ioctl changes for bch_write_bdev_super() there does not appear to
> > be a way to handle attaching caches to the running bcache.  For example:
> >
> > 1. Add a cacheless bcache0 volume via ioctl
> > 2. Attach a writeback cache, write some data.
> > 3. Unregister everything
> > 4. Re-register the _dirty_ bdev from #1
> >
> > In this scenario bcache will start "running" immediately because it
> > cannot update its cset.uuid (as reported by bcache-super-show) which I
> > believe is stored in cache_sb.set_uuid.
> >
> > I suppose in step #2 you could update your userspace state with the
> > cset.uuid so your userspace ioctl register tool would put the cset.uuid in
> > place, but this is fragile without good userspace integration.
> >
> > I could imagine something like /etc/bcachetab (like crypttab or
> > mdadm.conf) that maps cset UUID's to bdev UUIDs.  Then your userspace
> > ioctl tool could be included in a udev script to auto-load volumes as they
> > become available.
> Yes, in conjunction with this patch, I developed a python udev script
> that manages device registration base on a YAML configuration file. I
> even patched bcache-tools to support the new IOCTL registration. You
> will find a link to the Github project at the end of this message.

Fewer dependencies are better: There are still python2 vs python3 
conflicts out there---and loading python and its dependencies into an 
initrd/initramfs for early bcache loading could be very messy, indeed!

You've already put some work into make-bcache so creating a bcache_udev 
function and a bcache-udev.c file (like make-bcache.c) is probably easy 
enough.  IMHO, a single-line grepable format (instead of YAML) could be 
used to minimize dependencies so that it is clean in an initramfs in the 
same way that mdadm.conf and crypttab already do.  You could then parse it 
with C or bash pretty easily...

> > You want to make sure that when a dirty writeback-cached bdev is
> > registered that it does not activate until either:
> >
> >   1. The cache is attached
> >   2. echo 1 > /sys/block/bcache0/bcache/running
> >
> > > - A new IOCTL interface added, even all rested interface are sysfs based.
> >
> > True.  Perhaps something like this would be better, and it would avoid
> > sharing a userspace page for blindly populating `struct cache_sb`, too:
> >
> >   echo '/dev/bdev [bdev_uuid] [/dev/cachedev|cset_uuid] [options]' > \
> >           /sys/fs/bcache/register_raw
>
> I thought that introducing an IOCTL interface could be a stopper for
> this patch, so I'm more than welcome to refactor it using sysfs
> entries only. But, I will do so only if there's even the slightest
> chance that this patch can be merged.

I'm for the patch with sysfs, but I'm not the decision maker either.

Coly: what would your requirements be to accept this upstream?

> > Because of the writeback issue above, the cache and bdev either need to be
> > registered simultaneously or the cset uuid need to be specified.
> >
> > > - Do things in kernel space while there is solution in user space.
> > >
> > > All the above opinions are quite personal to myself, I don’t say you are
> > > wrong or I am correct. If the patch works, that’s cool and you can use
> > > it as you want, but I don’t support it to be in upstream.
> >
> > An alternate implementation might be to create a dm-bcache target.  The
> > core bcache code could be left alone except a few EXPORT_SYMBOL()'s so
> > dm-bcache can reach the bcache registration bits.
> >
> > This would:
> >   * Minimally impact the bcache code base
> >   * Solve the blind-populating of `struct cache_sb` issue in the same way
> >     as `register_raw` could work above.
> >   * Create a nicely segmented codebase (dm target) to upstream separately
> >     through the DM team.
> >   * Could be maintained cleanly out-of-tree because the bcache
> >     registration interfaces change very infrequently.
> >   * bdev resize could be done with a `dmsetup replace` but you'll need to
> >     add resize support as below.
> >
> > > > For the operational drawbacks: from time to time, we exploit the
> > > > online filesystem resize capability of XFS to increase the volume
> > > > size. This would be at least tedious, if not nearly impossible, if the
> > > > volume is mapped inside a dm-linear.
> > >
> > > Currently bcache doesn’t support cache or backing device resize. I don’t
> > > see the connection between above statement and feeding an in-memory
> > > super block via IOCTL command.
> >
> > Resize is perhaps unrelated, so if you decide to tackle bdev or cachedev
> > resize then please start a new list thread.  Briefly: supporting bdev
> > resize is probably easy.  I've looked through the code a few times with
> > this in mind but haven't had time.
> >
> > Here's the summary, not sure if there are any other
> > considerations:
> >
> >   1. Create a sysfs file like /sys/block/bcache0/bcache/resize to trigger
> >      resize on echo 1 >.
> >   2. Refactor the set_capacity() bits from  bcache_device_init() into its
> >      own function.
> >   3. Put locks around bcache_device.full_dirty_stripes and
> >      bcache_device.stripe_sectors_dirty.  Re-alloc+copy+free and zero the
> >      new bytes at the end.
> >
> > The cachedev's don't know anything about the bdev size, so (according to
> > Kent) they will "just work" by referencing new offsets that didn't exist
> > before when IOs come through.  (This is basically the same as resizing the
> > bdev and then unregister/re-register which is how we resize bdevs now.)
> >
> > As for resizing a cachedev, I've not looked at all---not sure about that
> > one.  We always detach, resize, make-bcache and re-attach the new cache.
> > Maybe it is similarly simple, but haven't looked.
> >
> > > >> It is unnecessary to create a new IOCTL interface, and I feel the way
> > > >> how it works is really unconvinced for potential security risk.
> > > >>
> > > > Which potential security risks concern you?
> > > >
> > >
> > > Currently we don’t check all members of struct cache_sb_disk, what we do
> > > is to simply trust bcache-tools. Create a new IOCTL interface to send a
> > > user space memory into kernel space as superblock, that needs quite a
> > > lot of checking code to make sure it won’t panic the kernel. It is
> > > possible, but it is not worthy to add so many code for the purpose,
> > > especially adding them into upstream code.
> > >
> > > I am not able to provide an explicit example for security risk, just the
> > > new adding interface makes me not so confident.
> >
> > Maintaining a blind structure population from a userspace page would be
> > difficult as well because even if the kernel validates _everything_ in
> > cache_sb today, anyone extending `struct cache_sb` needs to remember to
> > add checks for that. Failing to enforce validation inside the kernel could
> > lead to kernel crashes or data corruption from userspace which is of
> > course never good.
> >
>
> Can I solve this by refactoring the patch and using sysfs based
> registration instead?

Sure, just make the sysfs interface configurable enough that:
  - all configuration is text (not binary)
  - bcache.txt documentation is updated

-Eric

> For anyone that is interested to try this solution, and as a
> reference, I leave here below the links to my public Github
> repositories:
> - bcache-tools: https://github.com/andreatomassetti/bcache-tools
> - bcache (standalone, patched): https://github.com/andreatomassetti/bcache
> 
> Thank you very much,
> Andrea
> 
> > We always assume that, somehow, someone could leverage an insecure IOCTL
> > call and crash the OS when they shouldn't be allowed to (even if they are
> > not root, like from sudo).  This is a security issue from an assurance and
> > integrity point of view, even if there might be neither an obvious
> > privelege escalation nor privacy concern.
> >
> > -Eric
> >
> >
> > > Thanks.
> > >
> > > Coly Li
> > >
> > >
> > >
> > >
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-12 20:29           ` Eric Wheeler
@ 2022-07-20  8:06             ` Andrea Tomassetti
  2022-07-20 13:31               ` Coly Li
  0 siblings, 1 reply; 17+ messages in thread
From: Andrea Tomassetti @ 2022-07-20  8:06 UTC (permalink / raw)
  To: Eric Wheeler; +Cc: Coly Li, linux-bcache, Kent Overstreet, Zhang Zhen

On Tue, Jul 12, 2022 at 10:29 PM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
>
> On Thu, 7 Jul 2022, Andrea Tomassetti wrote:
> > On Wed, Jul 6, 2022 at 12:03 AM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> > > On Tue, 5 Jul 2022, Coly Li wrote:
> > > > > 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > > > > On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
> > > > >>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> > > > >>> Introducing a bcache control device (/dev/bcache_ctrl) that allows
> > > > >>> communicating to the driver from user space via IOCTL. The only
> > > > >>> IOCTL commands currently implemented, receives a struct cache_sb and
> > > > >>> uses it to register the backing device without any need of
> > > > >>> formatting them.
> > > > >>>
> > > > >> Back to the patch, I don’t support this idea. For the problem you are
> > > > >> solving, indeed people uses device mapper linear target for many
> > > > >> years, and it works perfectly without any code modification.
> > > > >>
> > > > >> That is, create a 8K image and set it as a loop device, then write a
> > > > >> dm table to combine it with the existing hard drive. Then you run
> > > > >> “bcache make -B <combined dm target>”, you will get a bcache device
> > > > >> whose first 8K in the loop device and existing super block of the
> > > > >> hard driver located at expected offset.
> > > > >>
> > > > > We evaluated this option but we weren't satisfied by the outcomes for,
> > > > > at least, two main reasons: complexity and operational drawbacks. For
> > > > > the complexity side: in production we use more than 20 HD that need to
> > > > > be cached. It means we need to create 20+ header for them, 20+ loop
> > > > > devices and 20+ dm linear devices. So, basically, there's a 3x factor
> > > > > for each HD that we want to cache. Moreover, we're currently using
> > > > > ephemeral disks as cache devices. In case of a machine reboot,
> > > > > ephemeral devices can get lost; at this point, there will be some
> > > > > trouble to mount the dm-linear bcache backing device because there
> > > > > will be no cache device.
> > > >
> > > > OK, I get your point. It might make sense for your situation, although I
> > > > know some other people use the linear dm-table to solve similar
> > > > situation but may be not perfect for you. This patch may work in your
> > > > procedure, but there are following things make me uncomfortable,
> > >
> > > Coly is right: there are some issues to address.
> > >
> > > Coly, I do not wish to contradict you, only to add that we have had times
> > > where this would be useful. I like the idea, particularly avoiding placing
> > > dm-linear atop of the bdev which reduces the IO codepath.  Maybe there is
> > > an implementation that would fit everyone's needs.
> > >
> > > For us, such a superblock-less registration could resolve two issues:
> > >
> > > 1. Most commonly we wish to add bcache to an existing device without
> > >    re-formatting and without adding the dm-linear complexity.
> >
> > That's exactly what was preventing us from using bcache in production
> > prior to this patch.
>
> Ok, but we always use writeback...and others may wish to, too.  I think
> any patch that introduces a feature needs to support existing features
> without introducing limitations on the behavior.
>
Totally agree. My only point was that I extensively tested this patch
with wt mode. It works in wb mode as well, for sure because the
backing device's header is almost never used. The only issue I can
foresee in wb mode is in case of a machine reboot: the backing device
will lose the virtual header and, at boot time, another one will be
generated. It will get attached again to its cache device with a new
UID and I'm not sure if this will imply the loss of the data that was
not previously written to it, but was only present on the cache
device. But I think that losing data it's a well-known risk of wb
mode.

> > > 2. Relatedly, IMHO, I've never liked the default location at 8k because we
> > >    like to align our bdev's to the RAID stride width and rarely is the
> > >    bdev array aligned at 8k (usually 64k or 256k for hardware RAID).  If
> > >    we forget to pass the --data-offset at make-bcache time then we are
> > >    stuck with an 8k-misalignment for the life of the device.
> > >
> > > In #2 we usually reformat the volume to avoid dm-linear complexity (and in
> > > both cases we have wanted writeback cache support).  This process can take
> > > a while to `dd` ~30TBs of bdev on spinning disks and we have to find
> > > temporary disk space to move that data out and back in.
> > >
> > > > - Copying a user space memory and directly using it as a complicated in-kernel memory object.
> > >
> > > In the ioctl changes for bch_write_bdev_super() there does not appear to
> > > be a way to handle attaching caches to the running bcache.  For example:
> > >
> > > 1. Add a cacheless bcache0 volume via ioctl
> > > 2. Attach a writeback cache, write some data.
> > > 3. Unregister everything
> > > 4. Re-register the _dirty_ bdev from #1
> > >
> > > In this scenario bcache will start "running" immediately because it
> > > cannot update its cset.uuid (as reported by bcache-super-show) which I
> > > believe is stored in cache_sb.set_uuid.
> > >
> > > I suppose in step #2 you could update your userspace state with the
> > > cset.uuid so your userspace ioctl register tool would put the cset.uuid in
> > > place, but this is fragile without good userspace integration.
> > >
> > > I could imagine something like /etc/bcachetab (like crypttab or
> > > mdadm.conf) that maps cset UUID's to bdev UUIDs.  Then your userspace
> > > ioctl tool could be included in a udev script to auto-load volumes as they
> > > become available.
> > Yes, in conjunction with this patch, I developed a python udev script
> > that manages device registration base on a YAML configuration file. I
> > even patched bcache-tools to support the new IOCTL registration. You
> > will find a link to the Github project at the end of this message.
>
> Fewer dependencies are better: There are still python2 vs python3
> conflicts out there---and loading python and its dependencies into an
> initrd/initramfs for early bcache loading could be very messy, indeed!
>
> You've already put some work into make-bcache so creating a bcache_udev
> function and a bcache-udev.c file (like make-bcache.c) is probably easy
> enough.  IMHO, a single-line grepable format (instead of YAML) could be
> used to minimize dependencies so that it is clean in an initramfs in the
> same way that mdadm.conf and crypttab already do.  You could then parse it
> with C or bash pretty easily...
>
I will be really glad to rework the patch if we can agree on some
modifications that will make it suitable to be merged upstream.

> > > You want to make sure that when a dirty writeback-cached bdev is
> > > registered that it does not activate until either:
> > >
> > >   1. The cache is attached
> > >   2. echo 1 > /sys/block/bcache0/bcache/running
> > >
> > > > - A new IOCTL interface added, even all rested interface are sysfs based.
> > >
> > > True.  Perhaps something like this would be better, and it would avoid
> > > sharing a userspace page for blindly populating `struct cache_sb`, too:
> > >
> > >   echo '/dev/bdev [bdev_uuid] [/dev/cachedev|cset_uuid] [options]' > \
> > >           /sys/fs/bcache/register_raw
> >
> > I thought that introducing an IOCTL interface could be a stopper for
> > this patch, so I'm more than welcome to refactor it using sysfs
> > entries only. But, I will do so only if there's even the slightest
> > chance that this patch can be merged.
>
> I'm for the patch with sysfs, but I'm not the decision maker either.
>

As Eric already asked,
> Coly: what would your requirements be to accept this upstream?
>

Andrea

> > > Because of the writeback issue above, the cache and bdev either need to be
> > > registered simultaneously or the cset uuid need to be specified.
> > >
> > > > - Do things in kernel space while there is solution in user space.
> > > >
> > > > All the above opinions are quite personal to myself, I don’t say you are
> > > > wrong or I am correct. If the patch works, that’s cool and you can use
> > > > it as you want, but I don’t support it to be in upstream.
> > >
> > > An alternate implementation might be to create a dm-bcache target.  The
> > > core bcache code could be left alone except a few EXPORT_SYMBOL()'s so
> > > dm-bcache can reach the bcache registration bits.
> > >
> > > This would:
> > >   * Minimally impact the bcache code base
> > >   * Solve the blind-populating of `struct cache_sb` issue in the same way
> > >     as `register_raw` could work above.
> > >   * Create a nicely segmented codebase (dm target) to upstream separately
> > >     through the DM team.
> > >   * Could be maintained cleanly out-of-tree because the bcache
> > >     registration interfaces change very infrequently.
> > >   * bdev resize could be done with a `dmsetup replace` but you'll need to
> > >     add resize support as below.
> > >
> > > > > For the operational drawbacks: from time to time, we exploit the
> > > > > online filesystem resize capability of XFS to increase the volume
> > > > > size. This would be at least tedious, if not nearly impossible, if the
> > > > > volume is mapped inside a dm-linear.
> > > >
> > > > Currently bcache doesn’t support cache or backing device resize. I don’t
> > > > see the connection between above statement and feeding an in-memory
> > > > super block via IOCTL command.
> > >
> > > Resize is perhaps unrelated, so if you decide to tackle bdev or cachedev
> > > resize then please start a new list thread.  Briefly: supporting bdev
> > > resize is probably easy.  I've looked through the code a few times with
> > > this in mind but haven't had time.
> > >
> > > Here's the summary, not sure if there are any other
> > > considerations:
> > >
> > >   1. Create a sysfs file like /sys/block/bcache0/bcache/resize to trigger
> > >      resize on echo 1 >.
> > >   2. Refactor the set_capacity() bits from  bcache_device_init() into its
> > >      own function.
> > >   3. Put locks around bcache_device.full_dirty_stripes and
> > >      bcache_device.stripe_sectors_dirty.  Re-alloc+copy+free and zero the
> > >      new bytes at the end.
> > >
> > > The cachedev's don't know anything about the bdev size, so (according to
> > > Kent) they will "just work" by referencing new offsets that didn't exist
> > > before when IOs come through.  (This is basically the same as resizing the
> > > bdev and then unregister/re-register which is how we resize bdevs now.)
> > >
> > > As for resizing a cachedev, I've not looked at all---not sure about that
> > > one.  We always detach, resize, make-bcache and re-attach the new cache.
> > > Maybe it is similarly simple, but haven't looked.
> > >
> > > > >> It is unnecessary to create a new IOCTL interface, and I feel the way
> > > > >> how it works is really unconvinced for potential security risk.
> > > > >>
> > > > > Which potential security risks concern you?
> > > > >
> > > >
> > > > Currently we don’t check all members of struct cache_sb_disk, what we do
> > > > is to simply trust bcache-tools. Create a new IOCTL interface to send a
> > > > user space memory into kernel space as superblock, that needs quite a
> > > > lot of checking code to make sure it won’t panic the kernel. It is
> > > > possible, but it is not worthy to add so many code for the purpose,
> > > > especially adding them into upstream code.
> > > >
> > > > I am not able to provide an explicit example for security risk, just the
> > > > new adding interface makes me not so confident.
> > >
> > > Maintaining a blind structure population from a userspace page would be
> > > difficult as well because even if the kernel validates _everything_ in
> > > cache_sb today, anyone extending `struct cache_sb` needs to remember to
> > > add checks for that. Failing to enforce validation inside the kernel could
> > > lead to kernel crashes or data corruption from userspace which is of
> > > course never good.
> > >
> >
> > Can I solve this by refactoring the patch and using sysfs based
> > registration instead?
>
> Sure, just make the sysfs interface configurable enough that:
>   - all configuration is text (not binary)
>   - bcache.txt documentation is updated
>
> -Eric
>
> > For anyone that is interested to try this solution, and as a
> > reference, I leave here below the links to my public Github
> > repositories:
> > - bcache-tools: https://github.com/andreatomassetti/bcache-tools
> > - bcache (standalone, patched): https://github.com/andreatomassetti/bcache
> >
> > Thank you very much,
> > Andrea
> >
> > > We always assume that, somehow, someone could leverage an insecure IOCTL
> > > call and crash the OS when they shouldn't be allowed to (even if they are
> > > not root, like from sudo).  This is a security issue from an assurance and
> > > integrity point of view, even if there might be neither an obvious
> > > privilege escalation nor privacy concern.
> > >
> > > -Eric
> > >
> > >
> > > > Thanks.
> > > >
> > > > Coly Li
> > > >
> > > >
> > > >
> > > >
> >

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-20  8:06             ` Andrea Tomassetti
@ 2022-07-20 13:31               ` Coly Li
  2022-07-20 14:23                 ` Andrea Tomassetti
  0 siblings, 1 reply; 17+ messages in thread
From: Coly Li @ 2022-07-20 13:31 UTC (permalink / raw)
  To: Andrea Tomassetti; +Cc: Eric Wheeler, linux-bcache, Kent Overstreet, Zhang Zhen



> 2022年7月20日 16:06,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> 
> On Tue, Jul 12, 2022 at 10:29 PM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
>> 
>> On Thu, 7 Jul 2022, Andrea Tomassetti wrote:
>>> On Wed, Jul 6, 2022 at 12:03 AM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
>>>> On Tue, 5 Jul 2022, Coly Li wrote:
>>>>>> 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>>>>> On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
>>>>>>>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>>>>>>> Introducing a bcache control device (/dev/bcache_ctrl) that allows
>>>>>>>> communicating to the driver from user space via IOCTL. The only
>>>>>>>> IOCTL command currently implemented receives a struct cache_sb and
>>>>>>>> uses it to register the backing device without any need of
>>>>>>>> formatting them.
>>>>>>>> 
>>>>>>> Back to the patch, I don’t support this idea. For the problem you are
>>>>>>> solving, indeed people uses device mapper linear target for many
>>>>>>> years, and it works perfectly without any code modification.
>>>>>>> 
>>>>>>> That is, create a 8K image and set it as a loop device, then write a
>>>>>>> dm table to combine it with the existing hard drive. Then you run
>>>>>>> “bcache make -B <combined dm target>”, you will get a bcache device
>>>>>>> whose first 8K in the loop device and existing super block of the
>>>>>>> hard driver located at expected offset.
>>>>>>> 
>>>>>> We evaluated this option but we weren't satisfied by the outcomes for,
>>>>>> at least, two main reasons: complexity and operational drawbacks. For
>>>>>> the complexity side: in production we use more than 20 HD that need to
>>>>>> be cached. It means we need to create 20+ header for them, 20+ loop
>>>>>> devices and 20+ dm linear devices. So, basically, there's a 3x factor
>>>>>> for each HD that we want to cache. Moreover, we're currently using
>>>>>> ephemeral disks as cache devices. In case of a machine reboot,
>>>>>> ephemeral devices can get lost; at this point, there will be some
>>>>>> trouble to mount the dm-linear bcache backing device because there
>>>>>> will be no cache device.
>>>>> 
>>>>> OK, I get your point. It might make sense for your situation, although I
>>>>> know some other people use the linear dm-table to solve similar
>>>>> situation but may be not perfect for you. This patch may work in your
>>>>> procedure, but there are following things make me uncomfortable,
>>>> 
>>>> Coly is right: there are some issues to address.
>>>> 
>>>> Coly, I do not wish to contradict you, only to add that we have had times
>>>> where this would be useful. I like the idea, particularly avoiding placing
>>>> dm-linear atop of the bdev which reduces the IO codepath. Maybe there is
>>>> an implementation that would fit everyone's needs.
>>>> 
>>>> For us, such a superblock-less registration could resolve two issues:
>>>> 
>>>> 1. Most commonly we wish to add bcache to an existing device without
>>>> re-formatting and without adding the dm-linear complexity.
>>> 
>>> That's exactly what was preventing us from using bcache in production
>>> prior to this patch.
>> 
>> Ok, but we always use writeback...and others may wish to, too. I think
>> any patch that introduces a feature needs to support existing features
>> without introducing limitations on the behavior.
>> 
> Totally agree. My only point was that I extensively tested this patch
> with wt mode. It works in wb mode as well, for sure because the
> backing device's header is almost never used. The only issue I can
> foresee in wb mode is in case of a machine reboot: the backing device
> will lose the virtual header and, at boot time, another one will be
> generated. It will get attached again to its cache device with a new
> UID and I'm not sure if this will imply the loss of the data that was
> not previously written to it, but was only present on the cache
> device. But I think that losing data it's a well-known risk of wb
> mode.

NO, losing dirty data on cache device is unacceptable. If the previous attached cache device is not ready, the backing device will be suspended and its bcache device won’t show up in /dev/.


> 
>>>> 2. Relatedly, IMHO, I've never liked the default location at 8k because we
>>>> like to align our bdev's to the RAID stride width and rarely is the
>>>> bdev array aligned at 8k (usually 64k or 256k for hardware RAID). If
>>>> we forget to pass the --data-offset at make-bcache time then we are
>>>> stuck with an 8k-misalignment for the life of the device.
>>>> 
>>>> In #2 we usually reformat the volume to avoid dm-linear complexity (and in
>>>> both cases we have wanted writeback cache support). This process can take
>>>> a while to `dd` ~30TBs of bdev on spinning disks and we have to find
>>>> temporary disk space to move that data out and back in.
>>>> 
>>>>> - Copying a user space memory and directly using it as a complicated in-kernel memory object.
>>>> 
>>>> In the ioctl changes for bch_write_bdev_super() there does not appear to
>>>> be a way to handle attaching caches to the running bcache. For example:
>>>> 
>>>> 1. Add a cacheless bcache0 volume via ioctl
>>>> 2. Attach a writeback cache, write some data.
>>>> 3. Unregister everything
>>>> 4. Re-register the _dirty_ bdev from #1
>>>> 
>>>> In this scenario bcache will start "running" immediately because it
>>>> cannot update its cset.uuid (as reported by bcache-super-show) which I
>>>> believe is stored in cache_sb.set_uuid.
>>>> 
>>>> I suppose in step #2 you could update your userspace state with the
>>>> cset.uuid so your userspace ioctl register tool would put the cset.uuid in
>>>> place, but this is fragile without good userspace integration.
>>>> 
>>>> I could imagine something like /etc/bcachetab (like crypttab or
>>>> mdadm.conf) that maps cset UUID's to bdev UUIDs. Then your userspace
>>>> ioctl tool could be included in a udev script to auto-load volumes as they
>>>> become available.
>>> Yes, in conjunction with this patch, I developed a python udev script
>>> that manages device registration based on a YAML configuration file. I
>>> even patched bcache-tools to support the new IOCTL registration. You
>>> will find a link to the Github project at the end of this message.
>> 
>> Fewer dependencies are better: There are still python2 vs python3
>> conflicts out there---and loading python and its dependencies into an
>> initrd/initramfs for early bcache loading could be very messy, indeed!
>> 
>> You've already put some work into make-bcache so creating a bcache_udev
>> function and a bcache-udev.c file (like make-bcache.c) is probably easy
>> enough. IMHO, a single-line grepable format (instead of YAML) could be
>> used to minimize dependencies so that it is clean in an initramfs in the
>> same way that mdadm.conf and crypttab already do. You could then parse it
>> with C or bash pretty easily...
>> 
> I will be really glad to rework the patch if we can agree on some
> modifications that will make it suitable to be merged upstream.


I don’t support the idea to copy a block of memory from user space to kernel space and reference it as a super block, neither IOCTL nor sysfs interface.
It is very easy to generate a corrupted super block memory and send it into kernel space and panic the whole system, I will not take this potential risk.

> 
>>>> You want to make sure that when a dirty writeback-cached bdev is
>>>> registered that it does not activate until either:
>>>> 
>>>> 1. The cache is attached
>>>> 2. echo 1 > /sys/block/bcache0/bcache/running
>>>> 
>>>>> - A new IOCTL interface added, even all rested interface are sysfs based.
>>>> 
>>>> True. Perhaps something like this would be better, and it would avoid
>>>> sharing a userspace page for blindly populating `struct cache_sb`, too:
>>>> 
>>>> echo '/dev/bdev [bdev_uuid] [/dev/cachedev|cset_uuid] [options]' > \
>>>> /sys/fs/bcache/register_raw
>>> 
>>> I thought that introducing an IOCTL interface could be a stopper for
>>> this patch, so I'm more than welcome to refactor it using sysfs
>>> entries only. But, I will do so only if there's even the slightest
>>> chance that this patch can be merged.
>> 
>> I'm for the patch with sysfs, but I'm not the decision maker either.
>> 
> 
> As Eric already asked,
>> Coly: what would your requirements be to accept this upstream?
>>>>> 

You may use it as you want, but I won’t accept it into upstream via my path.

Coly Li



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-20 13:31               ` Coly Li
@ 2022-07-20 14:23                 ` Andrea Tomassetti
  2022-07-20 14:46                   ` Coly Li
  0 siblings, 1 reply; 17+ messages in thread
From: Andrea Tomassetti @ 2022-07-20 14:23 UTC (permalink / raw)
  To: Coly Li; +Cc: Eric Wheeler, linux-bcache, Kent Overstreet, Zhang Zhen

On Wed, Jul 20, 2022 at 3:31 PM Coly Li <colyli@suse.de> wrote:
>
>
>
> > 2022年7月20日 16:06,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> >
> > On Tue, Jul 12, 2022 at 10:29 PM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> >>
> >> On Thu, 7 Jul 2022, Andrea Tomassetti wrote:
> >>> On Wed, Jul 6, 2022 at 12:03 AM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> >>>> On Tue, 5 Jul 2022, Coly Li wrote:
> >>>>>> 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> >>>>>> On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
> >>>>>>>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> >>>>>>>> Introducing a bcache control device (/dev/bcache_ctrl) that allows
> >>>>>>>> communicating to the driver from user space via IOCTL. The only
> >>>>>>>> IOCTL command currently implemented receives a struct cache_sb and
> >>>>>>>> uses it to register the backing device without any need of
> >>>>>>>> formatting them.
> >>>>>>>>
> >>>>>>> Back to the patch, I don’t support this idea. For the problem you are
> >>>>>>> solving, indeed people uses device mapper linear target for many
> >>>>>>> years, and it works perfectly without any code modification.
> >>>>>>>
> >>>>>>> That is, create a 8K image and set it as a loop device, then write a
> >>>>>>> dm table to combine it with the existing hard drive. Then you run
> >>>>>>> “bcache make -B <combined dm target>”, you will get a bcache device
> >>>>>>> whose first 8K in the loop device and existing super block of the
> >>>>>>> hard driver located at expected offset.
> >>>>>>>
> >>>>>> We evaluated this option but we weren't satisfied by the outcomes for,
> >>>>>> at least, two main reasons: complexity and operational drawbacks. For
> >>>>>> the complexity side: in production we use more than 20 HD that need to
> >>>>>> be cached. It means we need to create 20+ header for them, 20+ loop
> >>>>>> devices and 20+ dm linear devices. So, basically, there's a 3x factor
> >>>>>> for each HD that we want to cache. Moreover, we're currently using
> >>>>>> ephemeral disks as cache devices. In case of a machine reboot,
> >>>>>> ephemeral devices can get lost; at this point, there will be some
> >>>>>> trouble to mount the dm-linear bcache backing device because there
> >>>>>> will be no cache device.
> >>>>>
> >>>>> OK, I get your point. It might make sense for your situation, although I
> >>>>> know some other people use the linear dm-table to solve similar
> >>>>> situation but may be not perfect for you. This patch may work in your
> >>>>> procedure, but there are following things make me uncomfortable,
> >>>>
> >>>> Coly is right: there are some issues to address.
> >>>>
> >>>> Coly, I do not wish to contradict you, only to add that we have had times
> >>>> where this would be useful. I like the idea, particularly avoiding placing
> >>>> dm-linear atop of the bdev which reduces the IO codepath. Maybe there is
> >>>> an implementation that would fit everyone's needs.
> >>>>
> >>>> For us, such a superblock-less registration could resolve two issues:
> >>>>
> >>>> 1. Most commonly we wish to add bcache to an existing device without
> >>>> re-formatting and without adding the dm-linear complexity.
> >>>
> >>> That's exactly what was preventing us from using bcache in production
> >>> prior to this patch.
> >>
> >> Ok, but we always use writeback...and others may wish to, too. I think
> >> any patch that introduces a feature needs to support existing features
> >> without introducing limitations on the behavior.
> >>
> > Totally agree. My only point was that I extensively tested this patch
> > with wt mode. It works in wb mode as well, for sure because the
> > backing device's header is almost never used. The only issue I can
> > foresee in wb mode is in case of a machine reboot: the backing device
> > will lose the virtual header and, at boot time, another one will be
> > generated. It will get attached again to its cache device with a new
> > UID and I'm not sure if this will imply the loss of the data that was
> > not previously written to it, but was only present on the cache
> > device. But I think that losing data it's a well-known risk of wb
> > mode.
>
> NO, losing dirty data on cache device is unacceptable. If the previous attached cache device is not ready, the backing device will be suspended and its bcache device won’t show up in /dev/.
>
>
> >
> >>>> 2. Relatedly, IMHO, I've never liked the default location at 8k because we
> >>>> like to align our bdev's to the RAID stride width and rarely is the
> >>>> bdev array aligned at 8k (usually 64k or 256k for hardware RAID). If
> >>>> we forget to pass the --data-offset at make-bcache time then we are
> >>>> stuck with an 8k-misalignment for the life of the device.
> >>>>
> >>>> In #2 we usually reformat the volume to avoid dm-linear complexity (and in
> >>>> both cases we have wanted writeback cache support). This process can take
> >>>> a while to `dd` ~30TBs of bdev on spinning disks and we have to find
> >>>> temporary disk space to move that data out and back in.
> >>>>
> >>>>> - Copying a user space memory and directly using it as a complicated in-kernel memory object.
> >>>>
> >>>> In the ioctl changes for bch_write_bdev_super() there does not appear to
> >>>> be a way to handle attaching caches to the running bcache. For example:
> >>>>
> >>>> 1. Add a cacheless bcache0 volume via ioctl
> >>>> 2. Attach a writeback cache, write some data.
> >>>> 3. Unregister everything
> >>>> 4. Re-register the _dirty_ bdev from #1
> >>>>
> >>>> In this scenario bcache will start "running" immediately because it
> >>>> cannot update its cset.uuid (as reported by bcache-super-show) which I
> >>>> believe is stored in cache_sb.set_uuid.
> >>>>
> >>>> I suppose in step #2 you could update your userspace state with the
> >>>> cset.uuid so your userspace ioctl register tool would put the cset.uuid in
> >>>> place, but this is fragile without good userspace integration.
> >>>>
> >>>> I could imagine something like /etc/bcachetab (like crypttab or
> >>>> mdadm.conf) that maps cset UUID's to bdev UUIDs. Then your userspace
> >>>> ioctl tool could be included in a udev script to auto-load volumes as they
> >>>> become available.
> >>> Yes, in conjunction with this patch, I developed a python udev script
> >>> that manages device registration based on a YAML configuration file. I
> >>> even patched bcache-tools to support the new IOCTL registration. You
> >>> will find a link to the Github project at the end of this message.
> >>
> >> Fewer dependencies are better: There are still python2 vs python3
> >> conflicts out there---and loading python and its dependencies into an
> >> initrd/initramfs for early bcache loading could be very messy, indeed!
> >>
> >> You've already put some work into make-bcache so creating a bcache_udev
> >> function and a bcache-udev.c file (like make-bcache.c) is probably easy
> >> enough. IMHO, a single-line grepable format (instead of YAML) could be
> >> used to minimize dependencies so that it is clean in an initramfs in the
> >> same way that mdadm.conf and crypttab already do. You could then parse it
> >> with C or bash pretty easily...
> >>
> > I will be really glad to rework the patch if we can agree on some
> > modifications that will make it suitable to be merged upstream.
>
Hi Coly,
thank you very much for your time.

>
> I don’t support the idea to copy a block of memory from user space to kernel space and reference it as a super block, neither IOCTL nor sysfs interface.
> It is very easy to generate a corrupted super block memory and send it into kernel space and panic the whole system, I will not take this potential risk.
>
I think I'm missing something here because I cannot see the difference
between passing the structure through the sysfs interface or reading
it from the header of the block device. In both cases the source of
such structure will be the same: the user via the make-bcache command.
My understanding of the part involved is:
    udev_rule -> bcache-register.c -> sysfs/register ->
bcache_module/register_bcache -> read_super
So, in read_super the bcache module will read what the userspace
utility make-bcache wrote as a super block. Correct?
If I'm correct, a big if, I cannot see why it should be "easier" to
generate a corrupted super block with this patch. Can you please
elaborate on this?

Thank you in advance,
Andrea

> >
> >>>> You want to make sure that when a dirty writeback-cached bdev is
> >>>> registered that it does not activate until either:
> >>>>
> >>>> 1. The cache is attached
> >>>> 2. echo 1 > /sys/block/bcache0/bcache/running
> >>>>
> >>>>> - A new IOCTL interface added, even all rested interface are sysfs based.
> >>>>
> >>>> True. Perhaps something like this would be better, and it would avoid
> >>>> sharing a userspace page for blindly populating `struct cache_sb`, too:
> >>>>
> >>>> echo '/dev/bdev [bdev_uuid] [/dev/cachedev|cset_uuid] [options]' > \
> >>>> /sys/fs/bcache/register_raw
> >>>
> >>> I thought that introducing an IOCTL interface could be a stopper for
> >>> this patch, so I'm more than welcome to refactor it using sysfs
> >>> entries only. But, I will do so only if there's even the slightest
> >>> chance that this patch can be merged.
> >>
> >> I'm for the patch with sysfs, but I'm not the decision maker either.
> >>
> >
> > As Eric already asked,
> >> Coly: what would your requirements be to accept this upstream?
> >>>>>
>
> You may use it as you want, but I won’t accept it into upstream via my path.
>
> Coly Li
>
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-07-20 14:23                 ` Andrea Tomassetti
@ 2022-07-20 14:46                   ` Coly Li
  0 siblings, 0 replies; 17+ messages in thread
From: Coly Li @ 2022-07-20 14:46 UTC (permalink / raw)
  To: Andrea Tomassetti; +Cc: Eric Wheeler, linux-bcache, Kent Overstreet, Zhang Zhen



> 2022年7月20日 22:23,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
> 
> On Wed, Jul 20, 2022 at 3:31 PM Coly Li <colyli@suse.de> wrote:
>> 
>> 
>> 
>>> 2022年7月20日 16:06,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>> 
>>> On Tue, Jul 12, 2022 at 10:29 PM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
>>>> 
>>>> On Thu, 7 Jul 2022, Andrea Tomassetti wrote:
>>>>> On Wed, Jul 6, 2022 at 12:03 AM Eric Wheeler <bcache@lists.ewheeler.net> wrote:
>>>>>> On Tue, 5 Jul 2022, Coly Li wrote:
>>>>>>>> 2022年7月5日 16:48,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>>>>>>> On Mon, Jul 4, 2022 at 5:29 PM Coly Li <colyli@suse.de> wrote:
>>>>>>>>>> 2022年7月4日 23:13,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>>>>>>>>> Introducing a bcache control device (/dev/bcache_ctrl) that allows
>>>>>>>>>> communicating to the driver from user space via IOCTL. The only
>>>>>>>>>> IOCTL command currently implemented receives a struct cache_sb and
>>>>>>>>>> uses it to register the backing device without any need of
>>>>>>>>>> formatting them.
>>>>>>>>>> 
>>>>>>>>> Back to the patch, I don’t support this idea. For the problem you are
>>>>>>>>> solving, indeed people uses device mapper linear target for many
>>>>>>>>> years, and it works perfectly without any code modification.
>>>>>>>>> 
>>>>>>>>> That is, create a 8K image and set it as a loop device, then write a
>>>>>>>>> dm table to combine it with the existing hard drive. Then you run
>>>>>>>>> “bcache make -B <combined dm target>”, you will get a bcache device
>>>>>>>>> whose first 8K in the loop device and existing super block of the
>>>>>>>>> hard driver located at expected offset.
>>>>>>>>> 
>>>>>>>> We evaluated this option but we weren't satisfied by the outcomes for,
>>>>>>>> at least, two main reasons: complexity and operational drawbacks. For
>>>>>>>> the complexity side: in production we use more than 20 HD that need to
>>>>>>>> be cached. It means we need to create 20+ header for them, 20+ loop
>>>>>>>> devices and 20+ dm linear devices. So, basically, there's a 3x factor
>>>>>>>> for each HD that we want to cache. Moreover, we're currently using
>>>>>>>> ephemeral disks as cache devices. In case of a machine reboot,
>>>>>>>> ephemeral devices can get lost; at this point, there will be some
>>>>>>>> trouble to mount the dm-linear bcache backing device because there
>>>>>>>> will be no cache device.
>>>>>>> 
>>>>>>> OK, I get your point. It might make sense for your situation, although I
>>>>>>> know some other people use the linear dm-table to solve similar
>>>>>>> situation but may be not perfect for you. This patch may work in your
>>>>>>> procedure, but there are following things make me uncomfortable,
>>>>>> 
>>>>>> Coly is right: there are some issues to address.
>>>>>> 
>>>>>> Coly, I do not wish to contradict you, only to add that we have had times
>>>>>> where this would be useful. I like the idea, particularly avoiding placing
>>>>>> dm-linear atop of the bdev which reduces the IO codepath. Maybe there is
>>>>>> an implementation that would fit everyone's needs.
>>>>>> 
>>>>>> For us, such a superblock-less registration could resolve two issues:
>>>>>> 
>>>>>> 1. Most commonly we wish to add bcache to an existing device without
>>>>>> re-formatting and without adding the dm-linear complexity.
>>>>> 
>>>>> That's exactly what was preventing us from using bcache in production
>>>>> prior to this patch.
>>>> 
>>>> Ok, but we always use writeback...and others may wish to, too. I think
>>>> any patch that introduces a feature needs to support existing features
>>>> without introducing limitations on the behavior.
>>>> 
>>> Totally agree. My only point was that I extensively tested this patch
>>> with wt mode. It works in wb mode as well, for sure because the
>>> backing device's header is almost never used. The only issue I can
>>> foresee in wb mode is in case of a machine reboot: the backing device
>>> will lose the virtual header and, at boot time, another one will be
>>> generated. It will get attached again to its cache device with a new
>>> UID and I'm not sure if this will imply the loss of the data that was
>>> not previously written to it, but was only present on the cache
>>> device. But I think that losing data it's a well-known risk of wb
>>> mode.
>> 
>> NO, losing dirty data on cache device is unacceptable. If the previous attached cache device is not ready, the backing device will be suspended and its bcache device won’t show up in /dev/.
>> 
>> 
>>> 
>>>>>> 2. Relatedly, IMHO, I've never liked the default location at 8k because we
>>>>>> like to align our bdev's to the RAID stride width and rarely is the
>>>>>> bdev array aligned at 8k (usually 64k or 256k for hardware RAID). If
>>>>>> we forget to pass the --data-offset at make-bcache time then we are
>>>>>> stuck with an 8k-misalignment for the life of the device.
>>>>>> 
>>>>>> In #2 we usually reformat the volume to avoid dm-linear complexity (and in
>>>>>> both cases we have wanted writeback cache support). This process can take
>>>>>> a while to `dd` ~30TBs of bdev on spinning disks and we have to find
>>>>>> temporary disk space to move that data out and back in.
>>>>>> 
>>>>>>> - Copying a user space memory and directly using it as a complicated in-kernel memory object.
>>>>>> 
>>>>>> In the ioctl changes for bch_write_bdev_super() there does not appear to
>>>>>> be a way to handle attaching caches to the running bcache. For example:
>>>>>> 
>>>>>> 1. Add a cacheless bcache0 volume via ioctl
>>>>>> 2. Attach a writeback cache, write some data.
>>>>>> 3. Unregister everything
>>>>>> 4. Re-register the _dirty_ bdev from #1
>>>>>> 
>>>>>> In this scenario bcache will start "running" immediately because it
>>>>>> cannot update its cset.uuid (as reported by bcache-super-show) which I
>>>>>> believe is stored in cache_sb.set_uuid.
>>>>>> 
>>>>>> I suppose in step #2 you could update your userspace state with the
>>>>>> cset.uuid so your userspace ioctl register tool would put the cset.uuid in
>>>>>> place, but this is fragile without good userspace integration.
>>>>>> 
>>>>>> I could imagine something like /etc/bcachetab (like crypttab or
>>>>>> mdadm.conf) that maps cset UUID's to bdev UUIDs. Then your userspace
>>>>>> ioctl tool could be included in a udev script to auto-load volumes as they
>>>>>> become available.
>>>>> Yes, in conjunction with this patch, I developed a python udev script
>>>>> that manages device registration based on a YAML configuration file. I
>>>>> even patched bcache-tools to support the new IOCTL registration. You
>>>>> will find a link to the Github project at the end of this message.
>>>> 
>>>> Fewer dependencies are better: There are still python2 vs python3
>>>> conflicts out there---and loading python and its dependencies into an
>>>> initrd/initramfs for early bcache loading could be very messy, indeed!
>>>> 
>>>> You've already put some work into make-bcache so creating a bcache_udev
>>>> function and a bcache-udev.c file (like make-bcache.c) is probably easy
>>>> enough. IMHO, a single-line grepable format (instead of YAML) could be
>>>> used to minimize dependencies so that it is clean in an initramfs in the
>>>> same way that mdadm.conf and crypttab already do. You could then parse it
>>>> with C or bash pretty easily...
>>>> 
>>> I will be really glad to rework the patch if we can agree on some
>>> modifications that will make it suitable to be merged upstream.
>> 
> Hi Coly,
> thank you very much for your time.
> 
>> 
>> I don’t support the idea to copy a block of memory from user space to kernel space and reference it as a super block, neither IOCTL nor sysfs interface.
>> It is very easy to generate a corrupted super block memory and send it into kernel space and panic the whole system, I will not take this potential risk.
>> 
> I think I'm missing something here because I cannot see the difference
> between passing the structure through the sysfs interface or reading
> it from the header of the block device. In both cases the source of
> such structure will be the same: the user via the make-bcache command.
> My understanding of the part involved is:
>    udev_rule -> bcache-register.c -> sysfs/register ->
> bcache_module/register_bcache -> read_super
> So, in read_super the bcache module will read what the userspace
> utility make-bcache wrote as a super block. Correct?
> If I'm correct, a big if, I cannot see why it should be "easier" to
> generate a corrupted super block with this patch. Can you please
> elaborate on this?

If you do all things correctly, all things will work as you mentioned. But if some cracker tries to use this interface to send a special composed super block which has incorrect journal bucket LBAs and corrupts cached data, what should you do?

Now bcache kernel code fully trusts bcache-tools, if someone modifies bcache-tools and generate corrupted super block, he or she may pay more effort on it. But the memory copy interface makes similar risky things too much easier. I am not an expert on code attack, and am not able to provide an exact example. But creating an interface and permit user space to send arbitrary memory object into kernel space, this is really risky and I won’t take it for upstream kernel.

Coly Li



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-03-10  8:52 Andrea Tomassetti
  2022-03-10 10:58 ` kernel test robot
  2022-03-10 11:18 ` kernel test robot
@ 2022-03-10 12:09 ` kernel test robot
  2 siblings, 0 replies; 17+ messages in thread
From: kernel test robot @ 2022-03-10 12:09 UTC (permalink / raw)
  To: Andrea Tomassetti, linux-bcache
  Cc: kbuild-all, Coly Li, Kent Overstreet, Zhang Zhen, Andrea Tomassetti

Hi Andrea,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.17-rc7 next-20220309]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Andrea-Tomassetti/bcache-Use-bcache-without-formatting-existing-device/20220310-165353
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 3bf7edc84a9eb4007dd9a0cb8878a7e1d5ec6a3b
config: i386-randconfig-s002 (https://download.01.org/0day-ci/archive/20220310/202203102007.cDiegien-lkp@intel.com/config)
compiler: gcc-9 (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.4-dirty
        # https://github.com/0day-ci/linux/commit/b5073e4ece2a86f002ca66fb1d864034c12be3e2
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Andrea-Tomassetti/bcache-Use-bcache-without-formatting-existing-device/20220310-165353
        git checkout b5073e4ece2a86f002ca66fb1d864034c12be3e2
        # save the config file to linux build tree
        mkdir build_dir
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=i386 SHELL=/bin/bash drivers/md/bcache/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)
>> drivers/md/bcache/control.c:18:6: sparse: sparse: symbol 'bch_service_ioctl_ctrl' was not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-03-10  8:52 Andrea Tomassetti
  2022-03-10 10:58 ` kernel test robot
@ 2022-03-10 11:18 ` kernel test robot
  2022-03-10 12:09 ` kernel test robot
  2 siblings, 0 replies; 17+ messages in thread
From: kernel test robot @ 2022-03-10 11:18 UTC (permalink / raw)
  To: Andrea Tomassetti, linux-bcache
  Cc: kbuild-all, Coly Li, Kent Overstreet, Zhang Zhen, Andrea Tomassetti

Hi Andrea,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.17-rc7 next-20220309]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Andrea-Tomassetti/bcache-Use-bcache-without-formatting-existing-device/20220310-165353
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 3bf7edc84a9eb4007dd9a0cb8878a7e1d5ec6a3b
config: i386-allyesconfig (https://download.01.org/0day-ci/archive/20220310/202203101939.OFoiVCSH-lkp@intel.com/config)
compiler: gcc-9 (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/b5073e4ece2a86f002ca66fb1d864034c12be3e2
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Andrea-Tomassetti/bcache-Use-bcache-without-formatting-existing-device/20220310-165353
        git checkout b5073e4ece2a86f002ca66fb1d864034c12be3e2
        # save the config file to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/md/bcache/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/md/bcache/control.c:18:6: warning: no previous prototype for 'bch_service_ioctl_ctrl' [-Wmissing-prototypes]
      18 | long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
         |      ^~~~~~~~~~~~~~~~~~~~~~


vim +/bch_service_ioctl_ctrl +18 drivers/md/bcache/control.c

    15	
    16	/* this handles IOCTL for /dev/bcache_ctrl */
    17	/*********************************************/
  > 18	long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
    19			unsigned long arg)
    20	{
    21		int retval = 0;
    22	
    23		if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
    24			return -EINVAL;
    25	
    26		if (!capable(CAP_SYS_ADMIN)) {
    27			/* Must be root to issue ioctls */
    28			return -EPERM;
    29		}
    30	
    31		switch (cmd) {
    32		case BCH_IOCTL_REGISTER_DEVICE: {
    33			struct bch_register_device *cmd_info;
    34	
    35			cmd_info = vmalloc(sizeof(struct bch_register_device));
    36			if (!cmd_info)
    37				return -ENOMEM;
    38	
    39			if (copy_from_user(cmd_info, (void __user *)arg,
    40					sizeof(struct bch_register_device))) {
    41				pr_err("Cannot copy cmd info from user space\n");
    42				vfree(cmd_info);
    43				return -EINVAL;
    44			}
    45	
    46			retval = register_bcache_ioctl(cmd_info);
    47	
    48			vfree(cmd_info);
    49			return retval;
    50		}
    51	
    52		default:
    53			return -EINVAL;
    54		}
    55	}
    56	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-03-10  8:52 Andrea Tomassetti
@ 2022-03-10 10:58 ` kernel test robot
  2022-03-10 11:18 ` kernel test robot
  2022-03-10 12:09 ` kernel test robot
  2 siblings, 0 replies; 17+ messages in thread
From: kernel test robot @ 2022-03-10 10:58 UTC (permalink / raw)
  To: Andrea Tomassetti, linux-bcache
  Cc: llvm, kbuild-all, Coly Li, Kent Overstreet, Zhang Zhen,
	Andrea Tomassetti

Hi Andrea,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.17-rc7 next-20220309]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Andrea-Tomassetti/bcache-Use-bcache-without-formatting-existing-device/20220310-165353
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 3bf7edc84a9eb4007dd9a0cb8878a7e1d5ec6a3b
config: riscv-buildonly-randconfig-r002-20220310 (https://download.01.org/0day-ci/archive/20220310/202203101836.DQDgIjIF-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 276ca87382b8f16a65bddac700202924228982f6)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/0day-ci/linux/commit/b5073e4ece2a86f002ca66fb1d864034c12be3e2
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Andrea-Tomassetti/bcache-Use-bcache-without-formatting-existing-device/20220310-165353
        git checkout b5073e4ece2a86f002ca66fb1d864034c12be3e2
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash drivers/md/bcache/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/md/bcache/control.c:18:6: warning: no previous prototype for function 'bch_service_ioctl_ctrl' [-Wmissing-prototypes]
   long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
        ^
   drivers/md/bcache/control.c:18:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
   ^
   static 
   1 warning generated.


vim +/bch_service_ioctl_ctrl +18 drivers/md/bcache/control.c

    15	
    16	/* this handles IOCTL for /dev/bcache_ctrl */
    17	/*********************************************/
  > 18	long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
    19			unsigned long arg)
    20	{
    21		int retval = 0;
    22	
    23		if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
    24			return -EINVAL;
    25	
    26		if (!capable(CAP_SYS_ADMIN)) {
    27			/* Must be root to issue ioctls */
    28			return -EPERM;
    29		}
    30	
    31		switch (cmd) {
    32		case BCH_IOCTL_REGISTER_DEVICE: {
    33			struct bch_register_device *cmd_info;
    34	
    35			cmd_info = vmalloc(sizeof(struct bch_register_device));
    36			if (!cmd_info)
    37				return -ENOMEM;
    38	
    39			if (copy_from_user(cmd_info, (void __user *)arg,
    40					sizeof(struct bch_register_device))) {
    41				pr_err("Cannot copy cmd info from user space\n");
    42				vfree(cmd_info);
    43				return -EINVAL;
    44			}
    45	
    46			retval = register_bcache_ioctl(cmd_info);
    47	
    48			vfree(cmd_info);
    49			return retval;
    50		}
    51	
    52		default:
    53			return -EINVAL;
    54		}
    55	}
    56	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] bcache: Use bcache without formatting existing device
@ 2022-03-10  8:52 Andrea Tomassetti
  2022-03-10 10:58 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Andrea Tomassetti @ 2022-03-10  8:52 UTC (permalink / raw)
  To: linux-bcache; +Cc: Coly Li, Kent Overstreet, Zhang Zhen, Andrea Tomassetti

v2: Fixed small typo

Introducing a bcache control device (/dev/bcache_ctrl)
that allows communicating to the driver from user space
via IOCTL. The only IOCTL command currently implemented
receives a struct cache_sb and uses it to register the
backing device.

Signed-off-by: Andrea Tomassetti <andrea.tomassetti@devo.com>
---
 drivers/md/bcache/Makefile      |   2 +-
 drivers/md/bcache/control.c     | 117 ++++++++++++++++++++++++++++++++
 drivers/md/bcache/control.h     |  12 ++++
 drivers/md/bcache/ioctl_codes.h |  19 ++++++
 drivers/md/bcache/super.c       |  62 ++++++++++++-----
 drivers/md/bcache/sysfs.c       |   4 ++
 drivers/md/bcache/writeback.h   |   2 +-
 7 files changed, 200 insertions(+), 18 deletions(-)
 create mode 100644 drivers/md/bcache/control.c
 create mode 100644 drivers/md/bcache/control.h
 create mode 100644 drivers/md/bcache/ioctl_codes.h

diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..46ed41baed7a 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)	+= bcache.o
 
 bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
-	util.o writeback.o features.o
+	util.o writeback.o features.o control.o
diff --git a/drivers/md/bcache/control.c b/drivers/md/bcache/control.c
new file mode 100644
index 000000000000..dad432613474
--- /dev/null
+++ b/drivers/md/bcache/control.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+
+#include "control.h"
+
+struct bch_ctrl_device {
+	struct cdev cdev;
+	struct class *class;
+	dev_t dev;
+};
+
+static struct bch_ctrl_device _control_device;
+
+/* this handles IOCTL for /dev/bcache_ctrl */
+/*********************************************/
+long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
+		unsigned long arg)
+{
+	int retval = 0;
+
+	if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		/* Must be root to issue ioctls */
+		return -EPERM;
+	}
+
+	switch (cmd) {
+	case BCH_IOCTL_REGISTER_DEVICE: {
+		struct bch_register_device *cmd_info;
+
+		cmd_info = vmalloc(sizeof(struct bch_register_device));
+		if (!cmd_info)
+			return -ENOMEM;
+
+		if (copy_from_user(cmd_info, (void __user *)arg,
+				sizeof(struct bch_register_device))) {
+			pr_err("Cannot copy cmd info from user space\n");
+			vfree(cmd_info);
+			return -EINVAL;
+		}
+
+		retval = register_bcache_ioctl(cmd_info);
+
+		vfree(cmd_info);
+		return retval;
+	}
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations _ctrl_dev_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = bch_service_ioctl_ctrl
+};
+
+int __init bch_ctrl_device_init(void)
+{
+	struct bch_ctrl_device *ctrl = &_control_device;
+	struct device *device;
+	int result = 0;
+
+	result = alloc_chrdev_region(&ctrl->dev, 0, 1, "bcache");
+	if (result) {
+		pr_err("Cannot allocate control chrdev number.\n");
+		goto error_alloc_chrdev_region;
+	}
+
+	cdev_init(&ctrl->cdev, &_ctrl_dev_fops);
+
+	result = cdev_add(&ctrl->cdev, ctrl->dev, 1);
+	if (result) {
+		pr_err("Cannot add control chrdev.\n");
+		goto error_cdev_add;
+	}
+
+	ctrl->class = class_create(THIS_MODULE, "bcache");
+	if (IS_ERR(ctrl->class)) {
+		pr_err("Cannot create control chrdev class.\n");
+		result = PTR_ERR(ctrl->class);
+		goto error_class_create;
+	}
+
+	device = device_create(ctrl->class, NULL, ctrl->dev, NULL,
+			"bcache_ctrl");
+	if (IS_ERR(device)) {
+		pr_err("Cannot create control chrdev.\n");
+		result = PTR_ERR(device);
+		goto error_device_create;
+	}
+
+	return result;
+
+error_device_create:
+	class_destroy(ctrl->class);
+error_class_create:
+	cdev_del(&ctrl->cdev);
+error_cdev_add:
+	unregister_chrdev_region(ctrl->dev, 1);
+error_alloc_chrdev_region:
+	return result;
+}
+
+void bch_ctrl_device_deinit(void)
+{
+	struct bch_ctrl_device *ctrl = &_control_device;
+
+	device_destroy(ctrl->class, ctrl->dev);
+	class_destroy(ctrl->class);
+	cdev_del(&ctrl->cdev);
+	unregister_chrdev_region(ctrl->dev, 1);
+}
diff --git a/drivers/md/bcache/control.h b/drivers/md/bcache/control.h
new file mode 100644
index 000000000000..3e4273db02a3
--- /dev/null
+++ b/drivers/md/bcache/control.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCACHE_CONTROL_H__
+#define __BCACHE_CONTROL_H__
+
+#include "ioctl_codes.h"
+
+int __init bch_ctrl_device_init(void);
+void bch_ctrl_device_deinit(void);
+
+ssize_t register_bcache_ioctl(struct bch_register_device *brd);
+
+#endif
diff --git a/drivers/md/bcache/ioctl_codes.h b/drivers/md/bcache/ioctl_codes.h
new file mode 100644
index 000000000000..f25e038bee30
--- /dev/null
+++ b/drivers/md/bcache/ioctl_codes.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCACHE_IOCTL_CODES_H__
+#define __BCACHE_IOCTL_CODES_H__
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+struct bch_register_device {
+	const char *dev_name;
+	size_t size;
+	struct cache_sb *sb;
+};
+
+#define BCH_IOCTL_MAGIC (0xBC)
+
+/** Start new cache instance, load cache or recover cache */
+#define BCH_IOCTL_REGISTER_DEVICE	_IOWR(BCH_IOCTL_MAGIC, 1, struct bch_register_device)
+
+#endif
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 140f35dc0c45..95db3785a6e0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -14,6 +14,7 @@
 #include "request.h"
 #include "writeback.h"
 #include "features.h"
+#include "control.h"
 
 #include <linux/blkdev.h>
 #include <linux/pagemap.h>
@@ -1069,7 +1070,7 @@ int bch_cached_dev_run(struct cached_dev *dc)
 		goto out;
 	}
 
-	if (!d->c &&
+	if (!d->c && dc->sb_disk &&
 	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
 		struct closure cl;
 
@@ -1259,9 +1260,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	 */
 
 	if (bch_is_zero(u->uuid, 16)) {
-		struct closure cl;
-
-		closure_init_stack(&cl);
 
 		memcpy(u->uuid, dc->sb.uuid, 16);
 		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
@@ -1271,8 +1269,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 		memcpy(dc->sb.set_uuid, c->set_uuid, 16);
 		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
 
-		bch_write_bdev_super(dc, &cl);
-		closure_sync(&cl);
+		if (dc->sb_disk) {
+			struct closure cl;
+
+			closure_init_stack(&cl);
+			bch_write_bdev_super(dc, &cl);
+			closure_sync(&cl);
+		}
+
 	} else {
 		u->last_reg = rtime;
 		bch_uuid_write(c);
@@ -2403,14 +2407,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 
 /* Global interfaces/init */
 
-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size);
 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
 					 struct kobj_attribute *attr,
 					 const char *buffer, size_t size);
 
-kobj_attribute_write(register,		register_bcache);
-kobj_attribute_write(register_quiet,	register_bcache);
+kobj_attribute_write(register,		register_bcache_sysfs);
+kobj_attribute_write(register_quiet,	register_bcache_sysfs);
 kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);
 
 static bool bch_is_open_backing(dev_t dev)
@@ -2465,7 +2469,8 @@ static void register_bdev_worker(struct work_struct *work)
 	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
 	if (!dc) {
 		fail = true;
-		put_page(virt_to_page(args->sb_disk));
+		if (args->sb_disk)
+			put_page(virt_to_page(args->sb_disk));
 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		goto out;
 	}
@@ -2495,7 +2500,8 @@ static void register_cache_worker(struct work_struct *work)
 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 	if (!ca) {
 		fail = true;
-		put_page(virt_to_page(args->sb_disk));
+		if (args->sb_disk)
+			put_page(virt_to_page(args->sb_disk));
 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		goto out;
 	}
@@ -2525,7 +2531,7 @@ static void register_device_async(struct async_reg_args *args)
 	queue_delayed_work(system_wq, &args->reg_work, 10);
 }
 
-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+static ssize_t register_bcache_common(void *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size)
 {
 	const char *err;
@@ -2587,9 +2593,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (set_blocksize(bdev, 4096))
 		goto out_blkdev_put;
 
-	err = read_super(sb, bdev, &sb_disk);
-	if (err)
-		goto out_blkdev_put;
+	if (!k) {
+		err = read_super(sb, bdev, &sb_disk);
+		if (err)
+			goto out_blkdev_put;
+	} else {
+		sb_disk =  NULL;
+		memcpy(sb, (struct cache_sb *)k, sizeof(struct cache_sb));
+	}
 
 	err = "failed to register device";
 
@@ -2651,7 +2662,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	return size;
 
 out_put_sb_page:
-	put_page(virt_to_page(sb_disk));
+	if (!k)
+		put_page(virt_to_page(sb_disk));
 out_blkdev_put:
 	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 out_free_sb:
@@ -2666,6 +2678,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	return ret;
 }
 
+static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
+			       const char *buffer, size_t size)
+{
+	return register_bcache_common(NULL, attr, buffer, size);
+}
+
+ssize_t register_bcache_ioctl(struct bch_register_device *brd)
+{
+	return register_bcache_common((void *)brd->sb, NULL, brd->dev_name, brd->size);
+}
+
+
 
 struct pdev {
 	struct list_head list;
@@ -2819,6 +2843,7 @@ static void bcache_exit(void)
 {
 	bch_debug_exit();
 	bch_request_exit();
+	bch_ctrl_device_deinit();
 	if (bcache_kobj)
 		kobject_put(bcache_kobj);
 	if (bcache_wq)
@@ -2918,6 +2943,11 @@ static int __init bcache_init(void)
 	bch_debug_init();
 	closure_debug_init();
 
+	if (bch_ctrl_device_init()) {
+		pr_err("Cannot initialize control device\n");
+		goto err;
+	}
+
 	bcache_is_reboot = false;
 
 	return 0;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 1f0dce30fa75..984cc97a1d55 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -379,6 +379,10 @@ STORE(__cached_dev)
 		if (v < 0)
 			return v;
 
+		// XXX(atom): Devices created by IOCTL don't support changing cache mode
+		if (!dc->sb_disk)
+			return -EINVAL;
+
 		if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) {
 			SET_BDEV_CACHE_MODE(&dc->sb, v);
 			bch_write_bdev_super(dc, NULL);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 02b2f9df73f6..bd7b95bd2da7 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -135,7 +135,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 {
 	if (!atomic_read(&dc->has_dirty) &&
 	    !atomic_xchg(&dc->has_dirty, 1)) {
-		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+		if (dc->sb_disk && BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
 			/* XXX: should do this synchronously */
 			bch_write_bdev_super(dc, NULL);
-- 
2.25.1


-- 







The contents of this email are confidential. If the reader of this 
message is not the intended recipient, you are hereby notified that any 
dissemination, distribution or copying of this communication is strictly 
prohibited. If you have received this communication in error, please notify 
us immediately by replying to this message and deleting it from your 
computer. Thank you. Devo, Inc; arco@devo.com <mailto:arco@devo.com>;  
Calle Estébanez Calderón 3-5, 5th Floor. Madrid, Spain 28020


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] bcache: Use bcache without formatting existing device
  2022-03-08 14:56 Andrea Tomassetti
@ 2022-03-09  1:41 ` Zhang Zhen
  0 siblings, 0 replies; 17+ messages in thread
From: Zhang Zhen @ 2022-03-09  1:41 UTC (permalink / raw)
  To: Andrea Tomassetti, linux-bcache; +Cc: Coly Li, Kent Overstreet

Hi Andrea,

Good job.
We need exactly this feature.
This feature stores the super block in a yaml configuration file.
Do I understand correctly?

Would you send out the udev rule and python script?
I want to try it.

Thanks.

On 3/8/22 10:56 PM, Andrea Tomassetti wrote:
> Introducing a bcache control device (/dev/bcache_ctrl)
> that allows communicating to the driver from user space
> via IOCTL. The only IOCTL command currently implemented
> receives a struct cache_sb and uses it to register the
> backing device.
> 
> Signed-off-by: Andrea Tomassetti <andrea.tomassetti@devo.com>
> ---
> Hi all,
> At Devo we started to think of using bcache in our production systems
> to boost performance. But, at the very beginning, we were faced with
> one annoying issue, at least for our use-case: bcache needs the backing
> devices to be formatted before being able to use them. What's really
> needed is just to wipe any FS signature out of them. This is definitely
> not an option we will consider in our production systems because we would
> need to move hundreds of terabytes of data.
> 
> To circumvent the "formatting" problem, in the past weeks I worked on some
> modifications to the actual bcache module. In particular, I added a bcache
> control device (exported to /dev/bcache_ctrl) that allows communicating to
> the driver from userspace via IOCTL. One of the IOCTL commands that I
> implemented receives a struct cache_sb and uses it to register the backing
> device. The modifications are really small and backward compatible. To then
> re-create the same configuration after every boot (because the backing
> devices now do not present the bcache super block anymore) I created an
> udev rule that invokes a python script that will re-create the same
> scenario based on a yaml configuration file.
> 
>   drivers/md/bcache/Makefile      |   2 +-
>   drivers/md/bcache/control.c     | 117 ++++++++++++++++++++++++++++++++
>   drivers/md/bcache/control.h     |  12 ++++
>   drivers/md/bcache/ioctl_codes.h |  19 ++++++
>   drivers/md/bcache/super.c       |  62 ++++++++++++-----
>   drivers/md/bcache/sysfs.c       |   4 ++
>   drivers/md/bcache/writeback.h   |   2 +-
>   7 files changed, 200 insertions(+), 18 deletions(-)
>   create mode 100644 drivers/md/bcache/control.c
>   create mode 100644 drivers/md/bcache/control.h
>   create mode 100644 drivers/md/bcache/ioctl_codes.h
> 
> diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
> index 5b87e59676b8..46ed41baed7a 100644
> --- a/drivers/md/bcache/Makefile
> +++ b/drivers/md/bcache/Makefile
> @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)	+= bcache.o
> 
>   bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
>   	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
> -	util.o writeback.o features.o
> +	util.o writeback.o features.o control.o
> diff --git a/drivers/md/bcache/control.c b/drivers/md/bcache/control.c
> new file mode 100644
> index 000000000000..3d04d2218171
> --- /dev/null
> +++ b/drivers/md/bcache/control.c
> @@ -0,0 +1,117 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cdev.h>
> +#include <linux/fs.h>
> +#include <linux/vmalloc.h>
> +
> +#include "control.h"
> +
> +struct bch_ctrl_device {
> +	struct cdev cdev;
> +	struct class *class;
> +	dev_t dev;
> +};
> +
> +static struct bch_ctrl_device _control_device;
> +
> +/* this handles IOCTL for /dev/bcache_ctrl */
> +/*********************************************/
> +long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
> +		unsigned long arg)
> +{
> +	int retval = 0;
> +
> +	if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
> +		return -EINVAL;
> +
> +	if (!capable(CAP_SYS_ADMIN)) {
> +		/* Must be root to issue ioctls */
> +		return -EPERM;
> +	}
> +
> +	switch (cmd) {
> +	case BCH_IOCTL_REGISTER_DEVICE: {
> +		struct bch_register_device *cmd_info;
> +
> +		cmd_info = vmalloc(sizeof(struct bch_register_device));
> +		if (!cmd_info)
> +			return -ENOMEM;
> +
> +		if (copy_from_user(cmd_info, (void __user *)arg,
> +				sizeof(struct bch_register_device))) {
> +			pr_err("Cannot copy cmd info from user space\n");
> +			vfree(cmd_info);
> +			return -EINVAL;
> +		}
> +
> +		retval = register_bcache_ioctl(cmd_info);
> +
> +		vfree(cmd_info);
> +		return result;
> +	}
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static const struct file_operations _ctrl_dev_fops = {
> +	.owner = THIS_MODULE,
> +	.unlocked_ioctl = bch_service_ioctl_ctrl
> +};
> +
> +int __init bch_ctrl_device_init(void)
> +{
> +	struct bch_ctrl_device *ctrl = &_control_device;
> +	struct device *device;
> +	int result = 0;
> +
> +	result = alloc_chrdev_region(&ctrl->dev, 0, 1, "bcache");
> +	if (result) {
> +		pr_err("Cannot allocate control chrdev number.\n");
> +		goto error_alloc_chrdev_region;
> +	}
> +
> +	cdev_init(&ctrl->cdev, &_ctrl_dev_fops);
> +
> +	result = cdev_add(&ctrl->cdev, ctrl->dev, 1);
> +	if (result) {
> +		pr_err("Cannot add control chrdev.\n");
> +		goto error_cdev_add;
> +	}
> +
> +	ctrl->class = class_create(THIS_MODULE, "bcache");
> +	if (IS_ERR(ctrl->class)) {
> +		pr_err("Cannot create control chrdev class.\n");
> +		result = PTR_ERR(ctrl->class);
> +		goto error_class_create;
> +	}
> +
> +	device = device_create(ctrl->class, NULL, ctrl->dev, NULL,
> +			"bcache_ctrl");
> +	if (IS_ERR(device)) {
> +		pr_err("Cannot create control chrdev.\n");
> +		result = PTR_ERR(device);
> +		goto error_device_create;
> +	}
> +
> +	return result;
> +
> +error_device_create:
> +	class_destroy(ctrl->class);
> +error_class_create:
> +	cdev_del(&ctrl->cdev);
> +error_cdev_add:
> +	unregister_chrdev_region(ctrl->dev, 1);
> +error_alloc_chrdev_region:
> +	return result;
> +}
> +
> +void bch_ctrl_device_deinit(void)
> +{
> +	struct bch_ctrl_device *ctrl = &_control_device;
> +
> +	device_destroy(ctrl->class, ctrl->dev);
> +	class_destroy(ctrl->class);
> +	cdev_del(&ctrl->cdev);
> +	unregister_chrdev_region(ctrl->dev, 1);
> +}
> diff --git a/drivers/md/bcache/control.h b/drivers/md/bcache/control.h
> new file mode 100644
> index 000000000000..3e4273db02a3
> --- /dev/null
> +++ b/drivers/md/bcache/control.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BCACHE_CONTROL_H__
> +#define __BCACHE_CONTROL_H__
> +
> +#include "ioctl_codes.h"
> +
> +int __init bch_ctrl_device_init(void);
> +void bch_ctrl_device_deinit(void);
> +
> +ssize_t register_bcache_ioctl(struct bch_register_device *brd);
> +
> +#endif
> diff --git a/drivers/md/bcache/ioctl_codes.h b/drivers/md/bcache/ioctl_codes.h
> new file mode 100644
> index 000000000000..f25e038bee30
> --- /dev/null
> +++ b/drivers/md/bcache/ioctl_codes.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BCACHE_IOCTL_CODES_H__
> +#define __BCACHE_IOCTL_CODES_H__
> +
> +#include <linux/ioctl.h>
> +#include <linux/types.h>
> +
> +struct bch_register_device {
> +	const char *dev_name;
> +	size_t size;
> +	struct cache_sb *sb;
> +};
> +
> +#define BCH_IOCTL_MAGIC (0xBC)
> +
> +/** Start new cache instance, load cache or recover cache */
> +#define BCH_IOCTL_REGISTER_DEVICE	_IOWR(BCH_IOCTL_MAGIC, 1, struct bch_register_device)
> +
> +#endif
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 140f35dc0c45..95db3785a6e0 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -14,6 +14,7 @@
>   #include "request.h"
>   #include "writeback.h"
>   #include "features.h"
> +#include "control.h"
> 
>   #include <linux/blkdev.h>
>   #include <linux/pagemap.h>
> @@ -1069,7 +1070,7 @@ int bch_cached_dev_run(struct cached_dev *dc)
>   		goto out;
>   	}
> 
> -	if (!d->c &&
> +	if (!d->c && dc->sb_disk &&
>   	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
>   		struct closure cl;
> 
> @@ -1259,9 +1260,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
>   	 */
> 
>   	if (bch_is_zero(u->uuid, 16)) {
> -		struct closure cl;
> -
> -		closure_init_stack(&cl);
> 
>   		memcpy(u->uuid, dc->sb.uuid, 16);
>   		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
> @@ -1271,8 +1269,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
>   		memcpy(dc->sb.set_uuid, c->set_uuid, 16);
>   		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
> 
> -		bch_write_bdev_super(dc, &cl);
> -		closure_sync(&cl);
> +		if (dc->sb_disk) {
> +			struct closure cl;
> +
> +			closure_init_stack(&cl);
> +			bch_write_bdev_super(dc, &cl);
> +			closure_sync(&cl);
> +		}
> +
>   	} else {
>   		u->last_reg = rtime;
>   		bch_uuid_write(c);
> @@ -2403,14 +2407,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
> 
>   /* Global interfaces/init */
> 
> -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> +static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
>   			       const char *buffer, size_t size);
>   static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
>   					 struct kobj_attribute *attr,
>   					 const char *buffer, size_t size);
> 
> -kobj_attribute_write(register,		register_bcache);
> -kobj_attribute_write(register_quiet,	register_bcache);
> +kobj_attribute_write(register,		register_bcache_sysfs);
> +kobj_attribute_write(register_quiet,	register_bcache_sysfs);
>   kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);
> 
>   static bool bch_is_open_backing(dev_t dev)
> @@ -2465,7 +2469,8 @@ static void register_bdev_worker(struct work_struct *work)
>   	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
>   	if (!dc) {
>   		fail = true;
> -		put_page(virt_to_page(args->sb_disk));
> +		if (args->sb_disk)
> +			put_page(virt_to_page(args->sb_disk));
>   		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
>   		goto out;
>   	}
> @@ -2495,7 +2500,8 @@ static void register_cache_worker(struct work_struct *work)
>   	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
>   	if (!ca) {
>   		fail = true;
> -		put_page(virt_to_page(args->sb_disk));
> +		if (args->sb_disk)
> +			put_page(virt_to_page(args->sb_disk));
>   		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
>   		goto out;
>   	}
> @@ -2525,7 +2531,7 @@ static void register_device_async(struct async_reg_args *args)
>   	queue_delayed_work(system_wq, &args->reg_work, 10);
>   }
> 
> -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> +static ssize_t register_bcache_common(void *k, struct kobj_attribute *attr,
>   			       const char *buffer, size_t size)
>   {
>   	const char *err;
> @@ -2587,9 +2593,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
>   	if (set_blocksize(bdev, 4096))
>   		goto out_blkdev_put;
> 
> -	err = read_super(sb, bdev, &sb_disk);
> -	if (err)
> -		goto out_blkdev_put;
> +	if (!k) {
> +		err = read_super(sb, bdev, &sb_disk);
> +		if (err)
> +			goto out_blkdev_put;
> +	} else {
> +		sb_disk =  NULL;
> +		memcpy(sb, (struct cache_sb *)k, sizeof(struct cache_sb));
> +	}
> 
>   	err = "failed to register device";
> 
> @@ -2651,7 +2662,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
>   	return size;
> 
>   out_put_sb_page:
> -	put_page(virt_to_page(sb_disk));
> +	if (!k)
> +		put_page(virt_to_page(sb_disk));
>   out_blkdev_put:
>   	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
>   out_free_sb:
> @@ -2666,6 +2678,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
>   	return ret;
>   }
> 
> +static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
> +			       const char *buffer, size_t size)
> +{
> +	return register_bcache_common(NULL, attr, buffer, size);
> +}
> +
> +ssize_t register_bcache_ioctl(struct bch_register_device *brd)
> +{
> +	return register_bcache_common((void *)brd->sb, NULL, brd->dev_name, brd->size);
> +}
> +
> +
> 
>   struct pdev {
>   	struct list_head list;
> @@ -2819,6 +2843,7 @@ static void bcache_exit(void)
>   {
>   	bch_debug_exit();
>   	bch_request_exit();
> +	bch_ctrl_device_deinit();
>   	if (bcache_kobj)
>   		kobject_put(bcache_kobj);
>   	if (bcache_wq)
> @@ -2918,6 +2943,11 @@ static int __init bcache_init(void)
>   	bch_debug_init();
>   	closure_debug_init();
> 
> +	if (bch_ctrl_device_init()) {
> +		pr_err("Cannot initialize control device\n");
> +		goto err;
> +	}
> +
>   	bcache_is_reboot = false;
> 
>   	return 0;
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index 1f0dce30fa75..984cc97a1d55 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -379,6 +379,10 @@ STORE(__cached_dev)
>   		if (v < 0)
>   			return v;
> 
> +		// XXX(atom): Devices created by IOCTL don't support changing cache mode
> +		if (!dc->sb_disk)
> +			return -EINVAL;
> +
>   		if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) {
>   			SET_BDEV_CACHE_MODE(&dc->sb, v);
>   			bch_write_bdev_super(dc, NULL);
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index 02b2f9df73f6..bd7b95bd2da7 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -135,7 +135,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
>   {
>   	if (!atomic_read(&dc->has_dirty) &&
>   	    !atomic_xchg(&dc->has_dirty, 1)) {
> -		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
> +		if (dc->sb_disk && BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
>   			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
>   			/* XXX: should do this synchronously */
>   			bch_write_bdev_super(dc, NULL);
> --
> 2.25.1
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] bcache: Use bcache without formatting existing device
@ 2022-03-08 14:56 Andrea Tomassetti
  2022-03-09  1:41 ` Zhang Zhen
  0 siblings, 1 reply; 17+ messages in thread
From: Andrea Tomassetti @ 2022-03-08 14:56 UTC (permalink / raw)
  To: linux-bcache; +Cc: Coly Li, Kent Overstreet, Andrea Tomassetti

Introducing a bcache control device (/dev/bcache_ctrl)
that allows communicating with the driver from user space
via IOCTL. The only IOCTL command currently implemented
receives a struct cache_sb and uses it to register the
backing device.

Signed-off-by: Andrea Tomassetti <andrea.tomassetti@devo.com>
---
Hi all,
At Devo we started to think of using bcache in our production systems
to boost performance. But, at the very beginning, we were faced with
one annoying issue, at least for our use-case: bcache needs the backing
devices to be formatted before being able to use them. What's really
needed is just to wipe any FS signature out of them. This is definitely
not an option we will consider in our production systems because we would
need to move hundreds of terabytes of data.

To circumvent the "formatting" problem, in the past weeks I worked on some
modifications to the actual bcache module. In particular, I added a bcache
control device (exported to /dev/bcache_ctrl) that allows communicating to
the driver from userspace via IOCTL. One of the IOCTL commands that I
implemented receives a struct cache_sb and uses it to register the backing
device. The modifications are really small and backward compatible. To then
re-create the same configuration after every boot (because the backing
devices now do not present the bcache super block anymore) I created a
udev rule that invokes a python script that will re-create the same
scenario based on a yaml configuration file.

 drivers/md/bcache/Makefile      |   2 +-
 drivers/md/bcache/control.c     | 117 ++++++++++++++++++++++++++++++++
 drivers/md/bcache/control.h     |  12 ++++
 drivers/md/bcache/ioctl_codes.h |  19 ++++++
 drivers/md/bcache/super.c       |  62 ++++++++++++-----
 drivers/md/bcache/sysfs.c       |   4 ++
 drivers/md/bcache/writeback.h   |   2 +-
 7 files changed, 200 insertions(+), 18 deletions(-)
 create mode 100644 drivers/md/bcache/control.c
 create mode 100644 drivers/md/bcache/control.h
 create mode 100644 drivers/md/bcache/ioctl_codes.h

diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..46ed41baed7a 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)	+= bcache.o

 bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
-	util.o writeback.o features.o
+	util.o writeback.o features.o control.o
diff --git a/drivers/md/bcache/control.c b/drivers/md/bcache/control.c
new file mode 100644
index 000000000000..3d04d2218171
--- /dev/null
+++ b/drivers/md/bcache/control.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+
+#include "control.h"
+
+/* State for the /dev/bcache_ctrl control character device. */
+struct bch_ctrl_device {
+	struct cdev cdev;	/* the character device itself */
+	struct class *class;	/* sysfs class used by device_create() */
+	dev_t dev;		/* major/minor allocated at init time */
+};
+
+/* Single module-wide instance; only one control device is ever created. */
+static struct bch_ctrl_device _control_device;
+
+/*
+ * IOCTL entry point for /dev/bcache_ctrl.
+ *
+ * Validates the ioctl magic number and caller privileges, then dispatches
+ * the command.  Returns 0 on success or a negative errno.
+ */
+long bch_service_ioctl_ctrl(struct file *filp, unsigned int cmd,
+		unsigned long arg)
+{
+	int retval = 0;
+
+	/* -ENOTTY is the conventional errno for "not my ioctl". */
+	if (_IOC_TYPE(cmd) != BCH_IOCTL_MAGIC)
+		return -ENOTTY;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		/* Must be root to issue ioctls */
+		return -EPERM;
+	}
+
+	switch (cmd) {
+	case BCH_IOCTL_REGISTER_DEVICE: {
+		struct bch_register_device *cmd_info;
+		struct cache_sb *sb;
+		char *dev_name;
+
+		cmd_info = vmalloc(sizeof(struct bch_register_device));
+		if (!cmd_info)
+			return -ENOMEM;
+
+		if (copy_from_user(cmd_info, (void __user *)arg,
+				sizeof(struct bch_register_device))) {
+			pr_err("Cannot copy cmd info from user space\n");
+			vfree(cmd_info);
+			return -EFAULT;
+		}
+
+		/*
+		 * cmd_info->sb and cmd_info->dev_name are still user-space
+		 * pointers after the struct copy above; bring both into
+		 * kernel memory before anything dereferences them.
+		 */
+		sb = vmalloc(sizeof(struct cache_sb));
+		if (!sb) {
+			vfree(cmd_info);
+			return -ENOMEM;
+		}
+		if (copy_from_user(sb, (void __user *)cmd_info->sb,
+				sizeof(struct cache_sb))) {
+			pr_err("Cannot copy cache_sb from user space\n");
+			vfree(sb);
+			vfree(cmd_info);
+			return -EFAULT;
+		}
+		dev_name = strndup_user((const char __user *)cmd_info->dev_name,
+				PATH_MAX);
+		if (IS_ERR(dev_name)) {
+			retval = PTR_ERR(dev_name);
+			vfree(sb);
+			vfree(cmd_info);
+			return retval;
+		}
+		cmd_info->sb = sb;
+		cmd_info->dev_name = dev_name;
+
+		retval = register_bcache_ioctl(cmd_info);
+
+		kfree(dev_name);
+		vfree(sb);
+		vfree(cmd_info);
+		return retval;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+/*
+ * File operations for /dev/bcache_ctrl; only unlocked_ioctl is supported.
+ * NOTE(review): no .compat_ioctl is set, so 32-bit userspace on a 64-bit
+ * kernel cannot issue these ioctls; the ABI struct also embeds raw
+ * pointers whose size differs between 32- and 64-bit processes - confirm
+ * whether compat support is required.
+ */
+static const struct file_operations _ctrl_dev_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = bch_service_ioctl_ctrl
+};
+
+/*
+ * Create the /dev/bcache_ctrl character device.
+ *
+ * Allocates a chrdev region, registers the cdev, creates the "bcache"
+ * class and finally the device node.  On failure, the steps already
+ * performed are unwound in reverse order via the goto ladder.
+ * Returns 0 on success or a negative errno.
+ */
+int __init bch_ctrl_device_init(void)
+{
+	struct bch_ctrl_device *ctrl = &_control_device;
+	struct device *device;
+	int result = 0;
+
+	/* Reserve one minor under a dynamically allocated major. */
+	result = alloc_chrdev_region(&ctrl->dev, 0, 1, "bcache");
+	if (result) {
+		pr_err("Cannot allocate control chrdev number.\n");
+		goto error_alloc_chrdev_region;
+	}
+
+	cdev_init(&ctrl->cdev, &_ctrl_dev_fops);
+
+	/* The cdev becomes live (open/ioctl possible) from this point on. */
+	result = cdev_add(&ctrl->cdev, ctrl->dev, 1);
+	if (result) {
+		pr_err("Cannot add control chrdev.\n");
+		goto error_cdev_add;
+	}
+
+	ctrl->class = class_create(THIS_MODULE, "bcache");
+	if (IS_ERR(ctrl->class)) {
+		pr_err("Cannot create control chrdev class.\n");
+		result = PTR_ERR(ctrl->class);
+		goto error_class_create;
+	}
+
+	/* Creates the /dev/bcache_ctrl node (via udev). */
+	device = device_create(ctrl->class, NULL, ctrl->dev, NULL,
+			"bcache_ctrl");
+	if (IS_ERR(device)) {
+		pr_err("Cannot create control chrdev.\n");
+		result = PTR_ERR(device);
+		goto error_device_create;
+	}
+
+	return result;
+
+	/* Error unwinding, strictly in reverse order of construction. */
+error_device_create:
+	class_destroy(ctrl->class);
+error_class_create:
+	cdev_del(&ctrl->cdev);
+error_cdev_add:
+	unregister_chrdev_region(ctrl->dev, 1);
+error_alloc_chrdev_region:
+	return result;
+}
+
+/*
+ * Tear down /dev/bcache_ctrl: destroy the device node, its class, the
+ * cdev, and release the chrdev region (reverse order of creation).
+ * NOTE(review): called unconditionally from bcache_exit(); this is only
+ * safe if bch_ctrl_device_init() succeeded - confirm the module init
+ * error path never reaches bcache_exit() with a partially set-up device.
+ */
+void bch_ctrl_device_deinit(void)
+{
+	struct bch_ctrl_device *ctrl = &_control_device;
+
+	device_destroy(ctrl->class, ctrl->dev);
+	class_destroy(ctrl->class);
+	cdev_del(&ctrl->cdev);
+	unregister_chrdev_region(ctrl->dev, 1);
+}
diff --git a/drivers/md/bcache/control.h b/drivers/md/bcache/control.h
new file mode 100644
index 000000000000..3e4273db02a3
--- /dev/null
+++ b/drivers/md/bcache/control.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCACHE_CONTROL_H__
+#define __BCACHE_CONTROL_H__
+
+#include "ioctl_codes.h"
+
+/* Lifecycle of the /dev/bcache_ctrl character device (control.c). */
+int __init bch_ctrl_device_init(void);
+void bch_ctrl_device_deinit(void);
+
+/* Implemented in super.c: register a backing device from ioctl data. */
+ssize_t register_bcache_ioctl(struct bch_register_device *brd);
+
+#endif
diff --git a/drivers/md/bcache/ioctl_codes.h b/drivers/md/bcache/ioctl_codes.h
new file mode 100644
index 000000000000..f25e038bee30
--- /dev/null
+++ b/drivers/md/bcache/ioctl_codes.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCACHE_IOCTL_CODES_H__
+#define __BCACHE_IOCTL_CODES_H__
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Forward declaration: this header only passes a pointer around. */
+struct cache_sb;
+
+/*
+ * Argument for BCH_IOCTL_REGISTER_DEVICE.
+ * NOTE(review): this struct is a userspace ABI, yet it embeds raw
+ * pointers and size_t, whose sizes differ between 32- and 64-bit
+ * processes.  Consider fixed-width __u64 fields for a stable ABI.
+ */
+struct bch_register_device {
+	const char *dev_name;	/* path of the backing block device */
+	size_t size;		/* presumably the length of dev_name - verify against caller */
+	struct cache_sb *sb;	/* caller-supplied superblock to register with */
+};
+
+#define BCH_IOCTL_MAGIC (0xBC)
+
+/** Register a backing device using a caller-supplied superblock */
+#define BCH_IOCTL_REGISTER_DEVICE	_IOWR(BCH_IOCTL_MAGIC, 1, struct bch_register_device)
+
+#endif
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 140f35dc0c45..95db3785a6e0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -14,6 +14,7 @@
 #include "request.h"
 #include "writeback.h"
 #include "features.h"
+#include "control.h"

 #include <linux/blkdev.h>
 #include <linux/pagemap.h>
@@ -1069,7 +1070,7 @@ int bch_cached_dev_run(struct cached_dev *dc)
 		goto out;
 	}

-	if (!d->c &&
+	if (!d->c && dc->sb_disk &&
 	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
 		struct closure cl;

@@ -1259,9 +1260,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	 */

 	if (bch_is_zero(u->uuid, 16)) {
-		struct closure cl;
-
-		closure_init_stack(&cl);

 		memcpy(u->uuid, dc->sb.uuid, 16);
 		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
@@ -1271,8 +1269,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 		memcpy(dc->sb.set_uuid, c->set_uuid, 16);
 		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

-		bch_write_bdev_super(dc, &cl);
-		closure_sync(&cl);
+		if (dc->sb_disk) {
+			struct closure cl;
+
+			closure_init_stack(&cl);
+			bch_write_bdev_super(dc, &cl);
+			closure_sync(&cl);
+		}
+
 	} else {
 		u->last_reg = rtime;
 		bch_uuid_write(c);
@@ -2403,14 +2407,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,

 /* Global interfaces/init */

-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size);
 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
 					 struct kobj_attribute *attr,
 					 const char *buffer, size_t size);

-kobj_attribute_write(register,		register_bcache);
-kobj_attribute_write(register_quiet,	register_bcache);
+kobj_attribute_write(register,		register_bcache_sysfs);
+kobj_attribute_write(register_quiet,	register_bcache_sysfs);
 kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);

 static bool bch_is_open_backing(dev_t dev)
@@ -2465,7 +2469,8 @@ static void register_bdev_worker(struct work_struct *work)
 	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
 	if (!dc) {
 		fail = true;
-		put_page(virt_to_page(args->sb_disk));
+		if (args->sb_disk)
+			put_page(virt_to_page(args->sb_disk));
 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		goto out;
 	}
@@ -2495,7 +2500,8 @@ static void register_cache_worker(struct work_struct *work)
 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 	if (!ca) {
 		fail = true;
-		put_page(virt_to_page(args->sb_disk));
+		if (args->sb_disk)
+			put_page(virt_to_page(args->sb_disk));
 		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		goto out;
 	}
@@ -2525,7 +2531,7 @@ static void register_device_async(struct async_reg_args *args)
 	queue_delayed_work(system_wq, &args->reg_work, 10);
 }

-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+static ssize_t register_bcache_common(void *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size)
 {
 	const char *err;
@@ -2587,9 +2593,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (set_blocksize(bdev, 4096))
 		goto out_blkdev_put;

-	err = read_super(sb, bdev, &sb_disk);
-	if (err)
-		goto out_blkdev_put;
+	if (!k) {
+		err = read_super(sb, bdev, &sb_disk);
+		if (err)
+			goto out_blkdev_put;
+	} else {
+		sb_disk =  NULL;
+		memcpy(sb, (struct cache_sb *)k, sizeof(struct cache_sb));
+	}

 	err = "failed to register device";

@@ -2651,7 +2662,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	return size;

 out_put_sb_page:
-	put_page(virt_to_page(sb_disk));
+	if (!k)
+		put_page(virt_to_page(sb_disk));
 out_blkdev_put:
 	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 out_free_sb:
@@ -2666,6 +2678,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	return ret;
 }

+/*
+ * sysfs "register"/"register_quiet" store handler.  Passes NULL as the
+ * first argument so register_bcache_common() reads the superblock from
+ * the device itself (read_super path).
+ */
+static ssize_t register_bcache_sysfs(struct kobject *k, struct kobj_attribute *attr,
+			       const char *buffer, size_t size)
+{
+	return register_bcache_common(NULL, attr, buffer, size);
+}
+
+/*
+ * ioctl-path registration: the caller supplies the superblock instead of
+ * it being read from the device.
+ * NOTE(review): brd->sb and brd->dev_name must already be kernel-space
+ * pointers here; if the struct was copied verbatim from userspace they
+ * still point into the user address space - confirm the ioctl handler
+ * copies them in before calling this.
+ */
+ssize_t register_bcache_ioctl(struct bch_register_device *brd)
+{
+	return register_bcache_common((void *)brd->sb, NULL, brd->dev_name, brd->size);
+}
+
+

 struct pdev {
 	struct list_head list;
@@ -2819,6 +2843,7 @@ static void bcache_exit(void)
 {
 	bch_debug_exit();
 	bch_request_exit();
+	bch_ctrl_device_deinit();
 	if (bcache_kobj)
 		kobject_put(bcache_kobj);
 	if (bcache_wq)
@@ -2918,6 +2943,11 @@ static int __init bcache_init(void)
 	bch_debug_init();
 	closure_debug_init();

+	if (bch_ctrl_device_init()) {
+		pr_err("Cannot initialize control device\n");
+		goto err;
+	}
+
 	bcache_is_reboot = false;

 	return 0;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 1f0dce30fa75..984cc97a1d55 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -379,6 +379,10 @@ STORE(__cached_dev)
 		if (v < 0)
 			return v;

+		// XXX(atom): Devices created by IOCTL don't support changing cache mode
+		if (!dc->sb_disk)
+			return -EINVAL;
+
 		if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) {
 			SET_BDEV_CACHE_MODE(&dc->sb, v);
 			bch_write_bdev_super(dc, NULL);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 02b2f9df73f6..bd7b95bd2da7 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -135,7 +135,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 {
 	if (!atomic_read(&dc->has_dirty) &&
 	    !atomic_xchg(&dc->has_dirty, 1)) {
-		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+		if (dc->sb_disk && BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
 			/* XXX: should do this synchronously */
 			bch_write_bdev_super(dc, NULL);
--
2.25.1


-- 







The contents of this email are confidential. If the reader of this 
message is not the intended recipient, you are hereby notified that any 
dissemination, distribution or copying of this communication is strictly 
prohibited. If you have received this communication in error, please notify 
us immediately by replying to this message and deleting it from your 
computer. Thank you. Devo, Inc; arco@devo.com <mailto:arco@devo.com>;  
Calle Estébanez Calderón 3-5, 5th Floor. Madrid, Spain 28020


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2022-07-20 14:46 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-04 15:13 [PATCH] bcache: Use bcache without formatting existing device Andrea Tomassetti
2022-07-04 15:29 ` Coly Li
2022-07-05  8:48   ` Andrea Tomassetti
2022-07-05 14:04     ` Coly Li
2022-07-05 22:03       ` Eric Wheeler
2022-07-07  9:56         ` Andrea Tomassetti
2022-07-12 20:29           ` Eric Wheeler
2022-07-20  8:06             ` Andrea Tomassetti
2022-07-20 13:31               ` Coly Li
2022-07-20 14:23                 ` Andrea Tomassetti
2022-07-20 14:46                   ` Coly Li
  -- strict thread matches above, loose matches on Subject: below --
2022-03-10  8:52 Andrea Tomassetti
2022-03-10 10:58 ` kernel test robot
2022-03-10 11:18 ` kernel test robot
2022-03-10 12:09 ` kernel test robot
2022-03-08 14:56 Andrea Tomassetti
2022-03-09  1:41 ` Zhang Zhen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.