All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
@ 2013-08-09 17:48 Frank Mayhar
  2013-08-13 16:41 ` Frank Mayhar
                   ` (4 more replies)
  0 siblings, 5 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-09 17:48 UTC (permalink / raw)
  To: linux-kernel

The device mapper and some of its modules allocate memory pools at
various points when setting up a device.  In some cases, these pools are
fairly large, for example the multipath module allocates a 256-entry
pool and the dm itself allocates three of that size.  In a
memory-constrained environment where we're creating a lot of these
devices, the memory use can quickly become significant.  Unfortunately,
there's currently no way to change the size of the pools other than by
changing a constant and rebuilding the kernel.

This patch fixes that by changing the hardcoded MIN_IOS (and certain
other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
sysctl-modifiable values.  This lets us change the size of these pools
on the fly, so we can reduce the size of the pools and reduce memory
pressure.

We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
and dm-mpath, from a value of 32 all the way down to zero.  Bearing in
mind that the underlying devices were network-based, we saw essentially
no performance degradation; if there was any, it was down in the noise.
One might wonder why these sizes are the way they are; I investigated
and they've been unchanged since at least 2006.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
--- 
 drivers/md/dm-crypt.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/dm-io.c    | 58 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm-snap.c  | 45 +++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c       | 41 ++++++++++++++++++++++++++++++---
 5 files changed, 244 insertions(+), 11 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41a..d68f10a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -178,12 +178,55 @@ struct crypt_config {
 #define MIN_IOS        16
 #define MIN_POOL_PAGES 32
 
+static int sysctl_dm_crypt_min_ios = MIN_IOS;
+static int sysctl_dm_crypt_min_pool_pages = MIN_POOL_PAGES;
+
 static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
+static struct ctl_table_header *dm_crypt_table_header;
+static ctl_table dm_crypt_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_crypt_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_pool_pages",
+		.data		= &sysctl_dm_crypt_min_pool_pages,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_crypt_dir_table[] = {
+	{ .procname	= "crypt",
+	  .mode		= 0555,
+	  .child	= dm_crypt_ctl_table },
+	{ }
+};
+
+static ctl_table dm_crypt_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_crypt_dir_table },
+	{ }
+};
+
+static ctl_table dm_crypt_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_crypt_dm_dir_table },
+	{ }
+};
+
 static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
 {
 	return this_cpu_ptr(cc->cpu);
@@ -1571,7 +1614,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 
 	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
+	cc->io_pool = mempool_create_slab_pool(sysctl_dm_crypt_min_ios,
+					       _crypt_io_pool);
 	if (!cc->io_pool) {
 		ti->error = "Cannot allocate crypt io mempool";
 		goto bad;
@@ -1583,20 +1627,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
-	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
+	cc->req_pool = mempool_create_kmalloc_pool(sysctl_dm_crypt_min_ios,
+			cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + cc->iv_size);
 	if (!cc->req_pool) {
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad;
 	}
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(sysctl_dm_crypt_min_pool_pages,
+						 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0);
+	cc->bs = bioset_create(sysctl_dm_crypt_min_ios, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
@@ -1851,11 +1897,20 @@ static int __init dm_crypt_init(void)
 		kmem_cache_destroy(_crypt_io_pool);
 	}
 
+	dm_crypt_table_header = register_sysctl_table(dm_crypt_root_table);
+	if (!dm_crypt_table_header) {
+		DMERR("failed to register sysctl");
+		dm_unregister_target(&crypt_target);
+		kmem_cache_destroy(_crypt_io_pool);
+		return -ENOMEM;
+	}
+
 	return r;
 }
 
 static void __exit dm_crypt_exit(void)
 {
+	unregister_sysctl_table(dm_crypt_table_header);
 	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..8c45c60 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,9 @@
 #define MIN_IOS		16
 #define MIN_BIOS	16
 
+static int sysctl_dm_io_min_ios = MIN_IOS;
+static int sysctl_dm_io_min_bios = MIN_BIOS;
+
 struct dm_io_client {
 	mempool_t *pool;
 	struct bio_set *bios;
@@ -44,6 +47,46 @@ struct io {
 
 static struct kmem_cache *_dm_io_cache;
 
+static struct ctl_table_header *dm_io_table_header;
+static ctl_table dm_io_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_io_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_bios",
+		.data		= &sysctl_dm_io_min_bios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_io_dir_table[] = {
+	{ .procname	= "io",
+	  .mode		= 0555,
+	  .child	= dm_io_ctl_table },
+	{ }
+};
+
+static ctl_table dm_io_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_io_dir_table },
+	{ }
+};
+
+static ctl_table dm_io_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_io_dm_dir_table },
+	{ }
+};
+
 /*
  * Create a client with mempool and bioset.
  */
@@ -55,11 +98,12 @@ struct dm_io_client *dm_io_client_create(void)
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(sysctl_dm_io_min_ios,
+						_dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(sysctl_dm_io_min_bios, 0);
 	if (!client->bios)
 		goto bad;
 
@@ -515,11 +559,21 @@ int __init dm_io_init(void)
 	if (!_dm_io_cache)
 		return -ENOMEM;
 
+	dm_io_table_header = register_sysctl_table(dm_io_root_table);
+	if (!dm_io_table_header) {
+		DMERR("failed to register sysctl");
+		kmem_cache_destroy(_dm_io_cache);
+		_dm_io_cache = NULL;
+		return -ENOMEM;
+	}
+
+
 	return 0;
 }
 
 void dm_io_exit(void)
 {
+	unregister_sysctl_table(dm_io_table_header);
 	kmem_cache_destroy(_dm_io_cache);
 	_dm_io_cache = NULL;
 }
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5adede1..099af1b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -118,6 +118,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);
 
 #define MIN_IOS 256	/* Mempool size */
 
+static int sysctl_dm_mpath_min_ios = MIN_IOS;
+
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -125,6 +127,38 @@ static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
 
+static struct ctl_table_header *dm_mpath_table_header;
+static ctl_table dm_mpath_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_mpath_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_mpath_dir_table[] = {
+	{ .procname	= "mpath",
+	  .mode		= 0555,
+	  .child	= dm_mpath_ctl_table },
+	{ }
+};
+
+static ctl_table dm_mpath_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_mpath_dir_table },
+	{ }
+};
+
+static ctl_table dm_mpath_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_mpath_dm_dir_table },
+	{ }
+};
 
 /*-----------------------------------------------
  * Allocation routines
@@ -202,7 +236,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(sysctl_dm_mpath_min_ios,
+							_mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
@@ -1745,6 +1780,15 @@ static int __init dm_multipath_init(void)
 		kmem_cache_destroy(_mpio_cache);
 		return -ENOMEM;
 	}
+	dm_mpath_table_header = register_sysctl_table(dm_mpath_root_table);
+	if (!dm_mpath_table_header) {
+		DMERR("failed to register sysctl");
+		destroy_workqueue(kmpath_handlerd);
+		destroy_workqueue(kmultipathd);
+		dm_unregister_target(&multipath_target);
+		kmem_cache_destroy(_mpio_cache);
+		return -ENOMEM;
+	}
 
 	DMINFO("version %u.%u.%u loaded",
 	       multipath_target.version[0], multipath_target.version[1],
@@ -1755,6 +1799,8 @@ static int __init dm_multipath_init(void)
 
 static void __exit dm_multipath_exit(void)
 {
+	unregister_sysctl_table(dm_mpath_table_header);
+
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5a..723b3c2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -33,11 +33,45 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
  * The size of the mempool used to track chunks in use.
  */
 #define MIN_IOS 256
+static int sysctl_dm_snap_min_ios = MIN_IOS;
 
 #define DM_TRACKED_CHUNK_HASH_SIZE	16
 #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
 					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
 
+static struct ctl_table_header *dm_snap_table_header;
+static ctl_table dm_snap_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_snap_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_snap_dir_table[] = {
+	{ .procname	= "snap",
+	  .mode		= 0555,
+	  .child	= dm_snap_ctl_table },
+	{ }
+};
+
+static ctl_table dm_snap_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_snap_dir_table },
+	{ }
+};
+
+static ctl_table dm_snap_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_snap_dm_dir_table },
+	{ }
+};
+
 struct dm_exception_table {
 	uint32_t hash_mask;
 	unsigned hash_shift;
@@ -1118,7 +1152,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_kcopyd;
 	}
 
-	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
+	s->pending_pool = mempool_create_slab_pool(sysctl_dm_snap_min_ios,
+											   pending_cache);
 	if (!s->pending_pool) {
 		ti->error = "Could not allocate mempool for pending exceptions";
 		r = -ENOMEM;
@@ -2268,8 +2303,14 @@ static int __init dm_snapshot_init(void)
 		goto bad_pending_cache;
 	}
 
+	dm_snap_table_header = register_sysctl_table(dm_snap_root_table);
+	if (!dm_snap_table_header)
+		goto bad_sysctl_table;
+
 	return 0;
 
+bad_sysctl_table:
+	kmem_cache_destroy(pending_cache);
 bad_pending_cache:
 	kmem_cache_destroy(exception_cache);
 bad_exception_cache:
@@ -2288,6 +2329,8 @@ bad_register_snapshot_target:
 
 static void __exit dm_snapshot_exit(void)
 {
+	unregister_sysctl_table(dm_snap_table_header);
+
 	dm_unregister_target(&snapshot_target);
 	dm_unregister_target(&origin_target);
 	dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9e39d2b..fdff32f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
+#include <linux/sysctl.h>
 
 #include <trace/events/block.h>
 
@@ -209,9 +210,36 @@ struct dm_md_mempools {
 };
 
 #define MIN_IOS 256
+static int sysctl_dm_min_ios = MIN_IOS;
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+static struct ctl_table_header *dm_table_header;
+static ctl_table dm_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_ctl_table },
+	{ }
+};
+
+static ctl_table dm_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_dir_table },
+	{ }
+};
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -229,16 +257,22 @@ static int __init local_init(void)
 	if (r)
 		goto out_free_rq_tio_cache;
 
+	dm_table_header = register_sysctl_table(dm_root_table);
+	if (!dm_table_header)
+		goto out_uevent_exit;
+
 	_major = major;
 	r = register_blkdev(_major, _name);
 	if (r < 0)
-		goto out_uevent_exit;
+		goto out_unregister_sysctl_table;
 
 	if (!_major)
 		_major = r;
 
 	return 0;
 
+out_unregister_sysctl_table:
+	unregister_sysctl_table(dm_table_header);
 out_uevent_exit:
 	dm_uevent_exit();
 out_free_rq_tio_cache:
@@ -251,6 +285,7 @@ out_free_io_cache:
 
 static void local_exit(void)
 {
+	unregister_sysctl_table(dm_table_header);
 	kmem_cache_destroy(_rq_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
@@ -2806,14 +2841,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = sysctl_dm_min_ios;
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(sysctl_dm_min_ios, cachep);
 	if (!pools->io_pool)
 		goto out;
 

-- 
Frank Mayhar
310-460-4042


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-09 17:48 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
@ 2013-08-13 16:41 ` Frank Mayhar
  2013-08-16 22:58 ` Frank Mayhar
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-13 16:41 UTC (permalink / raw)
  To: linux-kernel

Ping?  Has anyone glanced at this?

On Fri, 2013-08-09 at 10:48 -0700, Frank Mayhar wrote:
> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device.  In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size.  In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant.  Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
> 
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values.  This lets us change the size of these pools
> on the fly, we can reduce the size of the pools and reduce memory
> pressure.
> 
> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero.  Bearing in
> mind that the underlying devices were network-based, we saw essentially
> no performance degradation; if there was any, it was down in the noise.
> One might wonder why these sizes are the way they are; I investigated
> and they've been unchanged since at least 2006.
> 
> Signed-off-by: Frank Mayhar <fmayhar@google.com>


-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-09 17:48 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
  2013-08-13 16:41 ` Frank Mayhar
@ 2013-08-16 22:58 ` Frank Mayhar
  2013-08-17 12:30 ` [dm-devel] " Alasdair G Kergon
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-16 22:58 UTC (permalink / raw)
  To: linux-kernel

Sorry for the repeats, mailer issues (was supposed to go to dm-devel).
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-09 17:48 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
  2013-08-13 16:41 ` Frank Mayhar
  2013-08-16 22:58 ` Frank Mayhar
@ 2013-08-17 12:30 ` Alasdair G Kergon
  2013-08-19 13:40   ` Mike Snitzer
  2013-08-19 14:00   ` Mike Snitzer
  2013-08-20 21:22 ` [dm-devel] [PATCH] " Mikulas Patocka
  4 siblings, 1 reply; 63+ messages in thread
From: Alasdair G Kergon @ 2013-08-17 12:30 UTC (permalink / raw)
  To: Frank Mayhar; +Cc: linux-kernel, dm-devel

On Fri, Aug 16, 2013 at 03:55:21PM -0700, Frank Mayhar wrote:
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values.  This lets us change the size of these pools
> on the fly, we can reduce the size of the pools and reduce memory
> pressure.
 
Did you consider using module_param_named()?
(E.g. dm_verity_prefetch_cluster in dm-verity.)

Alasdair


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-17 12:30 ` [dm-devel] " Alasdair G Kergon
@ 2013-08-19 13:40   ` Mike Snitzer
  2013-08-19 15:04     ` Frank Mayhar
  0 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-08-19 13:40 UTC (permalink / raw)
  To: Frank Mayhar, linux-kernel, dm-devel

On Sat, Aug 17 2013 at  8:30am -0400,
Alasdair G Kergon <agk@redhat.com> wrote:

> On Fri, Aug 16, 2013 at 03:55:21PM -0700, Frank Mayhar wrote:
> > This patch fixes that by changing the hardcoded MIN_IOS (and certain
> > other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> > sysctl-modifiable values.  This lets us change the size of these pools
> > on the fly, we can reduce the size of the pools and reduce memory
> > pressure.
>  
> Did you consider using module_param_named()?
> (E.g. dm_verity_prefetch_cluster in dm-verity.)

Right, I'd prefer to see this be implemented in terms of
module_param_named().  Aside from dm-verity, dm-bufio.c makes the most
use of it.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-09 17:48 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
@ 2013-08-19 14:00   ` Mike Snitzer
  2013-08-16 22:58 ` Frank Mayhar
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-08-19 14:00 UTC (permalink / raw)
  To: Frank Mayhar; +Cc: linux-kernel, dm-devel

On Fri, Aug 16 2013 at  6:55pm -0400,
Frank Mayhar <fmayhar@google.com> wrote:

> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device.  In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size.  In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant.  Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
> 
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values.  This lets us change the size of these pools
> on the fly, we can reduce the size of the pools and reduce memory
> pressure.

These memory reserves are a long-standing issue with DM (made worse when
request-based mpath was introduced).  Two years ago, I assembled a patch
series that took one approach to trying to fix it:
http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/series.html

But in the end I wasn't convinced sharing the memory reserve would allow
for 100s of mpath devices to make forward progress if memory is
depleted.

All said, I think adding the ability to control the size of the memory
reserves is reasonable.  It allows for informed admins to establish
lower reserves (based on the awareness that rq-based mpath doesn't need
to support really large IOs, etc) without compromising the ability to
make forward progress.

But, as mentioned in my previous mail, I'd like to see this implemented
in terms of module_param_named().

> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero.

Bio-based can safely be reduced, as this older (uncommitted) patch did:
http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/0000-dm-lower-bio-based-reservation.patch

> Bearing in mind that the underlying devices were network-based, we saw
> essentially no performance degradation; if there was any, it was down
> in the noise.  One might wonder why these sizes are the way they are;
> I investigated and they've been unchanged since at least 2006.

Performance isn't the concern.  The concern is: does DM allow for
forward progress if the system's memory is completely exhausted?

This is why request-based has such an extensive reserve, because it
needs to account for cloning the largest possible request that comes in
(with multiple bios).

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
@ 2013-08-19 14:00   ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-08-19 14:00 UTC (permalink / raw)
  To: Frank Mayhar; +Cc: dm-devel, linux-kernel

On Fri, Aug 16 2013 at  6:55pm -0400,
Frank Mayhar <fmayhar@google.com> wrote:

> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device.  In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size.  In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant.  Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
> 
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values.  This lets us change the size of these pools
> on the fly, we can reduce the size of the pools and reduce memory
> pressure.

These memory reserves are a long-standing issue with DM (made worse when
request-based mpath was introduced).  Two years ago, I assembled a patch
series that took one approach to trying to fix it:
http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/series.html

But in the end I wasn't convinced sharing the memory reserve would allow
for 100s of mpath devices to make forward progress if memory is
depleted.

All said, I think adding the ability to control the size of the memory
reserves is reasonable.  It allows for informed admins to establish
lower reserves (based on the awareness that rq-based mpath doesn't need
to support really large IOs, etc) without compromising the ability to
make forward progress.

But, as mentioned in my previous mail, I'd like to see this implemented
in terms of module_param_named().

> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero.

Bio-based can safely be reduced, as this older (uncommitted) patch did:
http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/0000-dm-lower-bio-based-reservation.patch

> Bearing in mind that the underlying devices were network-based, we saw
> essentially no performance degradation; if there was any, it was down
> in the noise.  One might wonder why these sizes are the way they are;
> I investigated and they've been unchanged since at least 2006.

Performance isn't the concern.  The concern is: does DM allow for
forward progress if the system's memory is completely exhausted?

This is why request-based has such an extensive reserve, because it
needs to account for cloning the largest possible request that comes in
(with multiple bios).

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-19 13:40   ` Mike Snitzer
@ 2013-08-19 15:04     ` Frank Mayhar
  0 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-19 15:04 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: linux-kernel, dm-devel

On Mon, 2013-08-19 at 09:40 -0400, Mike Snitzer wrote:
> On Sat, Aug 17 2013 at  8:30am -0400,
> Alasdair G Kergon <agk@redhat.com> wrote:
> 
> > On Fri, Aug 16, 2013 at 03:55:21PM -0700, Frank Mayhar wrote:
> > > This patch fixes that by changing the hardcoded MIN_IOS (and certain
> > > other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> > > sysctl-modifiable values.  This lets us change the size of these pools
> > > on the fly, we can reduce the size of the pools and reduce memory
> > > pressure.
> >  
> > Did you consider using module_param_named()?
> > (E.g. dm_verity_prefetch_cluster in dm-verity.)
> 
> Right, I'd prefer to see this be implemented in terms of
> module_param_named().  Aside from dm-verity, dm-bufio.c makes the most
> use of it.

Yeah, I can do that.
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-19 14:00   ` Mike Snitzer
  (?)
@ 2013-08-19 17:54   ` Frank Mayhar
  2013-08-19 18:15     ` Mike Snitzer
  2013-08-20 21:44     ` [dm-devel] " Mikulas Patocka
  -1 siblings, 2 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-19 17:54 UTC (permalink / raw)
  To: device-mapper development; +Cc: linux-kernel

On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> Performance isn't the concern.  The concern is: does DM allow for
> forward progress if the system's memory is completely exhausted?
> 
> This is why request-based has such an extensive reserve, because it
> needs to account for cloning the largest possible request that comes in
> (with multiple bios).

Thanks for the response.  In our particular case, I/O will be file
system based and over a network, which makes it pretty easy for us to be
sure that large I/Os never happen.  That notwithstanding, however, as
you said it just seems reasonable to make these values configurable.

I'm also looking at making some similar constants in dm-verity and
dm-bufio configurable in the same way and for similar reasons.
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-19 17:54   ` [dm-devel] " Frank Mayhar
@ 2013-08-19 18:15     ` Mike Snitzer
  2013-08-20 21:44     ` [dm-devel] " Mikulas Patocka
  1 sibling, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-08-19 18:15 UTC (permalink / raw)
  To: Frank Mayhar; +Cc: device-mapper development, linux-kernel

On Mon, Aug 19 2013 at  1:54pm -0400,
Frank Mayhar <fmayhar@google.com> wrote:

> On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> > Performance isn't the concern.  The concern is: does DM allow for
> > forward progress if the system's memory is completely exhausted?
> > 
> > This is why request-based has such an extensive reserve, because it
> > needs to account for cloning the largest possible request that comes in
> > (with multiple bios).
> 
> Thanks for the response.  In our particular case, I/O will be file
> system based and over a network, which makes it pretty easy for us to be
> sure that large I/Os never happen.  That notwithstanding, however, as
> you said it just seems reasonable to make these values configurable.
> 
> I'm also looking at making some similar constants in dm-verity and
> dm-bufio configurable in the same way and for similar reasons.

OK, would be helpful if you were to split each patch out, e.g. separate
patches for DM core, verity, bufio, etc.  Reserve the background context
to the 0th patch header (or DM core patch).  With more precise patch
headers that document the new tunable that is exposed by each patch.

It would also be nice to see these tunables get documented in the
appropriate Documentation/device-mapper/ file too.

Thanks for working on this.  I'll have time to review/assist these
changes in the near term.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-09 17:48 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
                   ` (3 preceding siblings ...)
  2013-08-19 14:00   ` Mike Snitzer
@ 2013-08-20 21:22 ` Mikulas Patocka
  2013-08-20 21:28   ` Frank Mayhar
  4 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-08-20 21:22 UTC (permalink / raw)
  To: device-mapper development; +Cc: linux-kernel, Frank Mayhar, Mike Snitzer



On Fri, 16 Aug 2013, Frank Mayhar wrote:

> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device.  In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size.  In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant.  Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
> 
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values.  This lets us change the size of these pools
> on the fly, we can reduce the size of the pools and reduce memory
> pressure.
> 
> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero.  Bearing in
> mind that the underlying devices were network-based, we saw essentially
> no performance degradation; if there was any, it was down in the noise.
> One might wonder why these sizes are the way they are; I investigated
> and they've been unchanged since at least 2006.

I think it would be better to set the values to some low value (1 should 
be enough, there is 16 in some places that is already low enough). There 
is no need to make it user-configurable, because the user will see no 
additional benefit from bigger mempools.

Maybe multipath is the exception - but other dm targets don't really need 
big mempools so there is no need to make the size configurable.

Mikulas

> Signed-off-by: Frank Mayhar <fmayhar@google.com>
> --- 
>  drivers/md/dm-crypt.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++----
>  drivers/md/dm-io.c    | 58 +++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++++++-
>  drivers/md/dm-snap.c  | 45 +++++++++++++++++++++++++++++++++++-
>  drivers/md/dm.c       | 41 ++++++++++++++++++++++++++++++---
>  5 files changed, 244 insertions(+), 11 deletions(-)
> diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
> index 6d2d41a..d68f10a 100644
> --- a/drivers/md/dm-crypt.c
> +++ b/drivers/md/dm-crypt.c
> @@ -178,12 +178,55 @@ struct crypt_config {
>  #define MIN_IOS        16
>  #define MIN_POOL_PAGES 32
>  
> +static int sysctl_dm_crypt_min_ios = MIN_IOS;
> +static int sysctl_dm_crypt_min_pool_pages = MIN_POOL_PAGES;
> +
>  static struct kmem_cache *_crypt_io_pool;
>  
>  static void clone_init(struct dm_crypt_io *, struct bio *);
>  static void kcryptd_queue_crypt(struct dm_crypt_io *io);
>  static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
>  
> +static struct ctl_table_header *dm_crypt_table_header;
> +static ctl_table dm_crypt_ctl_table[] = {
> +	{
> +		.procname	= "min_ios",
> +		.data		= &sysctl_dm_crypt_min_ios,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "min_pool_pages",
> +		.data		= &sysctl_dm_crypt_min_pool_pages,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{ }
> +};
> +
> +static ctl_table dm_crypt_dir_table[] = {
> +	{ .procname	= "crypt",
> +	  .mode		= 0555,
> +	  .child	= dm_crypt_ctl_table },
> +	{ }
> +};
> +
> +static ctl_table dm_crypt_dm_dir_table[] = {
> +	{ .procname	= "dm",
> +	  .mode		= 0555,
> +	  .child	= dm_crypt_dir_table },
> +	{ }
> +};
> +
> +static ctl_table dm_crypt_root_table[] = {
> +	{ .procname	= "dev",
> +	  .mode		= 0555,
> +	  .child	= dm_crypt_dm_dir_table },
> +	{ }
> +};
> +
>  static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
>  {
>  	return this_cpu_ptr(cc->cpu);
> @@ -1571,7 +1614,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		goto bad;
>  
>  	ret = -ENOMEM;
> -	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
> +	cc->io_pool = mempool_create_slab_pool(sysctl_dm_crypt_min_ios,
> +					       _crypt_io_pool);
>  	if (!cc->io_pool) {
>  		ti->error = "Cannot allocate crypt io mempool";
>  		goto bad;
> @@ -1583,20 +1627,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
>  			   ~(crypto_tfm_ctx_alignment() - 1);
>  
> -	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
> +	cc->req_pool = mempool_create_kmalloc_pool(sysctl_dm_crypt_min_ios,
> +			cc->dmreq_start +
>  			sizeof(struct dm_crypt_request) + cc->iv_size);
>  	if (!cc->req_pool) {
>  		ti->error = "Cannot allocate crypt request mempool";
>  		goto bad;
>  	}
>  
> -	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
> +	cc->page_pool = mempool_create_page_pool(sysctl_dm_crypt_min_pool_pages,
> +						 0);
>  	if (!cc->page_pool) {
>  		ti->error = "Cannot allocate page mempool";
>  		goto bad;
>  	}
>  
> -	cc->bs = bioset_create(MIN_IOS, 0);
> +	cc->bs = bioset_create(sysctl_dm_crypt_min_ios, 0);
>  	if (!cc->bs) {
>  		ti->error = "Cannot allocate crypt bioset";
>  		goto bad;
> @@ -1851,11 +1897,20 @@ static int __init dm_crypt_init(void)
>  		kmem_cache_destroy(_crypt_io_pool);
>  	}
>  
> +	dm_crypt_table_header = register_sysctl_table(dm_crypt_root_table);
> +	if (!dm_crypt_table_header) {
> +		DMERR("failed to register sysctl");
> +		dm_unregister_target(&crypt_target);
> +		kmem_cache_destroy(_crypt_io_pool);
> +		return -ENOMEM;
> +	}
> +
>  	return r;
>  }
>  
>  static void __exit dm_crypt_exit(void)
>  {
> +	unregister_sysctl_table(dm_crypt_table_header);
>  	dm_unregister_target(&crypt_target);
>  	kmem_cache_destroy(_crypt_io_pool);
>  }
> diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
> index ea49834..8c45c60 100644
> --- a/drivers/md/dm-io.c
> +++ b/drivers/md/dm-io.c
> @@ -22,6 +22,9 @@
>  #define MIN_IOS		16
>  #define MIN_BIOS	16
>  
> +static int sysctl_dm_io_min_ios = MIN_IOS;
> +static int sysctl_dm_io_min_bios = MIN_BIOS;
> +
>  struct dm_io_client {
>  	mempool_t *pool;
>  	struct bio_set *bios;
> @@ -44,6 +47,46 @@ struct io {
>  
>  static struct kmem_cache *_dm_io_cache;
>  
> +static struct ctl_table_header *dm_io_table_header;
> +static ctl_table dm_io_ctl_table[] = {
> +	{
> +		.procname	= "min_ios",
> +		.data		= &sysctl_dm_io_min_ios,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "min_bios",
> +		.data		= &sysctl_dm_io_min_bios,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{ }
> +};
> +
> +static ctl_table dm_io_dir_table[] = {
> +	{ .procname	= "io",
> +	  .mode		= 0555,
> +	  .child	= dm_io_ctl_table },
> +	{ }
> +};
> +
> +static ctl_table dm_io_dm_dir_table[] = {
> +	{ .procname	= "dm",
> +	  .mode		= 0555,
> +	  .child	= dm_io_dir_table },
> +	{ }
> +};
> +
> +static ctl_table dm_io_root_table[] = {
> +	{ .procname	= "dev",
> +	  .mode		= 0555,
> +	  .child	= dm_io_dm_dir_table },
> +	{ }
> +};
> +
>  /*
>   * Create a client with mempool and bioset.
>   */
> @@ -55,11 +98,12 @@ struct dm_io_client *dm_io_client_create(void)
>  	if (!client)
>  		return ERR_PTR(-ENOMEM);
>  
> -	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
> +	client->pool = mempool_create_slab_pool(sysctl_dm_io_min_ios,
> +						_dm_io_cache);
>  	if (!client->pool)
>  		goto bad;
>  
> -	client->bios = bioset_create(MIN_BIOS, 0);
> +	client->bios = bioset_create(sysctl_dm_io_min_bios, 0);
>  	if (!client->bios)
>  		goto bad;
>  
> @@ -515,11 +559,21 @@ int __init dm_io_init(void)
>  	if (!_dm_io_cache)
>  		return -ENOMEM;
>  
> +	dm_io_table_header = register_sysctl_table(dm_io_root_table);
> +	if (!dm_io_table_header) {
> +		DMERR("failed to register sysctl");
> +		kmem_cache_destroy(_dm_io_cache);
> +		_dm_io_cache = NULL;
> +		return -ENOMEM;
> +	}
> +
> +
>  	return 0;
>  }
>  
>  void dm_io_exit(void)
>  {
> +	unregister_sysctl_table(dm_io_table_header);
>  	kmem_cache_destroy(_dm_io_cache);
>  	_dm_io_cache = NULL;
>  }
> diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
> index 5adede1..099af1b 100644
> --- a/drivers/md/dm-mpath.c
> +++ b/drivers/md/dm-mpath.c
> @@ -118,6 +118,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);
>  
>  #define MIN_IOS 256	/* Mempool size */
>  
> +static int sysctl_dm_mpath_min_ios = MIN_IOS;
> +
>  static struct kmem_cache *_mpio_cache;
>  
>  static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
> @@ -125,6 +127,38 @@ static void process_queued_ios(struct work_struct *work);
>  static void trigger_event(struct work_struct *work);
>  static void activate_path(struct work_struct *work);
>  
> +static struct ctl_table_header *dm_mpath_table_header;
> +static ctl_table dm_mpath_ctl_table[] = {
> +	{
> +		.procname	= "min_ios",
> +		.data		= &sysctl_dm_mpath_min_ios,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{ }
> +};
> +
> +static ctl_table dm_mpath_dir_table[] = {
> +	{ .procname	= "mpath",
> +	  .mode		= 0555,
> +	  .child	= dm_mpath_ctl_table },
> +	{ }
> +};
> +
> +static ctl_table dm_mpath_dm_dir_table[] = {
> +	{ .procname	= "dm",
> +	  .mode		= 0555,
> +	  .child	= dm_mpath_dir_table },
> +	{ }
> +};
> +
> +static ctl_table dm_mpath_root_table[] = {
> +	{ .procname	= "dev",
> +	  .mode		= 0555,
> +	  .child	= dm_mpath_dm_dir_table },
> +	{ }
> +};
>  
>  /*-----------------------------------------------
>   * Allocation routines
> @@ -202,7 +236,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
>  		INIT_WORK(&m->trigger_event, trigger_event);
>  		init_waitqueue_head(&m->pg_init_wait);
>  		mutex_init(&m->work_mutex);
> -		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
> +		m->mpio_pool = mempool_create_slab_pool(sysctl_dm_mpath_min_ios,
> +							_mpio_cache);
>  		if (!m->mpio_pool) {
>  			kfree(m);
>  			return NULL;
> @@ -1745,6 +1780,15 @@ static int __init dm_multipath_init(void)
>  		kmem_cache_destroy(_mpio_cache);
>  		return -ENOMEM;
>  	}
> +	dm_mpath_table_header = register_sysctl_table(dm_mpath_root_table);
> +	if (!dm_mpath_table_header) {
> +		DMERR("failed to register sysctl");
> +		destroy_workqueue(kmpath_handlerd);
> +		destroy_workqueue(kmultipathd);
> +		dm_unregister_target(&multipath_target);
> +		kmem_cache_destroy(_mpio_cache);
> +		return -ENOMEM;
> +	}
>  
>  	DMINFO("version %u.%u.%u loaded",
>  	       multipath_target.version[0], multipath_target.version[1],
> @@ -1755,6 +1799,8 @@ static int __init dm_multipath_init(void)
>  
>  static void __exit dm_multipath_exit(void)
>  {
> +	unregister_sysctl_table(dm_mpath_table_header);
> +
>  	destroy_workqueue(kmpath_handlerd);
>  	destroy_workqueue(kmultipathd);
>  
> diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
> index c434e5a..723b3c2 100644
> --- a/drivers/md/dm-snap.c
> +++ b/drivers/md/dm-snap.c
> @@ -33,11 +33,45 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
>   * The size of the mempool used to track chunks in use.
>   */
>  #define MIN_IOS 256
> +static int sysctl_dm_snap_min_ios = MIN_IOS;
>  
>  #define DM_TRACKED_CHUNK_HASH_SIZE	16
>  #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
>  					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
>  
> +static struct ctl_table_header *dm_snap_table_header;
> +static ctl_table dm_snap_ctl_table[] = {
> +	{
> +		.procname	= "min_ios",
> +		.data		= &sysctl_dm_snap_min_ios,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{ }
> +};
> +
> +static ctl_table dm_snap_dir_table[] = {
> +	{ .procname	= "snap",
> +	  .mode		= 0555,
> +	  .child	= dm_snap_ctl_table },
> +	{ }
> +};
> +
> +static ctl_table dm_snap_dm_dir_table[] = {
> +	{ .procname	= "dm",
> +	  .mode		= 0555,
> +	  .child	= dm_snap_dir_table },
> +	{ }
> +};
> +
> +static ctl_table dm_snap_root_table[] = {
> +	{ .procname	= "dev",
> +	  .mode		= 0555,
> +	  .child	= dm_snap_dm_dir_table },
> +	{ }
> +};
> +
>  struct dm_exception_table {
>  	uint32_t hash_mask;
>  	unsigned hash_shift;
> @@ -1118,7 +1152,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		goto bad_kcopyd;
>  	}
>  
> -	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
> +	s->pending_pool = mempool_create_slab_pool(sysctl_dm_snap_min_ios,
> +											   pending_cache);
>  	if (!s->pending_pool) {
>  		ti->error = "Could not allocate mempool for pending exceptions";
>  		r = -ENOMEM;
> @@ -2268,8 +2303,14 @@ static int __init dm_snapshot_init(void)
>  		goto bad_pending_cache;
>  	}
>  
> +	dm_snap_table_header = register_sysctl_table(dm_snap_root_table);
> +	if (!dm_snap_table_header)
> +		goto bad_sysctl_table;
> +
>  	return 0;
>  
> +bad_sysctl_table:
> +	kmem_cache_destroy(pending_cache);
>  bad_pending_cache:
>  	kmem_cache_destroy(exception_cache);
>  bad_exception_cache:
> @@ -2288,6 +2329,8 @@ bad_register_snapshot_target:
>  
>  static void __exit dm_snapshot_exit(void)
>  {
> +	unregister_sysctl_table(dm_snap_table_header);
> +
>  	dm_unregister_target(&snapshot_target);
>  	dm_unregister_target(&origin_target);
>  	dm_unregister_target(&merge_target);
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 9e39d2b..fdff32f 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -19,6 +19,7 @@
>  #include <linux/idr.h>
>  #include <linux/hdreg.h>
>  #include <linux/delay.h>
> +#include <linux/sysctl.h>
>  
>  #include <trace/events/block.h>
>  
> @@ -209,9 +210,36 @@ struct dm_md_mempools {
>  };
>  
>  #define MIN_IOS 256
> +static int sysctl_dm_min_ios = MIN_IOS;
>  static struct kmem_cache *_io_cache;
>  static struct kmem_cache *_rq_tio_cache;
>  
> +static struct ctl_table_header *dm_table_header;
> +static ctl_table dm_ctl_table[] = {
> +	{
> +		.procname	= "min_ios",
> +		.data		= &sysctl_dm_min_ios,
> +		.maxlen		= sizeof(int),
> +		.mode		= S_IRUGO|S_IWUSR,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{ }
> +};
> +
> +static ctl_table dm_dir_table[] = {
> +	{ .procname	= "dm",
> +	  .mode		= 0555,
> +	  .child	= dm_ctl_table },
> +	{ }
> +};
> +
> +static ctl_table dm_root_table[] = {
> +	{ .procname	= "dev",
> +	  .mode		= 0555,
> +	  .child	= dm_dir_table },
> +	{ }
> +};
> +
>  static int __init local_init(void)
>  {
>  	int r = -ENOMEM;
> @@ -229,16 +257,22 @@ static int __init local_init(void)
>  	if (r)
>  		goto out_free_rq_tio_cache;
>  
> +	dm_table_header = register_sysctl_table(dm_root_table);
> +	if (!dm_table_header)
> +		goto out_uevent_exit;
> +
>  	_major = major;
>  	r = register_blkdev(_major, _name);
>  	if (r < 0)
> -		goto out_uevent_exit;
> +		goto out_unregister_sysctl_table;
>  
>  	if (!_major)
>  		_major = r;
>  
>  	return 0;
>  
> +out_unregister_sysctl_table:
> +	unregister_sysctl_table(dm_table_header);
>  out_uevent_exit:
>  	dm_uevent_exit();
>  out_free_rq_tio_cache:
> @@ -251,6 +285,7 @@ out_free_io_cache:
>  
>  static void local_exit(void)
>  {
> +	unregister_sysctl_table(dm_table_header);
>  	kmem_cache_destroy(_rq_tio_cache);
>  	kmem_cache_destroy(_io_cache);
>  	unregister_blkdev(_major, _name);
> @@ -2806,14 +2841,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
>  		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
>  	} else if (type == DM_TYPE_REQUEST_BASED) {
>  		cachep = _rq_tio_cache;
> -		pool_size = MIN_IOS;
> +		pool_size = sysctl_dm_min_ios;
>  		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
>  		/* per_bio_data_size is not used. See __bind_mempools(). */
>  		WARN_ON(per_bio_data_size != 0);
>  	} else
>  		goto out;
>  
> -	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
> +	pools->io_pool = mempool_create_slab_pool(sysctl_dm_min_ios, cachep);
>  	if (!pools->io_pool)
>  		goto out;
>  
> 
> -- 
> Frank Mayhar
> 310-460-4042
> 
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 21:22 ` [dm-devel] [PATCH] " Mikulas Patocka
@ 2013-08-20 21:28   ` Frank Mayhar
  2013-08-20 21:47     ` Mikulas Patocka
  0 siblings, 1 reply; 63+ messages in thread
From: Frank Mayhar @ 2013-08-20 21:28 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, linux-kernel, Mike Snitzer

On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > The device mapper and some of its modules allocate memory pools at
> > various points when setting up a device.  In some cases, these pools are
> > fairly large, for example the multipath module allocates a 256-entry
> > pool and the dm itself allocates three of that size.  In a
> > memory-constrained environment where we're creating a lot of these
> > devices, the memory use can quickly become significant.  Unfortunately,
> > there's currently no way to change the size of the pools other than by
> > changing a constant and rebuilding the kernel.
> I think it would be better to set the values to some low value (1 should 
> be enough, there is 16 in some places that is already low enough). There 
> is no need to make it user-configurable, because the user will see no 
> additional benefit from bigger mempools.
> 
> Maybe multipath is the exception - but other dm targets don't really need 
> big mempools so there is no need to make the size configurable.

The point is not to make the mempools bigger, the point is to be able to
make them _smaller_ for memory-constrained environments.  In some cases,
even 16 can be too big, especially when creating a large number of
devices.

In any event, it seems unfortunate to me that these values are
hard-coded.  One shouldn't have to rebuild the kernel to change them,
IMHO.
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-19 14:00   ` Mike Snitzer
  (?)
  (?)
@ 2013-08-20 21:41   ` Mikulas Patocka
  -1 siblings, 0 replies; 63+ messages in thread
From: Mikulas Patocka @ 2013-08-20 21:41 UTC (permalink / raw)
  To: device-mapper development; +Cc: Frank Mayhar, linux-kernel



On Mon, 19 Aug 2013, Mike Snitzer wrote:

> On Fri, Aug 16 2013 at  6:55pm -0400,
> Frank Mayhar <fmayhar@google.com> wrote:
> 
> > The device mapper and some of its modules allocate memory pools at
> > various points when setting up a device.  In some cases, these pools are
> > fairly large, for example the multipath module allocates a 256-entry
> > pool and the dm itself allocates three of that size.  In a
> > memory-constrained environment where we're creating a lot of these
> > devices, the memory use can quickly become significant.  Unfortunately,
> > there's currently no way to change the size of the pools other than by
> > changing a constant and rebuilding the kernel.
> > 
> > This patch fixes that by changing the hardcoded MIN_IOS (and certain
> > other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> > sysctl-modifiable values.  This lets us change the size of these pools
> > on the fly, we can reduce the size of the pools and reduce memory
> > pressure.
> 
> These memory reserves are a long-standing issue with DM (made worse when
> request-based mpath was introduced).  Two years ago, I assembled a patch
> series that took one approach to trying to fix it:
> http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/series.html
> 
> But in the end I wasn't convinced sharing the memory reserve would allow
> for 100s of mpath devices to make forward progress if memory is
> depleted.
> 
> All said, I think adding the ability to control the size of the memory
> reserves is reasonable.  It allows for informed admins to establish
> lower reserves (based on the awareness that rq-based mpath doesn't need
> to support really large IOs, etc) without compromising the ability to
> make forward progress.
> 
> But, as mentioned in my previous mail, I'd like to see this implemented
> in terms of module_param_named().
> 
> > We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> > and dm-mpath, from a value of 32 all the way down to zero.
> 
> Bio-based can safely be reduced, as this older (uncommitted) patch did:
> http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/0000-dm-lower-bio-based-reservation.patch
> 
> > Bearing in mind that the underlying devices were network-based, we saw
> > essentially no performance degradation; if there was any, it was down
> > in the noise.  One might wonder why these sizes are the way they are;
> > I investigated and they've been unchanged since at least 2006.
> 
> Performance isn't the concern.  The concern is: does DM allow for
> forward progress if the system's memory is completely exhausted?

There is one possible deadlock that was introduced in commit 
d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 in 2.6.22-rc1. Unfortunately, no 
one found that bug at that time and now it seems to be hard to revert 
that.

The problem is this:

* we send bio1 to the device dm-1, device mapper splits it to bio2 and 
bio3 and sends both of them to the device dm-2. These two bios are added 
to current->bio_list.

* bio2 is popped off current->bio_list, a mempool entry from device dm-2's 
mempool is allocated, bio4 is created and sent to the device dm-3. bio4 is 
added to the end of current->bio_list.

* bio3 is popped off current->bio_list, a mempool entry from device dm-2's 
mempool is allocated. Suppose that the mempool is exhausted, so we wait 
until some existing work (bio2) finishes and returns the entry to the 
mempool.

So: bio3's request routine waits until bio2 finishes and refills the 
mempool. bio2 is waiting for bio4 to finish. bio4 is in current->bio_list 
and is waiting until bio3's request routine finishes. Deadlock.

In practice, it is not so serious because in mempool_alloc there is:
/*
 * FIXME: this should be io_schedule().  The timeout is there as a
 * workaround for some DM problems in 2.6.18.
 */
io_schedule_timeout(5*HZ);

- so it waits for 5 seconds and retries. If there is something in the 
system that is able to free memory, it resumes.

> This is why request-based has such an extensive reserve, because it
> needs to account for cloning the largest possible request that comes in
> (with multiple bios).

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-19 17:54   ` [dm-devel] " Frank Mayhar
  2013-08-19 18:15     ` Mike Snitzer
@ 2013-08-20 21:44     ` Mikulas Patocka
  2013-08-20 21:52       ` Frank Mayhar
  1 sibling, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-08-20 21:44 UTC (permalink / raw)
  To: device-mapper development; +Cc: linux-kernel, Frank Mayhar



On Mon, 19 Aug 2013, Frank Mayhar wrote:

> On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> > Performance isn't the concern.  The concern is: does DM allow for
> > forward progress if the system's memory is completely exhausted?
> > 
> > This is why request-based has such an extensive reserve, because it
> > needs to account for cloning the largest possible request that comes in
> > (with multiple bios).
> 
> Thanks for the response.  In our particular case, I/O will be file
> system based and over a network, which makes it pretty easy for us to be
> sure that large I/Os never happen.  That notwithstanding, however, as
> you said it just seems reasonable to make these values configurable.
> 
> I'm also looking at making some similar constants in dm-verity and
> dm-bufio configurable in the same way and for similar reasons.

Regarding dm-bufio: the user of dm-bufio sets the pool size as an argument 
in dm_bufio_client_create. There is no need to make it configurable - if 
the user selects too low value, deadlock is possible, if the user selects 
too high value, there is no additional advantage.

Regarding dm-verity: the mempool size is 4, there is no need to make it 
bigger, there is no advantage from that.

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 21:28   ` Frank Mayhar
@ 2013-08-20 21:47     ` Mikulas Patocka
  2013-08-20 21:57       ` Frank Mayhar
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-08-20 21:47 UTC (permalink / raw)
  To: device-mapper development, Frank Mayhar; +Cc: Mike Snitzer, linux-kernel



On Tue, 20 Aug 2013, Frank Mayhar wrote:

> On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> > On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > > The device mapper and some of its modules allocate memory pools at
> > > various points when setting up a device.  In some cases, these pools are
> > > fairly large, for example the multipath module allocates a 256-entry
> > > pool and the dm itself allocates three of that size.  In a
> > > memory-constrained environment where we're creating a lot of these
> > > devices, the memory use can quickly become significant.  Unfortunately,
> > > there's currently no way to change the size of the pools other than by
> > > changing a constant and rebuilding the kernel.
> > I think it would be better to set the values to some low value (1 should 
> > be enough, there is 16 in some places that is already low enough). There 
> > is no need to make it user-configurable, because the user will see no 
> > additional benefit from bigger mempools.
> > 
> > Maybe multipath is the exception - but other dm targets don't really need 
> > big mempools so there is no need to make the size configurable.
> 
> The point is not to make the mempools bigger, the point is to be able to
> make them _smaller_ for memory-constrained environments.  In some cases,
> even 16 can be too big, especially when creating a large number of
> devices.
> 
> In any event, it seems unfortunate to me that these values are
> hard-coded.  One shouldn't have to rebuild the kernel to change them,
> IMHO.

So make a patch that changes 16 to 1 if it helps on your system (I'd like 
to ask - what is the configuration where this kind of change helps?). 
There is no need for that to be tunable, anyone can live with mempool size 
being 1.

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 21:44     ` [dm-devel] " Mikulas Patocka
@ 2013-08-20 21:52       ` Frank Mayhar
  0 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-20 21:52 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, linux-kernel

On Tue, 2013-08-20 at 17:44 -0400, Mikulas Patocka wrote:
> On Mon, 19 Aug 2013, Frank Mayhar wrote:
> > On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> > > Performance isn't the concern.  The concern is: does DM allow for
> > > forward progress if the system's memory is completely exhausted?
> > > 
> > > This is why request-based has such an extensive reserve, because it
> > > needs to account for cloning the largest possible request that comes in
> > > (with multiple bios).
> > 
> > Thanks for the response.  In our particular case, I/O will be file
> > system based and over a network, which makes it pretty easy for us to be
> > sure that large I/Os never happen.  That notwithstanding, however, as
> > you said it just seems reasonable to make these values configurable.
> > 
> > I'm also looking at making some similar constants in dm-verity and
> > dm-bufio configurable in the same way and for similar reasons.
> 
> Regarding dm-bufio: the user of dm-bufio sets the pool size as an argument 
> in dm_bufio_client_create. There is no need to make it configurable - if 
> the user selects too low value, deadlock is possible, if the user selects 
> too high value, there is no additional advantage.

True, but dm-bufio also allocates a fixed-size 8MB (on a 64-bit
machine) hash table.  I'm still getting performance data but it appears
that reducing this, even by a lot, doesn't impact performance
significantly, at least not with the workload I'm running.  (Which is
using fio, random and sequential reads of varying buffer sizes.)

> Regarding dm-verity: the mempool size is 4, there is no need to make it 
> bigger, there is no advantage from that.

Also true, but there may be an advantage in making it smaller.
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 21:47     ` Mikulas Patocka
@ 2013-08-20 21:57       ` Frank Mayhar
  2013-08-20 22:24         ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Frank Mayhar @ 2013-08-20 21:57 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, Mike Snitzer, linux-kernel

On Tue, 2013-08-20 at 17:47 -0400, Mikulas Patocka wrote:
> 
> On Tue, 20 Aug 2013, Frank Mayhar wrote:
> 
> > On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> > > On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > > > The device mapper and some of its modules allocate memory pools at
> > > > various points when setting up a device.  In some cases, these pools are
> > > > fairly large, for example the multipath module allocates a 256-entry
> > > > pool and the dm itself allocates three of that size.  In a
> > > > memory-constrained environment where we're creating a lot of these
> > > > devices, the memory use can quickly become significant.  Unfortunately,
> > > > there's currently no way to change the size of the pools other than by
> > > > changing a constant and rebuilding the kernel.
> > > I think it would be better to set the values to some low value (1 should 
> > > be enough, there is 16 in some places that is already low enough). There 
> > > is no need to make it user-configurable, because the user will see no 
> > > additional benefit from bigger mempools.
> > > 
> > > Maybe multipath is the exception - but other dm targets don't really need 
> > > big mempools so there is no need to make the size configurable.
> > 
> > The point is not to make the mempools bigger, the point is to be able to
> > make them _smaller_ for memory-constrained environments.  In some cases,
> > even 16 can be too big, especially when creating a large number of
> > devices.
> > 
> > In any event, it seems unfortunate to me that these values are
> > hard-coded.  One shouldn't have to rebuild the kernel to change them,
> > IMHO.
> 
> So make a patch that changes 16 to 1 if it helps on your system (I'd like 
> to ask - what is the configuration where this kind of change helps?). 
> There is no need for that to be tunable, anyone can live with mempool size 
> being 1.

I reiterate:  It seems unfortunate that these values are hard-coded.  It
seems to me that a sysadmin should be able to tune them according to his
or her needs.  Particularly when the same kernel is being run on many
machines that may have different constraints; building a special kernel
for each set of constraints doesn't scale.

And as I said, it's a memory-constrained environment.
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 21:57       ` Frank Mayhar
@ 2013-08-20 22:24         ` Mike Snitzer
  2013-08-20 22:52           ` Mikulas Patocka
  2013-08-20 23:14           ` Frank Mayhar
  0 siblings, 2 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-08-20 22:24 UTC (permalink / raw)
  To: Frank Mayhar; +Cc: Mikulas Patocka, device-mapper development, linux-kernel

On Tue, Aug 20 2013 at  5:57pm -0400,
Frank Mayhar <fmayhar@google.com> wrote:

> On Tue, 2013-08-20 at 17:47 -0400, Mikulas Patocka wrote:
> > 
> > On Tue, 20 Aug 2013, Frank Mayhar wrote:
> > 
> > > On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> > > > On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > > > > The device mapper and some of its modules allocate memory pools at
> > > > > various points when setting up a device.  In some cases, these pools are
> > > > > fairly large, for example the multipath module allocates a 256-entry
> > > > > pool and the dm itself allocates three of that size.  In a
> > > > > memory-constrained environment where we're creating a lot of these
> > > > > devices, the memory use can quickly become significant.  Unfortunately,
> > > > > there's currently no way to change the size of the pools other than by
> > > > > changing a constant and rebuilding the kernel.
> > > > I think it would be better to set the values to some low value (1 should 
> > > > be enough, there is 16 in some places that is already low enough). There 
> > > > is no need to make it user-configurable, because the user will see no 
> > > > additional benefit from bigger mempools.
> > > > 
> > > > Maybe multipath is the exception - but other dm targets don't really need 
> > > > big mempools so there is no need to make the size configurable.
> > > 
> > > The point is not to make the mempools bigger, the point is to be able to
> > > make them _smaller_ for memory-constrained environments.  In some cases,
> > > even 16 can be too big, especially when creating a large number of
> > > devices.
> > > 
> > > In any event, it seems unfortunate to me that these values are
> > > hard-coded.  One shouldn't have to rebuild the kernel to change them,
> > > IMHO.
> > 
> > So make a patch that changes 16 to 1 if it helps on your system (I'd like 
> > to ask - what is the configuration where this kind of change helps?). 
> > There is no need for that to be tunable, anyone can live with mempool size 
> > being 1.
> 
> I reiterate:  It seems unfortunate that these values are hard-coded.  It
> seems to me that a sysadmin should be able to tune them according to his
> or her needs.  Particularly when the same kernel is being run on many
> machines that may have different constraints; building a special kernel
> for each set of constraints doesn't scale.
> 
> And as I said, it's a memory-constrained environment.

Mikulas' point is that you cannot reduce the size to smaller than 1.
And aside from rq-based DM, 1 is sufficient to allow for forward
progress even when memory is completely consumed.

A patch that simply changes them to 1 but makes the rq-based DM
mempool's size configurable should actually be fine.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 22:24         ` Mike Snitzer
@ 2013-08-20 22:52           ` Mikulas Patocka
  2013-08-20 23:14           ` Frank Mayhar
  1 sibling, 0 replies; 63+ messages in thread
From: Mikulas Patocka @ 2013-08-20 22:52 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: Frank Mayhar, device-mapper development, linux-kernel



On Tue, 20 Aug 2013, Mike Snitzer wrote:

> Mikulas' point is that you cannot reduce the size to smaller than 1.
> And aside from rq-based DM, 1 is sufficient to allow for forward
> progress even when memory is completely consumed.
> 
> A patch that simply changes them to 1 but makes the rq-based DM
> mempool's size configurable should actually be fine.

Yes, I think it would be viable.

Mikulas

> Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 22:24         ` Mike Snitzer
  2013-08-20 22:52           ` Mikulas Patocka
@ 2013-08-20 23:14           ` Frank Mayhar
  2013-08-22 17:26             ` Frank Mayhar
  2013-08-26 14:28             ` Mikulas Patocka
  1 sibling, 2 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-20 23:14 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: Mikulas Patocka, device-mapper development, linux-kernel

On Tue, 2013-08-20 at 18:24 -0400, Mike Snitzer wrote:
> Mikulas' point is that you cannot reduce the size to smaller than 1.
> And aside from rq-based DM, 1 is sufficient to allow for forward
> progress even when memory is completely consumed.
> 
> A patch that simply changes them to 1 but makes the rq-based DM
> mempool's size configurable should actually be fine.

So you're saying that I should submit a patch to drop the pool size for
BIO_BASED to 1 and make the pool size for REQUEST_BASED configurable?
At the moment, in dm.c the former is hardcoded to 16 and the latter is
set via MIN_IOS (currently 256).  There's also io_pool, a slab pool,
which is also set via MIN_IOS.

How does this relate to the rest of the DM modules?  Mpath also sets
MIN_IOS to 256 and creates a slab pool from that, and there are a number
of hardcoded constants in dm-io (MIN_IOS and MIN_BIOS), dm-snap
(MIN_IOS), dm-crypt (MIN_IOS and MIN_POOL_PAGES), dm-bufio
(DM_BUFIO_HASH_BITS, which is allocated via vmalloc per client) and
dm-verity (DM_VERITY_MEMPOOL_SIZE, which is allocated per device).

For the most part I can't imagine that people will want to change these
from their defaults, but when someone does need to change one of these,
they need to do so badly and there's currently no good way to do that
besides hacking the source and building a new kernel.

By the way, I do appreciate the advice.  I'm just trying to clear up
confusion on my part, make sure that our needs are met and, while I'm at
it, make things a bit better for those who come after me.
-- 
Frank Mayhar
310-460-4042


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 23:14           ` Frank Mayhar
@ 2013-08-22 17:26             ` Frank Mayhar
  2013-08-26 14:28             ` Mikulas Patocka
  1 sibling, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-22 17:26 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: device-mapper development, Mikulas Patocka

On Tue, 2013-08-20 at 16:14 -0700, Frank Mayhar wrote:
> On Tue, 2013-08-20 at 18:24 -0400, Mike Snitzer wrote:
> > Mikulas' point is that you cannot reduce the size to smaller than 1.
> > And aside from rq-based DM, 1 is sufficient to allow for forward
> > progress even when memory is completely consumed.
> > 
> > A patch that simply changes them to 1 but makes the rq-based DM
> > mempool's size configurable should actually be fine.
> 
> So you're saying that I should submit a patch to drop the pool size for
> BIO_BASED to 1 and make the pool size for REQUEST_BASED configurable?
> At the moment, in dm.c the former is hardcoded to 16 and the latter is
> set via MIN_IOS (currently 256).  There's also io_pool, a slab pool,
> which is also set via MIN_IOS.
> 
> How does this relate to the rest of the DM modules?  Mpath also sets
> MIN_IOS to 256 and creates a slab pool from that, and there are a number
> of hardcoded constants in dm-io (MIN_IOS and MIN_BIOS), dm-snap
> (MIN_IOS), dm-crypt (MIN_IOS and MIN_POOL_PAGES), dm-bufio
> (DM_BUFIO_HASH_BITS, which is allocated via vmalloc per client) and
> dm-verity (DM_VERITY_MEMPOOL_SIZE, which is allocated per device).
> 
> For the most part I can't imagine that people will want to change these
> from their defaults, but when someone does need to change one of these,
> they need to do so badly and there's currently no good way to do that
> besides hacking the source and building a new kernel.
> 
> By the way, I do appreciate the advice.  I'm just trying to clear up
> confusion on my part, make sure that our needs are met and, while I'm at
> it, make things a bit better for those who come after me.

Ping?  Can one of you guys answer my question, here?  Thanks!
-- 
Frank Mayhar
310-460-4042

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: dm: Make MIN_IOS, et al, tunable via sysctl.
  2013-08-20 23:14           ` Frank Mayhar
  2013-08-22 17:26             ` Frank Mayhar
@ 2013-08-26 14:28             ` Mikulas Patocka
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  1 sibling, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-08-26 14:28 UTC (permalink / raw)
  To: Frank Mayhar; +Cc: Mike Snitzer, device-mapper development, linux-kernel



On Tue, 20 Aug 2013, Frank Mayhar wrote:

> On Tue, 2013-08-20 at 18:24 -0400, Mike Snitzer wrote:
> > Mikulas' point is that you cannot reduce the size to smaller than 1.
> > And aside from rq-based DM, 1 is sufficient to allow for forward
> > progress even when memory is completely consumed.
> > 
> > A patch that simply changes them to 1 but makes the rq-based DM
> > mempool's size configurable should actually be fine.
> 
> So you're saying that I should submit a patch to drop the pool size for
> BIO_BASED to 1 and make the pool size for REQUEST_BASED configurable?
> At the moment, in dm.c the former is hardcoded to 16 and the latter is
> set via MIN_IOS (currently 256).  There's also io_pool, a slab pool,
> which is also set via MIN_IOS.

In case of request-based io, yes, submit a patch that makes it 
configurable.

Regarding bio-based io - there is:
pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep); allocating a 
pool of 256 entries. I think it could be reduced to 16 for bio based 
devices, like other pools.

As for reducing 16 further down to 1 - do you have a setup where it really 
helps? Please describe your system - the amount of memory, the number of 
dm devices, and how much memory is saved by reducing the mempools from 16 
to 1.

> How does this relate to the rest of the DM modules?  Mpath also sets
> MIN_IOS to 256 and creates a slab pool from that, and there are a number
> of hardcoded constants in dm-io (MIN_IOS and MIN_BIOS), dm-snap
> (MIN_IOS),

In dm-snap you shouldn't reduce it because it can cause failure.

> dm-crypt (MIN_IOS and MIN_POOL_PAGES), dm-bufio
> (DM_BUFIO_HASH_BITS, which is allocated via vmalloc per client) and

dm-bufio hash could be reduced - one possibility is to auto-tune it based 
on device size, another possibility is to allocate shared hash table for 
all devices.

> dm-verity (DM_VERITY_MEMPOOL_SIZE, which is allocated per device).
> 
> For the most part I can't imagine that people will want to change these
> from their defaults, but when someone does need to change one of these,
> they need to do so badly and there's currently no good way to do that
> besides hacking the source and building a new kernel.
> 
> By the way, I do appreciate the advice.  I'm just trying to clear up
> confusion on my part, make sure that our needs are met and, while I'm at
> it, make things a bit better for those who come after me.
> -- 
> Frank Mayhar
> 310-460-4042

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned
  2013-08-26 14:28             ` Mikulas Patocka
@ 2013-09-12 22:24               ` Mike Snitzer
  2013-09-12 22:24                 ` [PATCH 1/7] dm: lower bio-based mempool reservation Mike Snitzer
                                   ` (7 more replies)
  0 siblings, 8 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

You can pull these changes from the 'devel' branch of:
git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git

To browse, see:
https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=devel

I'm open to other suggestions but I think this is closer to the right
way forward in the near-term (for 3.13).  I welcome any review feedback
you might have.

Thanks,
Mike

Mike Snitzer (7):
  dm: lower bio-based mempool reservation
  dm: add reserved_rq_based_ios module parameter
  dm: add reserved_bio_based_ios module parameter
  dm io: use dm_get_reserved_bio_based_ios to size reserves
  dm mpath: use dm_get_reserved_rq_based_ios to size mempool
  dm: track the maximum number of bios in a cloned request
  dm: optimize clone_rq() when track_peak_rq_based_ios is disabled

 drivers/md/dm-io.c    |   7 +--
 drivers/md/dm-mpath.c |   6 +-
 drivers/md/dm.c       | 169 ++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/dm.h       |   3 +
 4 files changed, 174 insertions(+), 11 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH 1/7] dm: lower bio-based mempool reservation
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 22:40                   ` Mikulas Patocka
  2013-09-12 22:24                 ` [PATCH 2/7] dm: add reserved_rq_based_ios module parameter Mike Snitzer
                                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

Bio-based device mapper processing doesn't need larger mempools (like
request-based DM does), so lower the number of reserved entries for
bio-based operation.  16 was already used for bio-based DM's bioset
but mistakenly wasn't used for its _io_cache.

Formalize difference between bio-based and request-based defaults by
introducing RESERVED_BIO_BASED_IOS and RESERVED_REQUEST_BASED_IOS.

(based on older code from Mikulas Patocka)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6a5e9ed..47bac14 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -211,7 +211,8 @@ struct dm_md_mempools {
 	struct bio_set *bs;
 };
 
-#define MIN_IOS 256
+#define RESERVED_BIO_BASED_IOS		16
+#define RESERVED_REQUEST_BASED_IOS	256
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
@@ -2862,18 +2863,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = 16;
+		pool_size = RESERVED_BIO_BASED_IOS;
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = RESERVED_REQUEST_BASED_IOS;
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
 	if (!pools->io_pool)
 		goto out;
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 2/7] dm: add reserved_rq_based_ios module parameter
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  2013-09-12 22:24                 ` [PATCH 1/7] dm: lower bio-based mempool reservation Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 22:45                   ` Mikulas Patocka
  2013-09-12 22:24                 ` [PATCH 3/7] dm: add reserved_bio_based_ios " Mike Snitzer
                                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

Allow user to change the number of IOs that are reserved by
request-based DM's mempools by writing to this file:
/sys/module/dm_mod/parameters/reserved_rq_based_ios

The default value is RESERVED_REQUEST_BASED_IOS (256).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 47bac14..8553d03 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -216,6 +216,38 @@ struct dm_md_mempools {
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+/*
+ * Request-based DM's mempools' reserved IOs set by the user
+ */
+static unsigned reserved_rq_based_ios;
+
+/*
+ * A copy of reserved_rq_based_ios because it can change anytime.
+ * If values disagree, the user has changed reserved_rq_based_ios.
+ */
+static unsigned reserved_rq_based_ios_latch;
+
+/*
+ * This mutex protects reserved_rq_based_ios_latch.
+ */
+static DEFINE_MUTEX(dm_mempools_lock);
+
+static void __reserved_request_based_ios_refresh(void)
+{
+	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
+
+	reserved_rq_based_ios_latch = ACCESS_ONCE(reserved_rq_based_ios);
+
+	/*
+	 * If the user uses "0", it means default.  Modify
+	 * reserved_rq_based_ios to report the default to the user.
+	 */
+	if (!reserved_rq_based_ios_latch) {
+		(void)cmpxchg(&reserved_rq_based_ios, 0, RESERVED_REQUEST_BASED_IOS);
+		reserved_rq_based_ios_latch = reserved_rq_based_ios;
+	}
+}
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -241,6 +273,10 @@ static int __init local_init(void)
 	if (!_major)
 		_major = r;
 
+	mutex_lock(&dm_mempools_lock);
+	__reserved_request_based_ios_refresh();
+	mutex_unlock(&dm_mempools_lock);
+
 	return 0;
 
 out_uevent_exit:
@@ -2867,7 +2903,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = RESERVED_REQUEST_BASED_IOS;
+
+		mutex_lock(&dm_mempools_lock);
+		/* Check if reserved_rq_based_ios changed. */
+		if (reserved_rq_based_ios != reserved_rq_based_ios_latch)
+			__reserved_request_based_ios_refresh();
+		pool_size = reserved_rq_based_ios_latch;
+		mutex_unlock(&dm_mempools_lock);
+
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
@@ -2925,6 +2968,10 @@ module_exit(dm_exit);
 
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 3/7] dm: add reserved_bio_based_ios module parameter
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  2013-09-12 22:24                 ` [PATCH 1/7] dm: lower bio-based mempool reservation Mike Snitzer
  2013-09-12 22:24                 ` [PATCH 2/7] dm: add reserved_rq_based_ios module parameter Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 22:47                   ` Mikulas Patocka
  2013-09-12 22:24                 ` [PATCH 4/7] dm io: use dm_get_reserved_bio_based_ios to size reserves Mike Snitzer
                                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

Allow user to change the number of IOs that are reserved by
bio-based DM's mempools by writing to this file:
/sys/module/dm_mod/parameters/reserved_bio_based_ios

The default value is RESERVED_BIO_BASED_IOS (16).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8553d03..b44ae19 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -217,6 +217,17 @@ static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
 /*
+ * Bio-based DM's mempools' reserved IOs set by the user
+ */
+static unsigned reserved_bio_based_ios;
+
+/*
+ * A copy of reserved_bio_based_ios because it can change anytime.
+ * If values disagree, the user has changed reserved_bio_based_ios.
+ */
+static unsigned reserved_bio_based_ios_latch;
+
+/*
  * Request-based DM's mempools' reserved IOs set by the user
  */
 static unsigned reserved_rq_based_ios;
@@ -228,10 +239,26 @@ static unsigned reserved_rq_based_ios;
 static unsigned reserved_rq_based_ios_latch;
 
 /*
- * This mutex protects reserved_rq_based_ios_latch.
+ * This mutex protects reserved_bio_based_ios_latch and reserved_rq_based_ios_latch.
  */
 static DEFINE_MUTEX(dm_mempools_lock);
 
+static void __reserved_bio_based_ios_refresh(void)
+{
+	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
+
+	reserved_bio_based_ios_latch = ACCESS_ONCE(reserved_bio_based_ios);
+
+	/*
+	 * If the user uses "0", it means default.  Modify
+	 * reserved_bio_based_ios to report the default to the user.
+	 */
+	if (!reserved_bio_based_ios_latch) {
+		(void)cmpxchg(&reserved_bio_based_ios, 0, RESERVED_BIO_BASED_IOS);
+		reserved_bio_based_ios_latch = reserved_bio_based_ios;
+	}
+}
+
 static void __reserved_request_based_ios_refresh(void)
 {
 	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
@@ -274,6 +301,7 @@ static int __init local_init(void)
 		_major = r;
 
 	mutex_lock(&dm_mempools_lock);
+	__reserved_bio_based_ios_refresh();
 	__reserved_request_based_ios_refresh();
 	mutex_unlock(&dm_mempools_lock);
 
@@ -2899,7 +2927,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = RESERVED_BIO_BASED_IOS;
+
+		mutex_lock(&dm_mempools_lock);
+		/* Check if reserved_bio_based_ios changed. */
+		if (reserved_bio_based_ios != reserved_bio_based_ios_latch)
+			__reserved_bio_based_ios_refresh();
+		pool_size = reserved_bio_based_ios_latch;
+		mutex_unlock(&dm_mempools_lock);
+
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
@@ -2969,6 +3004,9 @@ module_exit(dm_exit);
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
 
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 4/7] dm io: use dm_get_reserved_bio_based_ios to size reserves
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
                                   ` (2 preceding siblings ...)
  2013-09-12 22:24                 ` [PATCH 3/7] dm: add reserved_bio_based_ios " Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 22:48                   ` Mikulas Patocka
  2013-09-12 22:24                 ` [PATCH 5/7] dm mpath: use dm_get_reserved_rq_based_ios to size mempool Mike Snitzer
                                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

Switch dm-io over to sizing its mempool and bioset using DM core's
configurable 'reserved_bio_based_ios'.

Export dm_get_reserved_bio_based_ios() for use by DM targets and core
code.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-io.c |  7 +++----
 drivers/md/dm.c    | 12 ++++++++++++
 drivers/md/dm.h    |  2 ++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..2a20986 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -19,8 +19,6 @@
 #define DM_MSG_PREFIX "io"
 
 #define DM_IO_MAX_REGIONS	BITS_PER_LONG
-#define MIN_IOS		16
-#define MIN_BIOS	16
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -50,16 +48,17 @@ static struct kmem_cache *_dm_io_cache;
 struct dm_io_client *dm_io_client_create(void)
 {
 	struct dm_io_client *client;
+	unsigned min_ios = dm_get_reserved_bio_based_ios();
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b44ae19..3f96c86 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -259,6 +259,18 @@ static void __reserved_bio_based_ios_refresh(void)
 	}
 }
 
+unsigned dm_get_reserved_bio_based_ios(void)
+{
+	unsigned ios;
+
+	mutex_lock(&dm_mempools_lock);
+	ios = reserved_bio_based_ios_latch;
+	mutex_unlock(&dm_mempools_lock);
+
+	return (ios ? : RESERVED_BIO_BASED_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
+
 static void __reserved_request_based_ios_refresh(void)
 {
 	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5e604cc..e1982eb 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -184,6 +184,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
  * Helpers that are used by DM core
  */
+unsigned dm_get_reserved_bio_based_ios(void);
+
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
 {
 	return !maxlen || strlen(result) + 1 >= maxlen;
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 5/7] dm mpath: use dm_get_reserved_rq_based_ios to size mempool
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
                                   ` (3 preceding siblings ...)
  2013-09-12 22:24                 ` [PATCH 4/7] dm io: use dm_get_reserved_bio_based_ios to size reserves Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 22:48                   ` Mikulas Patocka
  2013-09-12 22:24                 ` [PATCH 6/7] dm: track the maximum number of bios in a cloned request Mike Snitzer
                                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

Switch dm-mpath over to sizing its mempool using DM core's configurable
'reserved_rq_based_ios'.

Export dm_get_reserved_rq_based_ios() for use by DM targets and core
code.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c |  6 +++---
 drivers/md/dm.c       | 12 ++++++++++++
 drivers/md/dm.h       |  1 +
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5adede1..ddef91f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -7,6 +7,7 @@
 
 #include <linux/device-mapper.h>
 
+#include "dm.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -116,8 +117,6 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-#define MIN_IOS 256	/* Mempool size */
-
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -190,6 +189,7 @@ static void free_priority_group(struct priority_group *pg,
 static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
+	unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
@@ -202,7 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3f96c86..186f77d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -287,6 +287,18 @@ static void __reserved_request_based_ios_refresh(void)
 	}
 }
 
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	unsigned ios;
+
+	mutex_lock(&dm_mempools_lock);
+	ios = reserved_rq_based_ios_latch;
+	mutex_unlock(&dm_mempools_lock);
+
+	return (ios ? : RESERVED_REQUEST_BASED_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e1982eb..1d1ad7b 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -185,6 +185,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
  * Helpers that are used by DM core
  */
 unsigned dm_get_reserved_bio_based_ios(void);
+unsigned dm_get_reserved_rq_based_ios(void);
 
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
 {
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 6/7] dm: track the maximum number of bios in a cloned request
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
                                   ` (4 preceding siblings ...)
  2013-09-12 22:24                 ` [PATCH 5/7] dm mpath: use dm_get_reserved_rq_based_ios to size mempool Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 22:55                   ` Mikulas Patocka
  2013-09-12 22:24                 ` [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled Mike Snitzer
  2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

If /sys/module/dm_mod/parameters/track_peak_rq_based_ios is set to 1 it
enables the tracking of the maximum number of bios in a cloned request.

The maximum number of bios in a cloned request is then exposed through
the following read-only parameter:
/sys/module/dm_mod/parameters/peak_rq_based_ios

This information can be useful when deciding on a value for dm_mod's
'reserved_rq_based_ios' parameter.  Otherwise, 'track_peak_rq_based_ios'
should be kept disabled (the default).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 186f77d..de83930 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -239,6 +239,12 @@ static unsigned reserved_rq_based_ios;
 static unsigned reserved_rq_based_ios_latch;
 
 /*
+ * Optionally track the maximum numbers of IOs in a cloned request.
+ */
+static unsigned track_peak_rq_based_ios;
+static unsigned peak_rq_based_ios;
+
+/*
  * This mutex protects reserved_bio_based_ios_latch and reserved_rq_based_ios_latch.
  */
 static DEFINE_MUTEX(dm_mempools_lock);
@@ -1719,6 +1725,16 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		return NULL;
 	}
 
+	if (unlikely(ACCESS_ONCE(track_peak_rq_based_ios))) {
+		struct bio *bio;
+		unsigned num_bios = 0;
+
+		__rq_for_each_bio(bio, clone)
+			num_bios++;
+		if (num_bios > ACCESS_ONCE(peak_rq_based_ios))
+			ACCESS_ONCE(peak_rq_based_ios) = num_bios;
+	}
+
 	return clone;
 }
 
@@ -3034,6 +3050,12 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(track_peak_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(track_peak_rq_based_ios, "Enable tracking the maximum IOs in a cloned request");
+
+module_param(peak_rq_based_ios, uint, S_IRUGO);
+MODULE_PARM_DESC(peak_rq_based_ios, "Tracks the maximum IOs in a cloned request");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
                                   ` (5 preceding siblings ...)
  2013-09-12 22:24                 ` [PATCH 6/7] dm: track the maximum number of bios in a cloned request Mike Snitzer
@ 2013-09-12 22:24                 ` Mike Snitzer
  2013-09-12 23:00                   ` Mikulas Patocka
  2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  7 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 22:24 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: dm-devel

Make use of static keys to eliminate the relevant branch in clone_rq()
when dm_mod's 'track_peak_rq_based_ios' parameter is set to 0 (disabled).

Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
until the next request-based table reload.  Once it is disabled the
'peak_rq_based_ios' parameter will be reset to 0.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index de83930..d9e38a7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
+#include <linux/static_key.h>
 
 #include <trace/events/block.h>
 
@@ -242,10 +243,14 @@ static unsigned reserved_rq_based_ios_latch;
  * Optionally track the maximum numbers of IOs in a cloned request.
  */
 static unsigned track_peak_rq_based_ios;
+static unsigned track_peak_rq_based_ios_latch;
 static unsigned peak_rq_based_ios;
 
+static struct static_key_deferred track_peak_rq_based_ios_enabled; /* defaults to false */
+
 /*
- * This mutex protects reserved_bio_based_ios_latch and reserved_rq_based_ios_latch.
+ * This mutex protects reserved_bio_based_ios_latch, reserved_rq_based_ios_latch
+ * and track_peak_rq_based_ios_latch.
  */
 static DEFINE_MUTEX(dm_mempools_lock);
 
@@ -305,6 +310,26 @@ unsigned dm_get_reserved_rq_based_ios(void)
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 
+static void __track_peak_rq_based_ios_refresh(void)
+{
+	bool enabled = static_key_false(&track_peak_rq_based_ios_enabled.key);
+
+	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
+
+	track_peak_rq_based_ios_latch = ACCESS_ONCE(track_peak_rq_based_ios);
+
+	if (track_peak_rq_based_ios_latch) {
+		if (enabled)
+			return; /* already enabled */
+		static_key_slow_inc(&track_peak_rq_based_ios_enabled.key);
+	} else {
+		if (!enabled)
+			return; /* already disabled */
+		static_key_slow_dec(&track_peak_rq_based_ios_enabled.key);
+		ACCESS_ONCE(peak_rq_based_ios) = 0;
+	}
+}
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -1725,7 +1750,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		return NULL;
 	}
 
-	if (unlikely(ACCESS_ONCE(track_peak_rq_based_ios))) {
+	if (static_key_false(&track_peak_rq_based_ios_enabled.key)) {
 		struct bio *bio;
 		unsigned num_bios = 0;
 
@@ -2984,6 +3009,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		if (reserved_rq_based_ios != reserved_rq_based_ios_latch)
 			__reserved_request_based_ios_refresh();
 		pool_size = reserved_rq_based_ios_latch;
+
+		/* Check if track_peak_rq_based_ios changed. */
+		if (track_peak_rq_based_ios != track_peak_rq_based_ios_latch)
+			__track_peak_rq_based_ios_refresh();
 		mutex_unlock(&dm_mempools_lock);
 
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH 1/7] dm: lower bio-based mempool reservation
  2013-09-12 22:24                 ` [PATCH 1/7] dm: lower bio-based mempool reservation Mike Snitzer
@ 2013-09-12 22:40                   ` Mikulas Patocka
  0 siblings, 0 replies; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 22:40 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar

Acked-by: Mikulas Patocka <mpatocka@redhat.com>

On Thu, 12 Sep 2013, Mike Snitzer wrote:

> Bio-based device mapper processing doesn't need larger mempools (like
> request-based DM does), so lower the number of reserved entries for
> bio-based operation.  16 was already used for bio-based DM's bioset
> but mistakenly wasn't used for its _io_cache.
> 
> Formalize difference between bio-based and request-based defaults by
> introducing RESERVED_BIO_BASED_IOS and RESERVED_REQUEST_BASED_IOS.
> 
> (based on older code from Mikulas Patocka)
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 6a5e9ed..47bac14 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -211,7 +211,8 @@ struct dm_md_mempools {
>  	struct bio_set *bs;
>  };
>  
> -#define MIN_IOS 256
> +#define RESERVED_BIO_BASED_IOS		16
> +#define RESERVED_REQUEST_BASED_IOS	256
>  static struct kmem_cache *_io_cache;
>  static struct kmem_cache *_rq_tio_cache;
>  
> @@ -2862,18 +2863,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
>  
>  	if (type == DM_TYPE_BIO_BASED) {
>  		cachep = _io_cache;
> -		pool_size = 16;
> +		pool_size = RESERVED_BIO_BASED_IOS;
>  		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
>  	} else if (type == DM_TYPE_REQUEST_BASED) {
>  		cachep = _rq_tio_cache;
> -		pool_size = MIN_IOS;
> +		pool_size = RESERVED_REQUEST_BASED_IOS;
>  		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
>  		/* per_bio_data_size is not used. See __bind_mempools(). */
>  		WARN_ON(per_bio_data_size != 0);
>  	} else
>  		goto out;
>  
> -	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
> +	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
>  	if (!pools->io_pool)
>  		goto out;
>  
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/7] dm: add reserved_rq_based_ios module parameter
  2013-09-12 22:24                 ` [PATCH 2/7] dm: add reserved_rq_based_ios module parameter Mike Snitzer
@ 2013-09-12 22:45                   ` Mikulas Patocka
  2013-09-12 23:15                     ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 22:45 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar

I think this is too complicated.

dm-bufio uses similar approach like this patch - but in dm-bufio the code 
path is performance critical, we don't want to recalculate memory size 
with each i/o request.

Here it is not performance critical, so I'd drop the mutex, drop the 
latch, drop the function __reserved_request_based_ios_refresh and add only 
these lines:

pool_size = ACCESS_ONCE(reserved_rq_based_ios);
if (!pool_size) pool_size = RESERVED_REQUEST_BASED_IOS;

Mikulas


On Thu, 12 Sep 2013, Mike Snitzer wrote:

> Allow user to change the number of IOs that are reserved by
> request-based DM's mempools by writing to this file:
> /sys/module/dm_mod/parameters/reserved_rq_based_ios
> 
> The default value is RESERVED_REQUEST_BASED_IOS (256).
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 48 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 47bac14..8553d03 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -216,6 +216,38 @@ struct dm_md_mempools {
>  static struct kmem_cache *_io_cache;
>  static struct kmem_cache *_rq_tio_cache;
>  
> +/*
> + * Request-based DM's mempools' reserved IOs set by the user
> + */
> +static unsigned reserved_rq_based_ios;
> +
> +/*
> + * A copy of reserved_rq_based_ios because it can change anytime.
> + * If values disagree, the user has changed reserved_rq_based_ios.
> + */
> +static unsigned reserved_rq_based_ios_latch;
> +
> +/*
> + * This mutex protects reserved_rq_based_ios_latch.
> + */
> +static DEFINE_MUTEX(dm_mempools_lock);
> +
> +static void __reserved_request_based_ios_refresh(void)
> +{
> +	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
> +
> +	reserved_rq_based_ios_latch = ACCESS_ONCE(reserved_rq_based_ios);
> +
> +	/*
> +	 * If the user uses "0", it means default.  Modify
> +	 * reserved_rq_based_ios to report the default to the user.
> +	 */
> +	if (!reserved_rq_based_ios_latch) {
> +		(void)cmpxchg(&reserved_rq_based_ios, 0, RESERVED_REQUEST_BASED_IOS);
> +		reserved_rq_based_ios_latch = reserved_rq_based_ios;
> +	}
> +}
> +
>  static int __init local_init(void)
>  {
>  	int r = -ENOMEM;
> @@ -241,6 +273,10 @@ static int __init local_init(void)
>  	if (!_major)
>  		_major = r;
>  
> +	mutex_lock(&dm_mempools_lock);
> +	__reserved_request_based_ios_refresh();
> +	mutex_unlock(&dm_mempools_lock);
> +
>  	return 0;
>  
>  out_uevent_exit:
> @@ -2867,7 +2903,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
>  		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
>  	} else if (type == DM_TYPE_REQUEST_BASED) {
>  		cachep = _rq_tio_cache;
> -		pool_size = RESERVED_REQUEST_BASED_IOS;
> +
> +		mutex_lock(&dm_mempools_lock);
> +		/* Check if reserved_rq_based_ios changed. */
> +		if (reserved_rq_based_ios != reserved_rq_based_ios_latch)
> +			__reserved_request_based_ios_refresh();
> +		pool_size = reserved_rq_based_ios_latch;
> +		mutex_unlock(&dm_mempools_lock);
> +
>  		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
>  		/* per_bio_data_size is not used. See __bind_mempools(). */
>  		WARN_ON(per_bio_data_size != 0);
> @@ -2925,6 +2968,10 @@ module_exit(dm_exit);
>  
>  module_param(major, uint, 0);
>  MODULE_PARM_DESC(major, "The major number of the device mapper");
> +
> +module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
> +
>  MODULE_DESCRIPTION(DM_NAME " driver");
>  MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
>  MODULE_LICENSE("GPL");
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 3/7] dm: add reserved_bio_based_ios module parameter
  2013-09-12 22:24                 ` [PATCH 3/7] dm: add reserved_bio_based_ios " Mike Snitzer
@ 2013-09-12 22:47                   ` Mikulas Patocka
  2013-09-12 23:11                     ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 22:47 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar

We don't need this. bio-based i/o should be fine with a small mempool, 
there is no need to make it tunable.

Mikulas


On Thu, 12 Sep 2013, Mike Snitzer wrote:

> Allow user to change the number of IOs that are reserved by
> bio-based DM's mempools by writing to this file:
> /sys/module/dm_mod/parameters/reserved_bio_based_ios
> 
> The default value is RESERVED_BIO_BASED_IOS (16).
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 8553d03..b44ae19 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -217,6 +217,17 @@ static struct kmem_cache *_io_cache;
>  static struct kmem_cache *_rq_tio_cache;
>  
>  /*
> + * Bio-based DM's mempools' reserved IOs set by the user
> + */
> +static unsigned reserved_bio_based_ios;
> +
> +/*
> + * A copy of reserved_bio_based_ios because it can change anytime.
> + * If values disagree, the user has changed reserved_bio_based_ios.
> + */
> +static unsigned reserved_bio_based_ios_latch;
> +
> +/*
>   * Request-based DM's mempools' reserved IOs set by the user
>   */
>  static unsigned reserved_rq_based_ios;
> @@ -228,10 +239,26 @@ static unsigned reserved_rq_based_ios;
>  static unsigned reserved_rq_based_ios_latch;
>  
>  /*
> - * This mutex protects reserved_rq_based_ios_latch.
> + * This mutex protects reserved_bio_based_ios_latch and reserved_rq_based_ios_latch.
>   */
>  static DEFINE_MUTEX(dm_mempools_lock);
>  
> +static void __reserved_bio_based_ios_refresh(void)
> +{
> +	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
> +
> +	reserved_bio_based_ios_latch = ACCESS_ONCE(reserved_bio_based_ios);
> +
> +	/*
> +	 * If the user uses "0", it means default.  Modify
> +	 * reserved_bio_based_ios to report the default to the user.
> +	 */
> +	if (!reserved_bio_based_ios_latch) {
> +		(void)cmpxchg(&reserved_bio_based_ios, 0, RESERVED_BIO_BASED_IOS);
> +		reserved_bio_based_ios_latch = reserved_bio_based_ios;
> +	}
> +}
> +
>  static void __reserved_request_based_ios_refresh(void)
>  {
>  	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
> @@ -274,6 +301,7 @@ static int __init local_init(void)
>  		_major = r;
>  
>  	mutex_lock(&dm_mempools_lock);
> +	__reserved_bio_based_ios_refresh();
>  	__reserved_request_based_ios_refresh();
>  	mutex_unlock(&dm_mempools_lock);
>  
> @@ -2899,7 +2927,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
>  
>  	if (type == DM_TYPE_BIO_BASED) {
>  		cachep = _io_cache;
> -		pool_size = RESERVED_BIO_BASED_IOS;
> +
> +		mutex_lock(&dm_mempools_lock);
> +		/* Check if reserved_bio_based_ios changed. */
> +		if (reserved_bio_based_ios != reserved_bio_based_ios_latch)
> +			__reserved_bio_based_ios_refresh();
> +		pool_size = reserved_bio_based_ios_latch;
> +		mutex_unlock(&dm_mempools_lock);
> +
>  		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
>  	} else if (type == DM_TYPE_REQUEST_BASED) {
>  		cachep = _rq_tio_cache;
> @@ -2969,6 +3004,9 @@ module_exit(dm_exit);
>  module_param(major, uint, 0);
>  MODULE_PARM_DESC(major, "The major number of the device mapper");
>  
> +module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
> +
>  module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
>  
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 4/7] dm io: use dm_get_reserved_bio_based_ios to size reserves
  2013-09-12 22:24                 ` [PATCH 4/7] dm io: use dm_get_reserved_bio_based_ios to size reserves Mike Snitzer
@ 2013-09-12 22:48                   ` Mikulas Patocka
  0 siblings, 0 replies; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 22:48 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar

No need to tune it for bio based.

On Thu, 12 Sep 2013, Mike Snitzer wrote:

> Switch dm-io over to sizing its mempool and bioset using DM core's
> configurable 'reserved_bio_based_ios'.
> 
> Export dm_get_reserved_bio_based_ios() for use by DM targets and core
> code.
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm-io.c |  7 +++----
>  drivers/md/dm.c    | 12 ++++++++++++
>  drivers/md/dm.h    |  2 ++
>  3 files changed, 17 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
> index ea49834..2a20986 100644
> --- a/drivers/md/dm-io.c
> +++ b/drivers/md/dm-io.c
> @@ -19,8 +19,6 @@
>  #define DM_MSG_PREFIX "io"
>  
>  #define DM_IO_MAX_REGIONS	BITS_PER_LONG
> -#define MIN_IOS		16
> -#define MIN_BIOS	16
>  
>  struct dm_io_client {
>  	mempool_t *pool;
> @@ -50,16 +48,17 @@ static struct kmem_cache *_dm_io_cache;
>  struct dm_io_client *dm_io_client_create(void)
>  {
>  	struct dm_io_client *client;
> +	unsigned min_ios = dm_get_reserved_bio_based_ios();
>  
>  	client = kmalloc(sizeof(*client), GFP_KERNEL);
>  	if (!client)
>  		return ERR_PTR(-ENOMEM);
>  
> -	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
> +	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
>  	if (!client->pool)
>  		goto bad;
>  
> -	client->bios = bioset_create(MIN_BIOS, 0);
> +	client->bios = bioset_create(min_ios, 0);
>  	if (!client->bios)
>  		goto bad;
>  
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index b44ae19..3f96c86 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -259,6 +259,18 @@ static void __reserved_bio_based_ios_refresh(void)
>  	}
>  }
>  
> +unsigned dm_get_reserved_bio_based_ios(void)
> +{
> +	unsigned ios;
> +
> +	mutex_lock(&dm_mempools_lock);
> +	ios = reserved_bio_based_ios_latch;
> +	mutex_unlock(&dm_mempools_lock);
> +
> +	return (ios ? : RESERVED_BIO_BASED_IOS);
> +}
> +EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
> +
>  static void __reserved_request_based_ios_refresh(void)
>  {
>  	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
> diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> index 5e604cc..e1982eb 100644
> --- a/drivers/md/dm.h
> +++ b/drivers/md/dm.h
> @@ -184,6 +184,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
>  /*
>   * Helpers that are used by DM core
>   */
> +unsigned dm_get_reserved_bio_based_ios(void);
> +
>  static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
>  {
>  	return !maxlen || strlen(result) + 1 >= maxlen;
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 5/7] dm mpath: use dm_get_reserved_rq_based_ios to size mempool
  2013-09-12 22:24                 ` [PATCH 5/7] dm mpath: use dm_get_reserved_rq_based_ios to size mempool Mike Snitzer
@ 2013-09-12 22:48                   ` Mikulas Patocka
  0 siblings, 0 replies; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 22:48 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar

Acked-by: Mikulas Patocka <mpatocka@redhat.com>

On Thu, 12 Sep 2013, Mike Snitzer wrote:

> Switch dm-mpath over to sizing its mempool using DM core's configurable
> 'reserved_rq_based_ios'.
> 
> Export dm_get_reserved_rq_based_ios() for use by DM targets and core
> code.
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm-mpath.c |  6 +++---
>  drivers/md/dm.c       | 12 ++++++++++++
>  drivers/md/dm.h       |  1 +
>  3 files changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
> index 5adede1..ddef91f 100644
> --- a/drivers/md/dm-mpath.c
> +++ b/drivers/md/dm-mpath.c
> @@ -7,6 +7,7 @@
>  
>  #include <linux/device-mapper.h>
>  
> +#include "dm.h"
>  #include "dm-path-selector.h"
>  #include "dm-uevent.h"
>  
> @@ -116,8 +117,6 @@ struct dm_mpath_io {
>  
>  typedef int (*action_fn) (struct pgpath *pgpath);
>  
> -#define MIN_IOS 256	/* Mempool size */
> -
>  static struct kmem_cache *_mpio_cache;
>  
>  static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
> @@ -190,6 +189,7 @@ static void free_priority_group(struct priority_group *pg,
>  static struct multipath *alloc_multipath(struct dm_target *ti)
>  {
>  	struct multipath *m;
> +	unsigned min_ios = dm_get_reserved_rq_based_ios();
>  
>  	m = kzalloc(sizeof(*m), GFP_KERNEL);
>  	if (m) {
> @@ -202,7 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
>  		INIT_WORK(&m->trigger_event, trigger_event);
>  		init_waitqueue_head(&m->pg_init_wait);
>  		mutex_init(&m->work_mutex);
> -		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
> +		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
>  		if (!m->mpio_pool) {
>  			kfree(m);
>  			return NULL;
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 3f96c86..186f77d 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -287,6 +287,18 @@ static void __reserved_request_based_ios_refresh(void)
>  	}
>  }
>  
> +unsigned dm_get_reserved_rq_based_ios(void)
> +{
> +	unsigned ios;
> +
> +	mutex_lock(&dm_mempools_lock);
> +	ios = reserved_rq_based_ios_latch;
> +	mutex_unlock(&dm_mempools_lock);
> +
> +	return (ios ? : RESERVED_REQUEST_BASED_IOS);
> +}
> +EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
> +
>  static int __init local_init(void)
>  {
>  	int r = -ENOMEM;
> diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> index e1982eb..1d1ad7b 100644
> --- a/drivers/md/dm.h
> +++ b/drivers/md/dm.h
> @@ -185,6 +185,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
>   * Helpers that are used by DM core
>   */
>  unsigned dm_get_reserved_bio_based_ios(void);
> +unsigned dm_get_reserved_rq_based_ios(void);
>  
>  static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
>  {
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 6/7] dm: track the maximum number of bios in a cloned request
  2013-09-12 22:24                 ` [PATCH 6/7] dm: track the maximum number of bios in a cloned request Mike Snitzer
@ 2013-09-12 22:55                   ` Mikulas Patocka
  2013-09-12 23:09                     ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 22:55 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar

There's a race condition (it's not serious):

+               if (num_bios > ACCESS_ONCE(peak_rq_based_ios))
+                       ACCESS_ONCE(peak_rq_based_ios) = num_bios;

You can double-check it using code like this:
if (num_bios > ACCESS_ONCE(peak_rq_based_ios)) {
	spin_lock_irqsave(&peak_rq_based_ios_lock, flags);
	if (num_bios > peak_rq_based_ios)
		peak_rq_based_ios = num_bios;
	spin_unlock_irqrestore(&peak_rq_based_ios_lock, flags);
}


Maybe - reset the value peak_rq_based_ios if the user clears 
track_peak_rq_based_ios?

Mikulas

On Thu, 12 Sep 2013, Mike Snitzer wrote:

> If /sys/module/dm_mod/parameters/track_peak_rq_based_ios is set to 1 it
> enables the tracking of the maximum number of bios in a cloned request.
> 
> The maximum number of bios in a cloned request is then exposed through
> the following read-only parameter:
> /sys/module/dm_mod/parameters/peak_rq_based_ios
> 
> This information can be useful when deciding on a value for dm_mod's
> 'reserved_rq_based_ios' parameter.  Otherwise, 'track_peak_rq_based_ios'
> should be kept disabled (the default).
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm.c | 22 ++++++++++++++++++++++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 186f77d..de83930 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -239,6 +239,12 @@ static unsigned reserved_rq_based_ios;
>  static unsigned reserved_rq_based_ios_latch;
>  
>  /*
> + * Optionally track the maximum numbers of IOs in a cloned request.
> + */
> +static unsigned track_peak_rq_based_ios;
> +static unsigned peak_rq_based_ios;
> +
> +/*
>   * This mutex protects reserved_bio_based_ios_latch and reserved_rq_based_ios_latch.
>   */
>  static DEFINE_MUTEX(dm_mempools_lock);
> @@ -1719,6 +1725,16 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
>  		return NULL;
>  	}
>  
> +	if (unlikely(ACCESS_ONCE(track_peak_rq_based_ios))) {
> +		struct bio *bio;
> +		unsigned num_bios = 0;
> +
> +		__rq_for_each_bio(bio, clone)
> +			num_bios++;
> +		if (num_bios > ACCESS_ONCE(peak_rq_based_ios))
> +			ACCESS_ONCE(peak_rq_based_ios) = num_bios;
> +	}
> +
>  	return clone;
>  }
>  
> @@ -3034,6 +3050,12 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
>  module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
>  
> +module_param(track_peak_rq_based_ios, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(track_peak_rq_based_ios, "Enable tracking the maximum IOs in a cloned request");
> +
> +module_param(peak_rq_based_ios, uint, S_IRUGO);
> +MODULE_PARM_DESC(peak_rq_based_ios, "Tracks the maximum IOs in a cloned request");
> +
>  MODULE_DESCRIPTION(DM_NAME " driver");
>  MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
>  MODULE_LICENSE("GPL");
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-12 22:24                 ` [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled Mike Snitzer
@ 2013-09-12 23:00                   ` Mikulas Patocka
  2013-09-12 23:06                     ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 23:00 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar



On Thu, 12 Sep 2013, Mike Snitzer wrote:

> Make use of static keys to eliminate the relevant branch in clone_rq()
> > when dm_mod's 'track_peak_rq_based_ios' parameter is set to 0 (disabled).
> 
> Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
> until the next request-based table reload.  Once it is disabled the
> 'peak_rq_based_ios' parameter will be reset to 0.

This patch changes it so that the value track_peak_rq_based_ios is sampled 
only when reloading a table. I think it will be confusing to the user if 
he changes the value, but the change doesn't take effect until something 
reloads some table.

Mikulas

> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm.c | 33 +++++++++++++++++++++++++++++++--
>  1 file changed, 31 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index de83930..d9e38a7 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -19,6 +19,7 @@
>  #include <linux/idr.h>
>  #include <linux/hdreg.h>
>  #include <linux/delay.h>
> +#include <linux/static_key.h>
>  
>  #include <trace/events/block.h>
>  
> @@ -242,10 +243,14 @@ static unsigned reserved_rq_based_ios_latch;
>   * Optionally track the maximum numbers of IOs in a cloned request.
>   */
>  static unsigned track_peak_rq_based_ios;
> +static unsigned track_peak_rq_based_ios_latch;
>  static unsigned peak_rq_based_ios;
>  
> +static struct static_key_deferred track_peak_rq_based_ios_enabled; /* defaults to false */
> +
>  /*
> - * This mutex protects reserved_bio_based_ios_latch and reserved_rq_based_ios_latch.
> + * This mutex protects reserved_bio_based_ios_latch, reserved_rq_based_ios_latch
> + * and track_peak_rq_based_ios_latch.
>   */
>  static DEFINE_MUTEX(dm_mempools_lock);
>  
> @@ -305,6 +310,26 @@ unsigned dm_get_reserved_rq_based_ios(void)
>  }
>  EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
>  
> +static void __track_peak_rq_based_ios_refresh(void)
> +{
> +	bool enabled = static_key_false(&track_peak_rq_based_ios_enabled.key);
> +
> +	BUG_ON(!mutex_is_locked(&dm_mempools_lock));
> +
> +	track_peak_rq_based_ios_latch = ACCESS_ONCE(track_peak_rq_based_ios);
> +
> +	if (track_peak_rq_based_ios_latch) {
> +		if (enabled)
> +			return; /* already enabled */
> +		static_key_slow_inc(&track_peak_rq_based_ios_enabled.key);
> +	} else {
> +		if (!enabled)
> +			return; /* already disabled */
> +		static_key_slow_dec(&track_peak_rq_based_ios_enabled.key);
> +		ACCESS_ONCE(peak_rq_based_ios) = 0;
> +	}
> +}
> +
>  static int __init local_init(void)
>  {
>  	int r = -ENOMEM;
> @@ -1725,7 +1750,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
>  		return NULL;
>  	}
>  
> -	if (unlikely(ACCESS_ONCE(track_peak_rq_based_ios))) {
> +	if (static_key_false(&track_peak_rq_based_ios_enabled.key)) {
>  		struct bio *bio;
>  		unsigned num_bios = 0;
>  
> @@ -2984,6 +3009,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
>  		if (reserved_rq_based_ios != reserved_rq_based_ios_latch)
>  			__reserved_request_based_ios_refresh();
>  		pool_size = reserved_rq_based_ios_latch;
> +
> +		/* Check if track_peak_rq_based_ios changed. */
> +		if (track_peak_rq_based_ios != track_peak_rq_based_ios_latch)
> +			__track_peak_rq_based_ios_refresh();
>  		mutex_unlock(&dm_mempools_lock);
>  
>  		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-12 23:00                   ` Mikulas Patocka
@ 2013-09-12 23:06                     ` Mike Snitzer
  2013-09-12 23:30                       ` Mikulas Patocka
  0 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 23:06 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Frank Mayhar

On Thu, Sep 12 2013 at  7:00pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Thu, 12 Sep 2013, Mike Snitzer wrote:
> 
> > Make use of static keys to eliminate the relevant branch in clone_rq()
> > when dm_mod's 'track_peak_rq_based_ios' paramter is set to 0 (disabled).
> > 
> > Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
> > until the next request-based table reload.  Once it is disabled the
> > 'peak_rq_based_ios' parameter will be reset to 0.
> 
> This patch changes it so that the value track_peak_rq_based_ios is sampled 
> only when reloading a table. I think it will be confusing to the user if 
> he changes the value, but the change doesn't take effect until something 
> reloads some table.

Well we need to be able to notice the change but I do not wish to do so
in a performance critical path (which clone_rq is).

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 6/7] dm: track the maximum number of bios in a cloned request
  2013-09-12 22:55                   ` Mikulas Patocka
@ 2013-09-12 23:09                     ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 23:09 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Frank Mayhar

On Thu, Sep 12 2013 at  6:55pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> There's a race condition (it's not serious):
> 
> +               if (num_bios > ACCESS_ONCE(peak_rq_based_ios))
> +                       ACCESS_ONCE(peak_rq_based_ios) = num_bios;
> 
> You can double-check it using code like this:
> if (num_bios > ACCESS_ONCE(peak_rq_based_ios)) {
> 	spin_lock_irqsave(&peak_rq_based_ios_lock, flags);
> 	if (num_bios > peak_rq_based_ios)
> 		peak_rq_based_ios = num_bios;
> 	spin_unlock_irqrestore(&peak_rq_based_ios_lock, flags);
> }

Yeah I thought about using double checked locking but didn't want to
incur the extra locking.  Figured precise answer for peak_rq_based_ios
wasn't of utmost importance.  But I'll think about doing it.

> Maybe - reset the value peak_rq_based_ios if the user clears 
> track_peak_rq_based_ios?

Well until patch 7 there isn't a convenient place to trap the change.
And you already took issue with how I did it in that patch.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 3/7] dm: add reserved_bio_based_ios module parameter
  2013-09-12 22:47                   ` Mikulas Patocka
@ 2013-09-12 23:11                     ` Mike Snitzer
  2013-09-12 23:17                       ` Mikulas Patocka
  0 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 23:11 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Frank Mayhar

On Thu, Sep 12 2013 at  6:47pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> We don't need this. bio-based i/o should be fine with a small mempool, 
> there is no need to make it tunable.

I'd like to get Frank's insight here.  He clearly had an interest in
tuning bio-based also.  While 16 shouldn't really hurt it could still be
artificially high.  I'm not opposed to exposing a sane default but
allowing others to experiment (in production workloads) with smaller
values that still enable forward progress should memory get exhausted.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/7] dm: add reserved_rq_based_ios module parameter
  2013-09-12 22:45                   ` Mikulas Patocka
@ 2013-09-12 23:15                     ` Mike Snitzer
  2013-09-12 23:27                       ` Mikulas Patocka
  0 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 23:15 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Frank Mayhar

On Thu, Sep 12 2013 at  6:45pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> I think this is too complicated.
> 
> dm-bufio uses similar approach like this patch - but in dm-bufio the code 
> path is performance critical, we don't want to recalculate memory size 
> with each i/o request.
> 
> Here it is not performance critical, so I'd drop the mutex, drop the 
> latch, drop the function __reserved_request_based_ios_refresh and add only 
> these lines:
> 
> pool_size = ACCESS_ONCE(reserved_rq_based_ios);
> if (!pool_size) pool_size = RESERVED_REQUEST_BASED_IOS;

Could be I'm missing something but isn't the locking about correctness
rather than performance?

Also, using the latch enables the dm_get_reserved_rq_based_ios()
interface that is used in patch 5.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 3/7] dm: add reserved_bio_based_ios module parameter
  2013-09-12 23:11                     ` Mike Snitzer
@ 2013-09-12 23:17                       ` Mikulas Patocka
  2013-09-18 15:17                         ` Frank Mayhar
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 23:17 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar



On Thu, 12 Sep 2013, Mike Snitzer wrote:

> On Thu, Sep 12 2013 at  6:47pm -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > We don't need this. bio-based i/o should be fine with a small mempool, 
> > there is no need to make it tunable.
> 
> I'd like to get Frank's insight here.  He clearly had an interest in
> tuning bio-based also.  While 16 shouldn't really hurt it could still be
> artificially high.  I'm not opposed to exposing a sane default but
> allowing others to experiment (in production workloads) with smaller
> values that still enable forward progress should memory get exhausted.
> 
> Mike

I would do it this way: if Frank gets a measurable improvement in memory 
consumption when the value is dropped from 16 to a lower number (4 or 
maybe 1), then I would drop the value by default (don't make it tunable, 
drop it for all users).

If there is no improvement when the value is lowered, I'd leave it as it 
is, on 16.

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/7] dm: add reserved_rq_based_ios module parameter
  2013-09-12 23:15                     ` Mike Snitzer
@ 2013-09-12 23:27                       ` Mikulas Patocka
  2013-09-12 23:32                         ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 23:27 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar



On Thu, 12 Sep 2013, Mike Snitzer wrote:

> On Thu, Sep 12 2013 at  6:45pm -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > I think this is too complicated.
> > 
> > dm-bufio uses similar approach like this patch - but in dm-bufio the code 
> > path is performance critical, we don't want to recalculate memory size 
> > with each i/o request.
> > 
> > Here it is not performance critical, so I'd drop the mutex, drop the 
> > latch, drop the function __reserved_request_based_ios_refresh and add only 
> > these lines:
> > 
> > pool_size = ACCESS_ONCE(reserved_rq_based_ios);
> > if (!pool_size) pool_size = RESERVED_REQUEST_BASED_IOS;
> 
> Could be I'm missing something but isn't the locking about correctness
> rather than performance?

If the user changes the value while you are allocating the mempool, you 
can allocate the mempool with the old or new value, it doesn't matter. 
There is nothing to lock against.

> Also, using the latch enables the dm_get_reserved_rq_based_ios()
> interface that is used in patch 5.
> 
> Mike

You can just define dm_get_reserved_rq_based_ios as this:
unsigned dm_get_reserved_rq_based_ios(void)
{
	unsigned pool_size = ACCESS_ONCE(reserved_rq_based_ios);
	if (!pool_size) pool_size = RESERVED_REQUEST_BASED_IOS;
	return pool_size;
}
... and call it when you need the pool size.

The purpose of the latch in dm-bufio is to avoid memory size recalculation 
with each buffer allocation. Here, there is nothing to recalculate and the 
code path is not performance-critical.

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-12 23:06                     ` Mike Snitzer
@ 2013-09-12 23:30                       ` Mikulas Patocka
  2013-09-12 23:53                         ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-12 23:30 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Frank Mayhar



On Thu, 12 Sep 2013, Mike Snitzer wrote:

> On Thu, Sep 12 2013 at  7:00pm -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > 
> > 
> > On Thu, 12 Sep 2013, Mike Snitzer wrote:
> > 
> > > Make use of static keys to eliminate the relevant branch in clone_rq()
> > > when dm_mod's 'track_peak_rq_based_ios' parameter is set to 0 (disabled).
> > > 
> > > Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
> > > until the next request-based table reload.  Once it is disabled the
> > > 'peak_rq_based_ios' parameter will be reset to 0.
> > 
> > This patch changes it so that the value track_peak_rq_based_ios is sampled 
> > only when reloading a table. I think it will be confusing to the user if 
> > he changes the value, but the change doesn't take effect until something 
> > reloads some table.
> 
> Well we need to be able to notice the change but I do not wish to do so
> in a performance critical path (which clone_rq is).

I think one condition isn't that bad so there is no need to use static 
keys.

If you want to use static keys, you need to refresh them in some place 
that is called regularly.

Mikulas

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/7] dm: add reserved_rq_based_ios module parameter
  2013-09-12 23:27                       ` Mikulas Patocka
@ 2013-09-12 23:32                         ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 23:32 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Frank Mayhar

On Thu, Sep 12 2013 at  7:27pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Thu, 12 Sep 2013, Mike Snitzer wrote:
> 
> > On Thu, Sep 12 2013 at  6:45pm -0400,
> > Mikulas Patocka <mpatocka@redhat.com> wrote:
> > 
> > > I think this is too complicated.
> > > 
> > > dm-bufio uses similar approach like this patch - but in dm-bufio the code 
> > > path is performance critical, we don't want to recalculate memory size 
> > > with each i/o request.
> > > 
> > > Here it is not performance critical, so I'd drop the mutex, drop the 
> > > latch, drop the function __reserved_request_based_ios_refresh and add only 
> > > these lines:
> > > 
> > > pool_size = ACCESS_ONCE(reserved_rq_based_ios);
> > > if (!pool_size) pool_size = RESERVED_REQUEST_BASED_IOS;
> > 
> > Could be I'm missing something but isn't the locking about correctness
> > rather than performance?
> 
> If the user changes the value while you are allocating the mempool, you 
> can allocate the mempool with the old or new value, it doesn't matter. 
> There is nothing to lock against.
> 
> > Also, using the latch enables the dm_get_reserved_rq_based_ios()
> > interface that is used in patch 5.
> > 
> > Mike
> 
> You can just define dm_get_reserved_rq_based_ios as this:
> unsigned dm_get_reserved_rq_based_ios(void)
> {
> 	unsigned pool_size = ACCESS_ONCE(reserved_rq_based_ios);
> 	if (!pool_size) pool_size = RESERVED_REQUEST_BASED_IOS;
> 	return pool_size;
> }
> ... and call it when you need the pool size.
> 
> The purpose of the latch in dm-bufio is to avoid memory size recalculation 
> with each buffer allocation. Here, there is nothing to recalculate and the 
> code path is not performance-critical.

OK, I'll revise the patch.

Thanks,
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-12 23:30                       ` Mikulas Patocka
@ 2013-09-12 23:53                         ` Mike Snitzer
  2013-09-13  4:46                           ` Jun'ichi Nomura
  0 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-12 23:53 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Frank Mayhar

On Thu, Sep 12 2013 at  7:30pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Thu, 12 Sep 2013, Mike Snitzer wrote:
> 
> > On Thu, Sep 12 2013 at  7:00pm -0400,
> > Mikulas Patocka <mpatocka@redhat.com> wrote:
> > 
> > > 
> > > 
> > > On Thu, 12 Sep 2013, Mike Snitzer wrote:
> > > 
> > > > Make use of static keys to eliminate the relevant branch in clone_rq()
> > > > when dm_mod's 'track_peak_rq_based_ios' parameter is set to 0 (disabled).
> > > > 
> > > > Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
> > > > until the next request-based table reload.  Once it is disabled the
> > > > 'peak_rq_based_ios' parameter will be reset to 0.
> > > 
> > > This patch changes it so that the value track_peak_rq_based_ios is sampled 
> > > only when reloading a table. I think it will be confusing to the user if 
> > > he changes the value, but the change doesn't take effect until something 
> > > reloads some table.
> > 
> > Well we need to be able to notice the change but I do not wish to do so
> > in a performance critical path (which clone_rq is).
> 
> I think one condition isn't that bad so there is no need to use static 
> keys.

Considering we're talking about the IO path I'd rather avoid it given
'track_peak_rq_based_ios' will almost always be disabled.

> If you want to use static keys, you need to refresh them in some place 
> that is called regularly.

No you don't.  I'm assuming if someone enables 'track_peak_rq_based_ios'
they know what they are doing.

But I'm open to suggestions on a more appropriate hook to be catching
the update to track_peak_rq_based_ios.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-12 23:53                         ` Mike Snitzer
@ 2013-09-13  4:46                           ` Jun'ichi Nomura
  2013-09-13 13:04                             ` Mike Snitzer
  2013-09-13 14:34                             ` Mikulas Patocka
  0 siblings, 2 replies; 63+ messages in thread
From: Jun'ichi Nomura @ 2013-09-13  4:46 UTC (permalink / raw)
  To: device-mapper development, Mike Snitzer, Mikulas Patocka, Frank Mayhar

On 09/13/13 08:53, Mike Snitzer wrote:
> On Thu, Sep 12 2013 at  7:30pm -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
>>>>> Make use of static keys to eliminate the relevant branch in clone_rq()
>>>>> when dm_mod's 'track_peak_rq_based_ios' parameter is set to 0 (disabled).
>>>>>
>>>>> Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
>>>>> until the next request-based table reload.  Once it is disabled the
>>>>> 'peak_rq_based_ios' parameter will be reset to 0.
>>>>
>>>> This patch changes it so that the value track_peak_rq_based_ios is sampled 
>>>> only when reloading a table. I think it will be confusing to the user if 
>>>> he changes the value, but the change doesn't take effect until something 
>>>> reloads some table.
>>>
>>> Well we need to be able to notice the change but I do not wish to do so
>>> in a performance critical path (which clone_rq is).
>>
>> I think one condition isn't that bad so there is no need to use static 
>> keys.
> 
> Considering we're talking about the IO path I'd rather avoid it given
> 'track_peak_rq_based_ios' will almost always be disabled.
> 
>> If you want to use static keys, you need to refresh them in some place 
>> that it called regularly.
> 
> No you don't.  I'm assuming if someone enables 'track_peak_rq_based_ios'
> they know what they are doing.
> 
> But I'm open to suggestions on a more appropriate hook to be catching
> the update to track_peak_rq_based_ios.

Other approach might be putting the info on tracepoints.
Then the tracepoint framework will take care of the static_key thing.

Since 'the number of bios in a request' is a block-layer generic info,
you could extend the block layer events instead of having your own.

Attached patch adds it to block_rq_remap event.
You could do something like this to see requests with many bios:
  echo 'nr_bios > 32' > /sys/kernel/debug/tracing/events/block/block_rq_remap/filter
  echo 1 > /sys/kernel/debug/tracing/events/block/block_rq_remap/enable 

For example,
     kworker/1:1H-308   [001] d..1   276.242448: block_rq_remap: 8,64 W 1967760 + 512 <- (253,1) 1967760 64
     kworker/1:1H-308   [001] d..1   276.242482: block_rq_remap: 8,160 W 1968272 + 512 <- (253,1) 1968272 64
     kworker/1:1H-308   [001] d..1   276.242515: block_rq_remap: 65,0 W 1968784 + 512 <- (253,1) 1968784 64
# the last item in line is the number of bios. ("64" in this case)

---
Jun'ichi Nomura, NEC Corporation


diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2fdb4a4..0e6f765 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -862,6 +862,17 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 	return blk_queue_get_max_sectors(q, rq->cmd_flags);
 }
 
+static inline unsigned int blk_rq_count_bios(struct request *rq)
+{
+	unsigned int nr_bios = 0;
+	struct bio *bio;
+
+	__rq_for_each_bio(bio, rq)
+		nr_bios++;
+
+	return nr_bios;
+}
+
 /*
  * Request issue related functions.
  */
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 60ae7c3..4c2301d 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -618,6 +618,7 @@ TRACE_EVENT(block_rq_remap,
 		__field( unsigned int,	nr_sector	)
 		__field( dev_t,		old_dev		)
 		__field( sector_t,	old_sector	)
+		__field( unsigned int,	nr_bios		)
 		__array( char,		rwbs,	RWBS_LEN)
 	),
 
@@ -627,15 +628,16 @@ TRACE_EVENT(block_rq_remap,
 		__entry->nr_sector	= blk_rq_sectors(rq);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
+		__entry->nr_bios	= blk_rq_count_bios(rq);
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 	),
 
-	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
 		  (unsigned long long)__entry->sector,
 		  __entry->nr_sector,
 		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
-		  (unsigned long long)__entry->old_sector)
+		  (unsigned long long)__entry->old_sector, __entry->nr_bios)
 );
 
 #endif /* _TRACE_BLOCK_H */

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-13  4:46                           ` Jun'ichi Nomura
@ 2013-09-13 13:04                             ` Mike Snitzer
  2013-09-13 14:34                             ` Mikulas Patocka
  1 sibling, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 13:04 UTC (permalink / raw)
  To: Jun'ichi Nomura
  Cc: axboe, Frank Mayhar, device-mapper development, Mikulas Patocka

On Fri, Sep 13 2013 at 12:46am -0400,
Jun'ichi Nomura <j-nomura@ce.jp.nec.com> wrote:

> On 09/13/13 08:53, Mike Snitzer wrote:
> > On Thu, Sep 12 2013 at  7:30pm -0400,
> > Mikulas Patocka <mpatocka@redhat.com> wrote:
> >>>>> Make use of static keys to eliminate the relevant branch in clone_rq()
> >>>>> when dm_mod's 'track_peak_rq_based_ios' parameter is set to 0 (disabled).
> >>>>>
> >>>>> Even if 'track_peak_rq_based_ios' is set to 0 it will not take effect
> >>>>> until the next request-based table reload.  Once it is disabled the
> >>>>> 'peak_rq_based_ios' parameter will be reset to 0.
> >>>>
> >>>> This patch changes it so that the value track_peak_rq_based_ios is sampled 
> >>>> only when reloading a table. I think it will be confusing to the user if 
> >>>> he changes the value, but the change doesn't take effect until something 
> >>>> reloads some table.
> >>>
> >>> Well we need to be able to notice the change but I do not wish to do so
> >>> in a performance critical path (which clone_rq is).
> >>
> >> I think one condition isn't that bad so there is no need to use static 
> >> keys.
> > 
> > Considering we're talking about the IO path I'd rather avoid it given
> > 'track_peak_rq_based_ios' will almost always be disabled.
> > 
> >> If you want to use static keys, you need to refresh them in some place 
> >> that is called regularly.
> > 
> > No you don't.  I'm assuming if someone enables 'track_peak_rq_based_ios'
> > they know what they are doing.
> > 
> > But I'm open to suggestions on a more appropriate hook to be catching
> > the update to track_peak_rq_based_ios.
> 
> Other approach might be putting the info on tracepoints.
> Then the tracepoint framework will take care of the static_key thing.
> 
> Since 'the number of bios in a request' is a block-layer generic info,
> you could extend the block layer events instead of having your own.

Sounds good, I like this better.
 
> Attached patch adds it to block_rq_remap event.
> You could do something like this to see requests with many bios:
>   echo 'nr_bios > 32' > /sys/kernel/debug/tracing/events/block/block_rq_remap/filter
>   echo 1 > /sys/kernel/debug/tracing/events/block/block_rq_remap/enable 
> 
> For example,
>      kworker/1:1H-308   [001] d..1   276.242448: block_rq_remap: 8,64 W 1967760 + 512 <- (253,1) 1967760 64
>      kworker/1:1H-308   [001] d..1   276.242482: block_rq_remap: 8,160 W 1968272 + 512 <- (253,1) 1968272 64
>      kworker/1:1H-308   [001] d..1   276.242515: block_rq_remap: 65,0 W 1968784 + 512 <- (253,1) 1968784 64
> # the last item in line is the number of bios. ("64" in this case)

Acked-by: Mike Snitzer <snitzer@redhat.com>

Any chance you'd be open to adding a proper header (with your
Signed-off-by, my Ack, etc) and cc Jens on v2?

I can do it if you'd like but figured it more appropriate for you to
submit your work.

Thanks,
Mike

Jens, FYI here is the patch Jun'ichi developed as an alternative to these
2 DM-specific patches:
http://www.redhat.com/archives/dm-devel/2013-September/msg00030.html
http://www.redhat.com/archives/dm-devel/2013-September/msg00031.html

> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 2fdb4a4..0e6f765 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -862,6 +862,17 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
>  	return blk_queue_get_max_sectors(q, rq->cmd_flags);
>  }
>  
> +static inline unsigned int blk_rq_count_bios(struct request *rq)
> +{
> +	unsigned int nr_bios = 0;
> +	struct bio *bio;
> +
> +	__rq_for_each_bio(bio, rq)
> +		nr_bios++;
> +
> +	return nr_bios;
> +}
> +
>  /*
>   * Request issue related functions.
>   */
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index 60ae7c3..4c2301d 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -618,6 +618,7 @@ TRACE_EVENT(block_rq_remap,
>  		__field( unsigned int,	nr_sector	)
>  		__field( dev_t,		old_dev		)
>  		__field( sector_t,	old_sector	)
> +		__field( unsigned int,	nr_bios		)
>  		__array( char,		rwbs,	RWBS_LEN)
>  	),
>  
> @@ -627,15 +628,16 @@ TRACE_EVENT(block_rq_remap,
>  		__entry->nr_sector	= blk_rq_sectors(rq);
>  		__entry->old_dev	= dev;
>  		__entry->old_sector	= from;
> +		__entry->nr_bios	= blk_rq_count_bios(rq);
>  		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
>  	),
>  
> -	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
> +	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
>  		  (unsigned long long)__entry->sector,
>  		  __entry->nr_sector,
>  		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
> -		  (unsigned long long)__entry->old_sector)
> +		  (unsigned long long)__entry->old_sector, __entry->nr_bios)
>  );
>  
>  #endif /* _TRACE_BLOCK_H */
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled
  2013-09-13  4:46                           ` Jun'ichi Nomura
  2013-09-13 13:04                             ` Mike Snitzer
@ 2013-09-13 14:34                             ` Mikulas Patocka
  1 sibling, 0 replies; 63+ messages in thread
From: Mikulas Patocka @ 2013-09-13 14:34 UTC (permalink / raw)
  To: Jun'ichi Nomura; +Cc: device-mapper development, Frank Mayhar, Mike Snitzer

This is good idea.

Mikulas


On Fri, 13 Sep 2013, Jun'ichi Nomura wrote:

> Other approach might be putting the info on tracepoints.
> Then the tracepoint framework will take care of the static_key thing.
> 
> Since 'the number of bios in a request' is a block-layer generic info,
> you could extend the block layer events instead of having your own.
> 
> Attached patch adds it to block_rq_remap event.
> You could do something like this to see requests with many bios:
>   echo 'nr_bios > 32' > /sys/kernel/debug/tracing/events/block/block_rq_remap/filter
>   echo 1 > /sys/kernel/debug/tracing/events/block/block_rq_remap/enable 
> 
> For example,
>      kworker/1:1H-308   [001] d..1   276.242448: block_rq_remap: 8,64 W 1967760 + 512 <- (253,1) 1967760 64
>      kworker/1:1H-308   [001] d..1   276.242482: block_rq_remap: 8,160 W 1968272 + 512 <- (253,1) 1968272 64
>      kworker/1:1H-308   [001] d..1   276.242515: block_rq_remap: 65,0 W 1968784 + 512 <- (253,1) 1968784 64
> # the last item in line is the number of bios. ("64" in this case)
> 
> ---
> Jun'ichi Nomura, NEC Corporation
> 
> 
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 2fdb4a4..0e6f765 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -862,6 +862,17 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
>  	return blk_queue_get_max_sectors(q, rq->cmd_flags);
>  }
>  
> +static inline unsigned int blk_rq_count_bios(struct request *rq)
> +{
> +	unsigned int nr_bios = 0;
> +	struct bio *bio;
> +
> +	__rq_for_each_bio(bio, rq)
> +		nr_bios++;
> +
> +	return nr_bios;
> +}
> +
>  /*
>   * Request issue related functions.
>   */
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index 60ae7c3..4c2301d 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -618,6 +618,7 @@ TRACE_EVENT(block_rq_remap,
>  		__field( unsigned int,	nr_sector	)
>  		__field( dev_t,		old_dev		)
>  		__field( sector_t,	old_sector	)
> +		__field( unsigned int,	nr_bios		)
>  		__array( char,		rwbs,	RWBS_LEN)
>  	),
>  
> @@ -627,15 +628,16 @@ TRACE_EVENT(block_rq_remap,
>  		__entry->nr_sector	= blk_rq_sectors(rq);
>  		__entry->old_dev	= dev;
>  		__entry->old_sector	= from;
> +		__entry->nr_bios	= blk_rq_count_bios(rq);
>  		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
>  	),
>  
> -	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
> +	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
>  		  (unsigned long long)__entry->sector,
>  		  __entry->nr_sector,
>  		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
> -		  (unsigned long long)__entry->old_sector)
> +		  (unsigned long long)__entry->old_sector, __entry->nr_bios)
>  );
>  
>  #endif /* _TRACE_BLOCK_H */
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned
  2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
                                   ` (6 preceding siblings ...)
  2013-09-12 22:24                 ` [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled Mike Snitzer
@ 2013-09-13 18:59                 ` Mike Snitzer
  2013-09-13 18:59                   ` [PATCH v2 1/3] dm: lower bio-based mempool reservation Mike Snitzer
                                     ` (3 more replies)
  7 siblings, 4 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 18:59 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

You can pull these changes from the 'devel' branch of:
git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git

To browse, see:
https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=devel

v2 changes:
Simplified the implementation.  Dropped the peak_reserved_rq_based_ios
tracking, we'll use the tracepoint patch Jun'ichi Nomura suggested:
http://www.redhat.com/archives/dm-devel/2013-September/msg00048.html

Mike Snitzer (3):
  dm: lower bio-based mempool reservation
  dm: add reserved_rq_based_ios module parameter
  dm: add reserved_bio_based_ios module parameter

 drivers/md/dm-io.c    |    7 +++----
 drivers/md/dm-mpath.c |    6 +++---
 drivers/md/dm.c       |   38 ++++++++++++++++++++++++++++++++++----
 drivers/md/dm.h       |    3 +++
 4 files changed, 43 insertions(+), 11 deletions(-)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v2 1/3] dm: lower bio-based mempool reservation
  2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
@ 2013-09-13 18:59                   ` Mike Snitzer
  2013-09-13 18:59                   ` [PATCH v2 2/3] dm: add reserved_rq_based_ios module parameter Mike Snitzer
                                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 18:59 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

Bio-based device mapper processing doesn't need larger mempools (like
request-based DM does), so lower the number of reserved entries for
bio-based operation.  16 was already used for bio-based DM's bioset
but mistakenly wasn't used for its _io_cache.

Formalize difference between bio-based and request-based defaults by
introducing RESERVED_BIO_BASED_IOS and RESERVED_REQUEST_BASED_IOS.

(based on older code from Mikulas Patocka)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
---
 drivers/md/dm.c |    9 +++++----
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6a5e9ed..47bac14 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -211,7 +211,8 @@ struct dm_md_mempools {
 	struct bio_set *bs;
 };
 
-#define MIN_IOS 256
+#define RESERVED_BIO_BASED_IOS		16
+#define RESERVED_REQUEST_BASED_IOS	256
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
@@ -2862,18 +2863,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = 16;
+		pool_size = RESERVED_BIO_BASED_IOS;
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = RESERVED_REQUEST_BASED_IOS;
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
 	if (!pools->io_pool)
 		goto out;
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v2 2/3] dm: add reserved_rq_based_ios module parameter
  2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  2013-09-13 18:59                   ` [PATCH v2 1/3] dm: lower bio-based mempool reservation Mike Snitzer
@ 2013-09-13 18:59                   ` Mike Snitzer
  2013-09-13 18:59                   ` [PATCH v2 3/3] dm: add reserved_bio_based_ios " Mike Snitzer
  2013-09-13 19:22                   ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  3 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 18:59 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

Allow user to change the number of IOs that are reserved by
request-based DM's mempools by writing to this file:
/sys/module/dm_mod/parameters/reserved_rq_based_ios

The default value is RESERVED_REQUEST_BASED_IOS (256).

Export dm_get_reserved_rq_based_ios() for use by DM targets and core
code.  Switch to sizing dm-mpath's mempool using DM core's configurable
'reserved_rq_based_ios'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
---
 drivers/md/dm-mpath.c |    6 +++---
 drivers/md/dm.c       |   17 ++++++++++++++++-
 drivers/md/dm.h       |    2 ++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index b759a12..e0bed10 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -7,6 +7,7 @@
 
 #include <linux/device-mapper.h>
 
+#include "dm.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -116,8 +117,6 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-#define MIN_IOS 256	/* Mempool size */
-
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -190,6 +189,7 @@ static void free_priority_group(struct priority_group *pg,
 static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
+	unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
@@ -202,7 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 47bac14..5a4854f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -216,6 +216,17 @@ struct dm_md_mempools {
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	return ACCESS_ONCE(reserved_rq_based_ios);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -2867,7 +2878,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = RESERVED_REQUEST_BASED_IOS;
+		pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
@@ -2925,6 +2936,10 @@ module_exit(dm_exit);
 
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5e604cc..1539650 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -184,6 +184,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
  * Helpers that are used by DM core
  */
+unsigned dm_get_reserved_rq_based_ios(void);
+
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
 {
 	return !maxlen || strlen(result) + 1 >= maxlen;
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v2 3/3] dm: add reserved_bio_based_ios module parameter
  2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  2013-09-13 18:59                   ` [PATCH v2 1/3] dm: lower bio-based mempool reservation Mike Snitzer
  2013-09-13 18:59                   ` [PATCH v2 2/3] dm: add reserved_rq_based_ios module parameter Mike Snitzer
@ 2013-09-13 18:59                   ` Mike Snitzer
  2013-09-13 19:22                   ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
  3 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 18:59 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

Allow user to change the number of IOs that are reserved by
bio-based DM's mempools by writing to this file:
/sys/module/dm_mod/parameters/reserved_bio_based_ios

The default value is RESERVED_BIO_BASED_IOS (16).

Export dm_get_reserved_bio_based_ios() for use by DM targets and core
code.  Switch to sizing dm-io's mempool and bioset using DM core's
configurable 'reserved_bio_based_ios'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-io.c |    7 +++----
 drivers/md/dm.c    |   16 +++++++++++++++-
 drivers/md/dm.h    |    1 +
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..2a20986 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -19,8 +19,6 @@
 #define DM_MSG_PREFIX "io"
 
 #define DM_IO_MAX_REGIONS	BITS_PER_LONG
-#define MIN_IOS		16
-#define MIN_BIOS	16
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -50,16 +48,17 @@ static struct kmem_cache *_dm_io_cache;
 struct dm_io_client *dm_io_client_create(void)
 {
 	struct dm_io_client *client;
+	unsigned min_ios = dm_get_reserved_bio_based_ios();
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5a4854f..a617fe3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -217,10 +217,21 @@ static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
 /*
+ * Bio-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
+
+/*
  * Request-based DM's mempools' reserved IOs set by the user.
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
+unsigned dm_get_reserved_bio_based_ios(void)
+{
+	return ACCESS_ONCE(reserved_bio_based_ios);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
+
 unsigned dm_get_reserved_rq_based_ios(void)
 {
 	return ACCESS_ONCE(reserved_rq_based_ios);
@@ -2874,7 +2885,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = RESERVED_BIO_BASED_IOS;
+		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
@@ -2937,6 +2948,9 @@ module_exit(dm_exit);
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
 
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1539650..1d1ad7b 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -184,6 +184,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
  * Helpers that are used by DM core
  */
+unsigned dm_get_reserved_bio_based_ios(void);
 unsigned dm_get_reserved_rq_based_ios(void);
 
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned
  2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
                                     ` (2 preceding siblings ...)
  2013-09-13 18:59                   ` [PATCH v2 3/3] dm: add reserved_bio_based_ios " Mike Snitzer
@ 2013-09-13 19:22                   ` Mike Snitzer
  2013-09-13 20:30                     ` Mike Snitzer
  3 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 19:22 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

On Fri, Sep 13 2013 at  2:59pm -0400,
Mike Snitzer <snitzer@redhat.com> wrote:

> You can pull these changes from the 'devel' branch of:
> git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git
> 
> To browse, see:
> https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=devel
> 
> v2 changes:
> Simplified the implementation.  Dropped the peak_reserved_rq_based_ios
> tracking, we'll use the tracepoint patch Jun'ichi Nomura suggested:
> http://www.redhat.com/archives/dm-devel/2013-September/msg00048.html

This needs a v3 because the simplified code doesn't handle bounds
properly (previous version handled remapping 0 to defaults).  But we
also need a maximum value that we're willing to support.  So if the user
exceeds that value it is reset to the max supported.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned
  2013-09-13 19:22                   ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
@ 2013-09-13 20:30                     ` Mike Snitzer
  2013-09-13 21:08                       ` [PATCH v3 " Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 20:30 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel, Frank Mayhar

On Fri, Sep 13 2013 at  3:22pm -0400,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Fri, Sep 13 2013 at  2:59pm -0400,
> Mike Snitzer <snitzer@redhat.com> wrote:
> 
> > You can pull these changes from the 'devel' branch of:
> > git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git
> > 
> > To browse, see:
> > https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=devel
> > 
> > v2 changes:
> > Simplified the implementation.  Dropped the peak_reserved_rq_based_ios
> > tracking, we'll use the tracepoint patch Jun'ichi Nomura suggested:
> > http://www.redhat.com/archives/dm-devel/2013-September/msg00048.html
> 
> This needs a v3 because the simplified code doesn't handle bounds
> properly (previous version handled remapping 0 to defaults).  But we
> also need a maximum value that we're willing to support.  So if the user
> exceeds that value it is reset to the max supported.

Here is an incremental diff:

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a617fe3..033dc26 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -213,6 +213,7 @@ struct dm_md_mempools {
 
 #define RESERVED_BIO_BASED_IOS		16
 #define RESERVED_REQUEST_BASED_IOS	256
+#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
@@ -226,15 +227,36 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
+static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+				      unsigned def, unsigned max)
+{
+	unsigned ios = ACCESS_ONCE(*reserved_ios);
+	unsigned modified_ios = 0;
+
+	if (!ios)
+		modified_ios = def;
+	else if (ios > max)
+		modified_ios = max;
+
+	if (modified_ios) {
+		(void)cmpxchg(reserved_ios, ios, modified_ios);
+		ios = modified_ios;
+	}
+
+	return ios;
+}
+
 unsigned dm_get_reserved_bio_based_ios(void)
 {
-	return ACCESS_ONCE(reserved_bio_based_ios);
+	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 unsigned dm_get_reserved_rq_based_ios(void)
 {
-	return ACCESS_ONCE(reserved_rq_based_ios);
+	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v3 0/3] dm: allow mempool and bioset reserves to be tuned
  2013-09-13 20:30                     ` Mike Snitzer
@ 2013-09-13 21:08                       ` Mike Snitzer
  2013-09-13 21:08                         ` [PATCH v3 1/3] dm: lower bio-based mempool reservation Mike Snitzer
                                           ` (3 more replies)
  0 siblings, 4 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 21:08 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

You can pull these changes from the 'devel' branch of:
git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git

To browse, see:
https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=devel

v3 changes:
Added bounds checking (if 0 reset to default, if > 1024 reset to 1024)

v2 changes:
Simplified the implementation.  Dropped the peak_reserved_rq_based_ios
tracking, we'll use the tracepoint patch Jun'ichi Nomura suggested:
http://www.redhat.com/archives/dm-devel/2013-September/msg00048.html

Mike Snitzer (3):
  dm: lower bio-based mempool reservation
  dm: add reserved_rq_based_ios module parameter
  dm: add reserved_bio_based_ios module parameter

 drivers/md/dm-io.c    |    7 ++---
 drivers/md/dm-mpath.c |    6 ++--
 drivers/md/dm.c       |   60 +++++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/dm.h       |    3 ++
 4 files changed, 65 insertions(+), 11 deletions(-)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v3 1/3] dm: lower bio-based mempool reservation
  2013-09-13 21:08                       ` [PATCH v3 " Mike Snitzer
@ 2013-09-13 21:08                         ` Mike Snitzer
  2013-09-13 21:08                         ` [PATCH v3 2/3] dm: add reserved_rq_based_ios module parameter Mike Snitzer
                                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 21:08 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

Bio-based device mapper processing doesn't need larger mempools (like
request-based DM does), so lower the number of reserved entries for
bio-based operation.  16 was already used for bio-based DM's bioset
but mistakenly wasn't used for its _io_cache.

Formalize difference between bio-based and request-based defaults by
introducing RESERVED_BIO_BASED_IOS and RESERVED_REQUEST_BASED_IOS.

(based on older code from Mikulas Patocka)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
---
 drivers/md/dm.c |    9 +++++----
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6a5e9ed..47bac14 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -211,7 +211,8 @@ struct dm_md_mempools {
 	struct bio_set *bs;
 };
 
-#define MIN_IOS 256
+#define RESERVED_BIO_BASED_IOS		16
+#define RESERVED_REQUEST_BASED_IOS	256
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
@@ -2862,18 +2863,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = 16;
+		pool_size = RESERVED_BIO_BASED_IOS;
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = RESERVED_REQUEST_BASED_IOS;
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
 	if (!pools->io_pool)
 		goto out;
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v3 2/3] dm: add reserved_rq_based_ios module parameter
  2013-09-13 21:08                       ` [PATCH v3 " Mike Snitzer
  2013-09-13 21:08                         ` [PATCH v3 1/3] dm: lower bio-based mempool reservation Mike Snitzer
@ 2013-09-13 21:08                         ` Mike Snitzer
  2013-09-13 21:08                         ` [PATCH v3 3/3] dm: add reserved_bio_based_ios " Mike Snitzer
  2013-09-18 15:10                         ` [PATCH v3 0/3] dm: allow mempool and bioset reserves to be tuned Frank Mayhar
  3 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 21:08 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

Allow user to change the number of IOs that are reserved by
request-based DM's mempools by writing to this file:
/sys/module/dm_mod/parameters/reserved_rq_based_ios

The default value is RESERVED_REQUEST_BASED_IOS (256).  The maximum
allowed value is RESERVED_MAX_IOS (1024).

Export dm_get_reserved_rq_based_ios() for use by DM targets and core
code.  Switch to sizing dm-mpath's mempool using DM core's configurable
'reserved_rq_based_ios'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
---
 drivers/md/dm-mpath.c |    6 +++---
 drivers/md/dm.c       |   38 +++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.h       |    2 ++
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index b759a12..e0bed10 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -7,6 +7,7 @@
 
 #include <linux/device-mapper.h>
 
+#include "dm.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -116,8 +117,6 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-#define MIN_IOS 256	/* Mempool size */
-
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -190,6 +189,7 @@ static void free_priority_group(struct priority_group *pg,
 static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
+	unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
@@ -202,7 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 47bac14..93ba644 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -213,9 +213,41 @@ struct dm_md_mempools {
 
 #define RESERVED_BIO_BASED_IOS		16
 #define RESERVED_REQUEST_BASED_IOS	256
+#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+				      unsigned def, unsigned max)
+{
+	unsigned ios = ACCESS_ONCE(*reserved_ios);
+	unsigned modified_ios = 0;
+
+	if (!ios)
+		modified_ios = def;
+	else if (ios > max)
+		modified_ios = max;
+
+	if (modified_ios) {
+		(void)cmpxchg(reserved_ios, ios, modified_ios);
+		ios = modified_ios;
+	}
+
+	return ios;
+}
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -2867,7 +2899,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = RESERVED_REQUEST_BASED_IOS;
+		pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
@@ -2925,6 +2957,10 @@ module_exit(dm_exit);
 
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5e604cc..1539650 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -184,6 +184,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
  * Helpers that are used by DM core
  */
+unsigned dm_get_reserved_rq_based_ios(void);
+
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
 {
 	return !maxlen || strlen(result) + 1 >= maxlen;
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v3 3/3] dm: add reserved_bio_based_ios module parameter
  2013-09-13 21:08                       ` [PATCH v3 " Mike Snitzer
  2013-09-13 21:08                         ` [PATCH v3 1/3] dm: lower bio-based mempool reservation Mike Snitzer
  2013-09-13 21:08                         ` [PATCH v3 2/3] dm: add reserved_rq_based_ios module parameter Mike Snitzer
@ 2013-09-13 21:08                         ` Mike Snitzer
  2013-09-18 15:10                         ` [PATCH v3 0/3] dm: allow mempool and bioset reserves to be tuned Frank Mayhar
  3 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2013-09-13 21:08 UTC (permalink / raw)
  To: Frank Mayhar, Mikulas Patocka; +Cc: Jun'ichi Nomura, dm-devel

Allow user to change the number of IOs that are reserved by
bio-based DM's mempools by writing to this file:
/sys/module/dm_mod/parameters/reserved_bio_based_ios

The default value is RESERVED_BIO_BASED_IOS (16).  The maximum allowed
value is RESERVED_MAX_IOS (1024).

Export dm_get_reserved_bio_based_ios() for use by DM targets and core
code.  Switch to sizing dm-io's mempool and bioset using DM core's
configurable 'reserved_bio_based_ios'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-io.c |    7 +++----
 drivers/md/dm.c    |   17 ++++++++++++++++-
 drivers/md/dm.h    |    1 +
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..2a20986 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -19,8 +19,6 @@
 #define DM_MSG_PREFIX "io"
 
 #define DM_IO_MAX_REGIONS	BITS_PER_LONG
-#define MIN_IOS		16
-#define MIN_BIOS	16
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -50,16 +48,17 @@ static struct kmem_cache *_dm_io_cache;
 struct dm_io_client *dm_io_client_create(void)
 {
 	struct dm_io_client *client;
+	unsigned min_ios = dm_get_reserved_bio_based_ios();
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 93ba644..033dc26 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -218,6 +218,11 @@ static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
 /*
+ * Bio-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
+
+/*
  * Request-based DM's mempools' reserved IOs set by the user.
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
@@ -241,6 +246,13 @@ static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
 	return ios;
 }
 
+unsigned dm_get_reserved_bio_based_ios(void)
+{
+	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
+
 unsigned dm_get_reserved_rq_based_ios(void)
 {
 	return __dm_get_reserved_ios(&reserved_rq_based_ios,
@@ -2895,7 +2907,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = RESERVED_BIO_BASED_IOS;
+		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
@@ -2958,6 +2970,9 @@ module_exit(dm_exit);
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
 
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1539650..1d1ad7b 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -184,6 +184,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
  * Helpers that are used by DM core
  */
+unsigned dm_get_reserved_bio_based_ios(void);
 unsigned dm_get_reserved_rq_based_ios(void);
 
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v3 0/3] dm: allow mempool and bioset reserves to be tuned
  2013-09-13 21:08                       ` [PATCH v3 " Mike Snitzer
                                           ` (2 preceding siblings ...)
  2013-09-13 21:08                         ` [PATCH v3 3/3] dm: add reserved_bio_based_ios " Mike Snitzer
@ 2013-09-18 15:10                         ` Frank Mayhar
  3 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-09-18 15:10 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: Jun'ichi Nomura, dm-devel, Mikulas Patocka

On Fri, 2013-09-13 at 17:08 -0400, Mike Snitzer wrote:
> You can pull these changes from the 'devel' branch of:
> git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git
> 
> To browse, see:
> https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=devel
> 
> v3 changes:
> Added bounds checking (if 0 reset to default, if > 1024 reset to 1024)
> 
> v2 changes:
> Simplified the implementation.  Dropped the peak_reserved_rq_based_ios
> tracking, we'll use the tracepoint patch Jun'ichi Nomura suggested:
> http://www.redhat.com/archives/dm-devel/2013-September/msg00048.html

Very nice, thanks for doing this.  It also makes these parameters a lot
more clear about what they're for, which is a feature.  "MIN_IOS" was
kind of opaque without actually digging into the code.

Feel free to add

Signed-off-by: Frank Mayhar <fmayhar@google.com>

if you care to.
-- 
Frank Mayhar
310-460-4042

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 3/7] dm: add reserved_bio_based_ios module parameter
  2013-09-12 23:17                       ` Mikulas Patocka
@ 2013-09-18 15:17                         ` Frank Mayhar
  0 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-09-18 15:17 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer

On Thu, 2013-09-12 at 19:17 -0400, Mikulas Patocka wrote:
> 
> On Thu, 12 Sep 2013, Mike Snitzer wrote:
> 
> > On Thu, Sep 12 2013 at  6:47pm -0400,
> > Mikulas Patocka <mpatocka@redhat.com> wrote:
> > 
> > > We don't need this. bio-based i/o should be fine with a small mempool, 
> > > there is no need to make it tunable.
> > 
> > I'd like to get Frank's insight here.  He clearly had an interest in
> > tuning bio-based also.  While 16 shouldn't really hurt it could still be
> > artifically high.  I'm not opposed to exposing a sane default but
> > allowing other to experiemnt (in production workloads) with smaller
> > values that still enable forward progress should memory get exhausted.
> > 
> > Mike
> 
> I would do it this way: if Frank gets a measurable improvement in memory 
> consumption when the values is dropped from 16 to a lower number (4 or 
> maybe 1), then I would drop the value by default (don't make it tunable, 
> drop it for all users).
> 
> If there is no improvement when the value is lowered, I'd leave it as it 
> is, on 16.

I haven't had time to look at this lately, but I did see a small but
measurable reduction in memory consumption when I was dealing with this
before.  The real problem for us is the fact that we'll have tons of
these devices so even a small memory reduction means a lot when
multiplied by, say, a hundred devices across a thousand machines.  And
that's a _small_ number.

That said, I wouldn't want to see all Linux installs limited by our
specific case, which is why I would vote for a tunable rather than a
fixed, hardcoded value.
-- 
Frank Mayhar
310-460-4042

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
@ 2013-08-16 22:55 Frank Mayhar
  0 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-16 22:55 UTC (permalink / raw)
  To: linux-kernel

The device mapper and some of its modules allocate memory pools at
various points when setting up a device.  In some cases, these pools are
fairly large, for example the multipath module allocates a 256-entry
pool and the dm itself allocates three of that size.  In a
memory-constrained environment where we're creating a lot of these
devices, the memory use can quickly become significant.  Unfortunately,
there's currently no way to change the size of the pools other than by
changing a constant and rebuilding the kernel.

This patch fixes that by changing the hardcoded MIN_IOS (and certain
other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
sysctl-modifiable values.  This lets us change the size of these pools
on the fly, we can reduce the size of the pools and reduce memory
pressure.

We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
and dm-mpath, from a value of 32 all the way down to zero.  Bearing in
mind that the underlying devices were network-based, we saw essentially
no performance degradation; if there was any, it was down in the noise.
One might wonder why these sizes are the way they are; I investigated
and they've been unchanged since at least 2006.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
--- 
 drivers/md/dm-crypt.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/dm-io.c    | 58 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm-snap.c  | 45 +++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c       | 41 ++++++++++++++++++++++++++++++---
 5 files changed, 244 insertions(+), 11 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41a..d68f10a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -178,12 +178,55 @@ struct crypt_config {
 #define MIN_IOS        16
 #define MIN_POOL_PAGES 32
 
+static int sysctl_dm_crypt_min_ios = MIN_IOS;
+static int sysctl_dm_crypt_min_pool_pages = MIN_POOL_PAGES;
+
 static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
+static struct ctl_table_header *dm_crypt_table_header;
+static ctl_table dm_crypt_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_crypt_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_pool_pages",
+		.data		= &sysctl_dm_crypt_min_pool_pages,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_crypt_dir_table[] = {
+	{ .procname	= "crypt",
+	  .mode		= 0555,
+	  .child	= dm_crypt_ctl_table },
+	{ }
+};
+
+static ctl_table dm_crypt_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_crypt_dir_table },
+	{ }
+};
+
+static ctl_table dm_crypt_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_crypt_dm_dir_table },
+	{ }
+};
+
 static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
 {
 	return this_cpu_ptr(cc->cpu);
@@ -1571,7 +1614,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 
 	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
+	cc->io_pool = mempool_create_slab_pool(sysctl_dm_crypt_min_ios,
+					       _crypt_io_pool);
 	if (!cc->io_pool) {
 		ti->error = "Cannot allocate crypt io mempool";
 		goto bad;
@@ -1583,20 +1627,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
-	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
+	cc->req_pool = mempool_create_kmalloc_pool(sysctl_dm_crypt_min_ios,
+			cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + cc->iv_size);
 	if (!cc->req_pool) {
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad;
 	}
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(sysctl_dm_crypt_min_pool_pages,
+						 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0);
+	cc->bs = bioset_create(sysctl_dm_crypt_min_ios, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
@@ -1851,11 +1897,20 @@ static int __init dm_crypt_init(void)
 		kmem_cache_destroy(_crypt_io_pool);
 	}
 
+	dm_crypt_table_header = register_sysctl_table(dm_crypt_root_table);
+	if (!dm_crypt_table_header) {
+		DMERR("failed to register sysctl");
+		dm_unregister_target(&crypt_target);
+		kmem_cache_destroy(_crypt_io_pool);
+		return -ENOMEM;
+	}
+
 	return r;
 }
 
 static void __exit dm_crypt_exit(void)
 {
+	unregister_sysctl_table(dm_crypt_table_header);
 	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..8c45c60 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,9 @@
 #define MIN_IOS		16
 #define MIN_BIOS	16
 
+static int sysctl_dm_io_min_ios = MIN_IOS;
+static int sysctl_dm_io_min_bios = MIN_BIOS;
+
 struct dm_io_client {
 	mempool_t *pool;
 	struct bio_set *bios;
@@ -44,6 +47,46 @@ struct io {
 
 static struct kmem_cache *_dm_io_cache;
 
+static struct ctl_table_header *dm_io_table_header;
+static ctl_table dm_io_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_io_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_bios",
+		.data		= &sysctl_dm_io_min_bios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_io_dir_table[] = {
+	{ .procname	= "io",
+	  .mode		= 0555,
+	  .child	= dm_io_ctl_table },
+	{ }
+};
+
+static ctl_table dm_io_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_io_dir_table },
+	{ }
+};
+
+static ctl_table dm_io_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_io_dm_dir_table },
+	{ }
+};
+
 /*
  * Create a client with mempool and bioset.
  */
@@ -55,11 +98,12 @@ struct dm_io_client *dm_io_client_create(void)
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(sysctl_dm_io_min_ios,
+						_dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(sysctl_dm_io_min_bios, 0);
 	if (!client->bios)
 		goto bad;
 
@@ -515,11 +559,21 @@ int __init dm_io_init(void)
 	if (!_dm_io_cache)
 		return -ENOMEM;
 
+	dm_io_table_header = register_sysctl_table(dm_io_root_table);
+	if (!dm_io_table_header) {
+		DMERR("failed to register sysctl");
+		kmem_cache_destroy(_dm_io_cache);
+		_dm_io_cache = NULL;
+		return -ENOMEM;
+	}
+
+
 	return 0;
 }
 
 void dm_io_exit(void)
 {
+	unregister_sysctl_table(dm_io_table_header);
 	kmem_cache_destroy(_dm_io_cache);
 	_dm_io_cache = NULL;
 }
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5adede1..099af1b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -118,6 +118,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);
 
 #define MIN_IOS 256	/* Mempool size */
 
+static int sysctl_dm_mpath_min_ios = MIN_IOS;
+
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -125,6 +127,38 @@ static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
 
+static struct ctl_table_header *dm_mpath_table_header;
+static ctl_table dm_mpath_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_mpath_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_mpath_dir_table[] = {
+	{ .procname	= "mpath",
+	  .mode		= 0555,
+	  .child	= dm_mpath_ctl_table },
+	{ }
+};
+
+static ctl_table dm_mpath_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_mpath_dir_table },
+	{ }
+};
+
+static ctl_table dm_mpath_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_mpath_dm_dir_table },
+	{ }
+};
 
 /*-----------------------------------------------
  * Allocation routines
@@ -202,7 +236,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(sysctl_dm_mpath_min_ios,
+							_mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
@@ -1745,6 +1780,15 @@ static int __init dm_multipath_init(void)
 		kmem_cache_destroy(_mpio_cache);
 		return -ENOMEM;
 	}
+	dm_mpath_table_header = register_sysctl_table(dm_mpath_root_table);
+	if (!dm_mpath_table_header) {
+		DMERR("failed to register sysctl");
+		destroy_workqueue(kmpath_handlerd);
+		destroy_workqueue(kmultipathd);
+		dm_unregister_target(&multipath_target);
+		kmem_cache_destroy(_mpio_cache);
+		return -ENOMEM;
+	}
 
 	DMINFO("version %u.%u.%u loaded",
 	       multipath_target.version[0], multipath_target.version[1],
@@ -1755,6 +1799,8 @@ static int __init dm_multipath_init(void)
 
 static void __exit dm_multipath_exit(void)
 {
+	unregister_sysctl_table(dm_mpath_table_header);
+
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5a..723b3c2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -33,11 +33,45 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
  * The size of the mempool used to track chunks in use.
  */
 #define MIN_IOS 256
+static int sysctl_dm_snap_min_ios = MIN_IOS;
 
 #define DM_TRACKED_CHUNK_HASH_SIZE	16
 #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
 					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
 
+static struct ctl_table_header *dm_snap_table_header;
+static ctl_table dm_snap_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_snap_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_snap_dir_table[] = {
+	{ .procname	= "snap",
+	  .mode		= 0555,
+	  .child	= dm_snap_ctl_table },
+	{ }
+};
+
+static ctl_table dm_snap_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_snap_dir_table },
+	{ }
+};
+
+static ctl_table dm_snap_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_snap_dm_dir_table },
+	{ }
+};
+
 struct dm_exception_table {
 	uint32_t hash_mask;
 	unsigned hash_shift;
@@ -1118,7 +1152,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_kcopyd;
 	}
 
-	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
+	s->pending_pool = mempool_create_slab_pool(sysctl_dm_snap_min_ios,
+											   pending_cache);
 	if (!s->pending_pool) {
 		ti->error = "Could not allocate mempool for pending exceptions";
 		r = -ENOMEM;
@@ -2268,8 +2303,14 @@ static int __init dm_snapshot_init(void)
 		goto bad_pending_cache;
 	}
 
+	dm_snap_table_header = register_sysctl_table(dm_snap_root_table);
+	if (!dm_snap_table_header)
+		goto bad_sysctl_table;
+
 	return 0;
 
+bad_sysctl_table:
+	kmem_cache_destroy(pending_cache);
 bad_pending_cache:
 	kmem_cache_destroy(exception_cache);
 bad_exception_cache:
@@ -2288,6 +2329,8 @@ bad_register_snapshot_target:
 
 static void __exit dm_snapshot_exit(void)
 {
+	unregister_sysctl_table(dm_snap_table_header);
+
 	dm_unregister_target(&snapshot_target);
 	dm_unregister_target(&origin_target);
 	dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9e39d2b..fdff32f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
+#include <linux/sysctl.h>
 
 #include <trace/events/block.h>
 
@@ -209,9 +210,36 @@ struct dm_md_mempools {
 };
 
 #define MIN_IOS 256
+static int sysctl_dm_min_ios = MIN_IOS;
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+static struct ctl_table_header *dm_table_header;
+static ctl_table dm_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_ctl_table },
+	{ }
+};
+
+static ctl_table dm_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_dir_table },
+	{ }
+};
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -229,16 +257,22 @@ static int __init local_init(void)
 	if (r)
 		goto out_free_rq_tio_cache;
 
+	dm_table_header = register_sysctl_table(dm_root_table);
+	if (!dm_table_header)
+		goto out_uevent_exit;
+
 	_major = major;
 	r = register_blkdev(_major, _name);
 	if (r < 0)
-		goto out_uevent_exit;
+		goto out_unregister_sysctl_table;
 
 	if (!_major)
 		_major = r;
 
 	return 0;
 
+out_unregister_sysctl_table:
+	unregister_sysctl_table(dm_table_header);
 out_uevent_exit:
 	dm_uevent_exit();
 out_free_rq_tio_cache:
@@ -251,6 +285,7 @@ out_free_io_cache:
 
 static void local_exit(void)
 {
+	unregister_sysctl_table(dm_table_header);
 	kmem_cache_destroy(_rq_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
@@ -2806,14 +2841,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = sysctl_dm_min_ios;
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(sysctl_dm_min_ios, cachep);
 	if (!pools->io_pool)
 		goto out;
 

-- 
Frank Mayhar
310-460-4042

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.
@ 2013-08-16 20:51 Frank Mayhar
  0 siblings, 0 replies; 63+ messages in thread
From: Frank Mayhar @ 2013-08-16 20:51 UTC (permalink / raw)
  To: linux-kernel

The device mapper and some of its modules allocate memory pools at
various points when setting up a device.  In some cases, these pools are
fairly large, for example the multipath module allocates a 256-entry
pool and the dm itself allocates three of that size.  In a
memory-constrained environment where we're creating a lot of these
devices, the memory use can quickly become significant.  Unfortunately,
there's currently no way to change the size of the pools other than by
changing a constant and rebuilding the kernel.

This patch fixes that by changing the hardcoded MIN_IOS (and certain
other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
sysctl-modifiable values.  This lets us change the size of these pools
on the fly, so we can reduce the size of the pools and reduce memory
pressure.

We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
and dm-mpath, from a value of 32 all the way down to zero.  Bearing in
mind that the underlying devices were network-based, we saw essentially
no performance degradation; if there was any, it was down in the noise.
One might wonder why these sizes are the way they are; I investigated
and they've been unchanged since at least 2006.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
--- 
 drivers/md/dm-crypt.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/dm-io.c    | 58 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm-snap.c  | 45 +++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c       | 41 ++++++++++++++++++++++++++++++---
 5 files changed, 244 insertions(+), 11 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41a..d68f10a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -178,12 +178,55 @@ struct crypt_config {
 #define MIN_IOS        16
 #define MIN_POOL_PAGES 32
 
+static int sysctl_dm_crypt_min_ios = MIN_IOS;
+static int sysctl_dm_crypt_min_pool_pages = MIN_POOL_PAGES;
+
 static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
+static struct ctl_table_header *dm_crypt_table_header;
+static ctl_table dm_crypt_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_crypt_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_pool_pages",
+		.data		= &sysctl_dm_crypt_min_pool_pages,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_crypt_dir_table[] = {
+	{ .procname	= "crypt",
+	  .mode		= 0555,
+	  .child	= dm_crypt_ctl_table },
+	{ }
+};
+
+static ctl_table dm_crypt_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_crypt_dir_table },
+	{ }
+};
+
+static ctl_table dm_crypt_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_crypt_dm_dir_table },
+	{ }
+};
+
 static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
 {
 	return this_cpu_ptr(cc->cpu);
@@ -1571,7 +1614,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 
 	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
+	cc->io_pool = mempool_create_slab_pool(sysctl_dm_crypt_min_ios,
+					       _crypt_io_pool);
 	if (!cc->io_pool) {
 		ti->error = "Cannot allocate crypt io mempool";
 		goto bad;
@@ -1583,20 +1627,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
-	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
+	cc->req_pool = mempool_create_kmalloc_pool(sysctl_dm_crypt_min_ios,
+			cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + cc->iv_size);
 	if (!cc->req_pool) {
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad;
 	}
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(sysctl_dm_crypt_min_pool_pages,
+						 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0);
+	cc->bs = bioset_create(sysctl_dm_crypt_min_ios, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
@@ -1851,11 +1897,20 @@ static int __init dm_crypt_init(void)
 		kmem_cache_destroy(_crypt_io_pool);
 	}
 
+	dm_crypt_table_header = register_sysctl_table(dm_crypt_root_table);
+	if (!dm_crypt_table_header) {
+		DMERR("failed to register sysctl");
+		dm_unregister_target(&crypt_target);
+		kmem_cache_destroy(_crypt_io_pool);
+		return -ENOMEM;
+	}
+
 	return r;
 }
 
 static void __exit dm_crypt_exit(void)
 {
+	unregister_sysctl_table(dm_crypt_table_header);
 	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..8c45c60 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,9 @@
 #define MIN_IOS		16
 #define MIN_BIOS	16
 
+static int sysctl_dm_io_min_ios = MIN_IOS;
+static int sysctl_dm_io_min_bios = MIN_BIOS;
+
 struct dm_io_client {
 	mempool_t *pool;
 	struct bio_set *bios;
@@ -44,6 +47,46 @@ struct io {
 
 static struct kmem_cache *_dm_io_cache;
 
+static struct ctl_table_header *dm_io_table_header;
+static ctl_table dm_io_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_io_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_bios",
+		.data		= &sysctl_dm_io_min_bios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_io_dir_table[] = {
+	{ .procname	= "io",
+	  .mode		= 0555,
+	  .child	= dm_io_ctl_table },
+	{ }
+};
+
+static ctl_table dm_io_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_io_dir_table },
+	{ }
+};
+
+static ctl_table dm_io_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_io_dm_dir_table },
+	{ }
+};
+
 /*
  * Create a client with mempool and bioset.
  */
@@ -55,11 +98,12 @@ struct dm_io_client *dm_io_client_create(void)
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(sysctl_dm_io_min_ios,
+						_dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(sysctl_dm_io_min_bios, 0);
 	if (!client->bios)
 		goto bad;
 
@@ -515,11 +559,21 @@ int __init dm_io_init(void)
 	if (!_dm_io_cache)
 		return -ENOMEM;
 
+	dm_io_table_header = register_sysctl_table(dm_io_root_table);
+	if (!dm_io_table_header) {
+		DMERR("failed to register sysctl");
+		kmem_cache_destroy(_dm_io_cache);
+		_dm_io_cache = NULL;
+		return -ENOMEM;
+	}
+
+
 	return 0;
 }
 
 void dm_io_exit(void)
 {
+	unregister_sysctl_table(dm_io_table_header);
 	kmem_cache_destroy(_dm_io_cache);
 	_dm_io_cache = NULL;
 }
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5adede1..099af1b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -118,6 +118,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);
 
 #define MIN_IOS 256	/* Mempool size */
 
+static int sysctl_dm_mpath_min_ios = MIN_IOS;
+
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -125,6 +127,38 @@ static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
 
+static struct ctl_table_header *dm_mpath_table_header;
+static ctl_table dm_mpath_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_mpath_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_mpath_dir_table[] = {
+	{ .procname	= "mpath",
+	  .mode		= 0555,
+	  .child	= dm_mpath_ctl_table },
+	{ }
+};
+
+static ctl_table dm_mpath_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_mpath_dir_table },
+	{ }
+};
+
+static ctl_table dm_mpath_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_mpath_dm_dir_table },
+	{ }
+};
 
 /*-----------------------------------------------
  * Allocation routines
@@ -202,7 +236,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(sysctl_dm_mpath_min_ios,
+							_mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
@@ -1745,6 +1780,15 @@ static int __init dm_multipath_init(void)
 		kmem_cache_destroy(_mpio_cache);
 		return -ENOMEM;
 	}
+	dm_mpath_table_header = register_sysctl_table(dm_mpath_root_table);
+	if (!dm_mpath_table_header) {
+		DMERR("failed to register sysctl");
+		destroy_workqueue(kmpath_handlerd);
+		destroy_workqueue(kmultipathd);
+		dm_unregister_target(&multipath_target);
+		kmem_cache_destroy(_mpio_cache);
+		return -ENOMEM;
+	}
 
 	DMINFO("version %u.%u.%u loaded",
 	       multipath_target.version[0], multipath_target.version[1],
@@ -1755,6 +1799,8 @@ static int __init dm_multipath_init(void)
 
 static void __exit dm_multipath_exit(void)
 {
+	unregister_sysctl_table(dm_mpath_table_header);
+
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5a..723b3c2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -33,11 +33,45 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
  * The size of the mempool used to track chunks in use.
  */
 #define MIN_IOS 256
+static int sysctl_dm_snap_min_ios = MIN_IOS;
 
 #define DM_TRACKED_CHUNK_HASH_SIZE	16
 #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
 					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
 
+static struct ctl_table_header *dm_snap_table_header;
+static ctl_table dm_snap_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_snap_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_snap_dir_table[] = {
+	{ .procname	= "snap",
+	  .mode		= 0555,
+	  .child	= dm_snap_ctl_table },
+	{ }
+};
+
+static ctl_table dm_snap_dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_snap_dir_table },
+	{ }
+};
+
+static ctl_table dm_snap_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_snap_dm_dir_table },
+	{ }
+};
+
 struct dm_exception_table {
 	uint32_t hash_mask;
 	unsigned hash_shift;
@@ -1118,7 +1152,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_kcopyd;
 	}
 
-	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
+	s->pending_pool = mempool_create_slab_pool(sysctl_dm_snap_min_ios,
+											   pending_cache);
 	if (!s->pending_pool) {
 		ti->error = "Could not allocate mempool for pending exceptions";
 		r = -ENOMEM;
@@ -2268,8 +2303,14 @@ static int __init dm_snapshot_init(void)
 		goto bad_pending_cache;
 	}
 
+	dm_snap_table_header = register_sysctl_table(dm_snap_root_table);
+	if (!dm_snap_table_header)
+		goto bad_sysctl_table;
+
 	return 0;
 
+bad_sysctl_table:
+	kmem_cache_destroy(pending_cache);
 bad_pending_cache:
 	kmem_cache_destroy(exception_cache);
 bad_exception_cache:
@@ -2288,6 +2329,8 @@ bad_register_snapshot_target:
 
 static void __exit dm_snapshot_exit(void)
 {
+	unregister_sysctl_table(dm_snap_table_header);
+
 	dm_unregister_target(&snapshot_target);
 	dm_unregister_target(&origin_target);
 	dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9e39d2b..fdff32f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
+#include <linux/sysctl.h>
 
 #include <trace/events/block.h>
 
@@ -209,9 +210,36 @@ struct dm_md_mempools {
 };
 
 #define MIN_IOS 256
+static int sysctl_dm_min_ios = MIN_IOS;
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+static struct ctl_table_header *dm_table_header;
+static ctl_table dm_ctl_table[] = {
+	{
+		.procname	= "min_ios",
+		.data		= &sysctl_dm_min_ios,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO|S_IWUSR,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+static ctl_table dm_dir_table[] = {
+	{ .procname	= "dm",
+	  .mode		= 0555,
+	  .child	= dm_ctl_table },
+	{ }
+};
+
+static ctl_table dm_root_table[] = {
+	{ .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= dm_dir_table },
+	{ }
+};
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -229,16 +257,22 @@ static int __init local_init(void)
 	if (r)
 		goto out_free_rq_tio_cache;
 
+	dm_table_header = register_sysctl_table(dm_root_table);
+	if (!dm_table_header)
+		goto out_uevent_exit;
+
 	_major = major;
 	r = register_blkdev(_major, _name);
 	if (r < 0)
-		goto out_uevent_exit;
+		goto out_unregister_sysctl_table;
 
 	if (!_major)
 		_major = r;
 
 	return 0;
 
+out_unregister_sysctl_table:
+	unregister_sysctl_table(dm_table_header);
 out_uevent_exit:
 	dm_uevent_exit();
 out_free_rq_tio_cache:
@@ -251,6 +285,7 @@ out_free_io_cache:
 
 static void local_exit(void)
 {
+	unregister_sysctl_table(dm_table_header);
 	kmem_cache_destroy(_rq_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
@@ -2806,14 +2841,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = sysctl_dm_min_ios;
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(sysctl_dm_min_ios, cachep);
 	if (!pools->io_pool)
 		goto out;
 

-- 
Frank Mayhar
310-460-4042

^ permalink raw reply related	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2013-09-18 15:17 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-09 17:48 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
2013-08-13 16:41 ` Frank Mayhar
2013-08-16 22:58 ` Frank Mayhar
2013-08-17 12:30 ` [dm-devel] " Alasdair G Kergon
2013-08-19 13:40   ` Mike Snitzer
2013-08-19 15:04     ` Frank Mayhar
2013-08-19 14:00 ` Mike Snitzer
2013-08-19 14:00   ` Mike Snitzer
2013-08-19 17:54   ` [dm-devel] " Frank Mayhar
2013-08-19 18:15     ` Mike Snitzer
2013-08-20 21:44     ` [dm-devel] " Mikulas Patocka
2013-08-20 21:52       ` Frank Mayhar
2013-08-20 21:41   ` Mikulas Patocka
2013-08-20 21:22 ` [dm-devel] [PATCH] " Mikulas Patocka
2013-08-20 21:28   ` Frank Mayhar
2013-08-20 21:47     ` Mikulas Patocka
2013-08-20 21:57       ` Frank Mayhar
2013-08-20 22:24         ` Mike Snitzer
2013-08-20 22:52           ` Mikulas Patocka
2013-08-20 23:14           ` Frank Mayhar
2013-08-22 17:26             ` Frank Mayhar
2013-08-26 14:28             ` Mikulas Patocka
2013-09-12 22:24               ` [PATCH 0/7] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
2013-09-12 22:24                 ` [PATCH 1/7] dm: lower bio-based mempool reservation Mike Snitzer
2013-09-12 22:40                   ` Mikulas Patocka
2013-09-12 22:24                 ` [PATCH 2/7] dm: add reserved_rq_based_ios module parameter Mike Snitzer
2013-09-12 22:45                   ` Mikulas Patocka
2013-09-12 23:15                     ` Mike Snitzer
2013-09-12 23:27                       ` Mikulas Patocka
2013-09-12 23:32                         ` Mike Snitzer
2013-09-12 22:24                 ` [PATCH 3/7] dm: add reserved_bio_based_ios " Mike Snitzer
2013-09-12 22:47                   ` Mikulas Patocka
2013-09-12 23:11                     ` Mike Snitzer
2013-09-12 23:17                       ` Mikulas Patocka
2013-09-18 15:17                         ` Frank Mayhar
2013-09-12 22:24                 ` [PATCH 4/7] dm io: use dm_get_reserved_bio_based_ios to size reserves Mike Snitzer
2013-09-12 22:48                   ` Mikulas Patocka
2013-09-12 22:24                 ` [PATCH 5/7] dm mpath: use dm_get_reserved_rq_based_ios to size mempool Mike Snitzer
2013-09-12 22:48                   ` Mikulas Patocka
2013-09-12 22:24                 ` [PATCH 6/7] dm: track the maximum number of bios in a cloned request Mike Snitzer
2013-09-12 22:55                   ` Mikulas Patocka
2013-09-12 23:09                     ` Mike Snitzer
2013-09-12 22:24                 ` [PATCH 7/7] dm: optimize clone_rq() when track_peak_rq_based_ios is disabled Mike Snitzer
2013-09-12 23:00                   ` Mikulas Patocka
2013-09-12 23:06                     ` Mike Snitzer
2013-09-12 23:30                       ` Mikulas Patocka
2013-09-12 23:53                         ` Mike Snitzer
2013-09-13  4:46                           ` Jun'ichi Nomura
2013-09-13 13:04                             ` Mike Snitzer
2013-09-13 14:34                             ` Mikulas Patocka
2013-09-13 18:59                 ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
2013-09-13 18:59                   ` [PATCH v2 1/3] dm: lower bio-based mempool reservation Mike Snitzer
2013-09-13 18:59                   ` [PATCH v2 2/3] dm: add reserved_rq_based_ios module parameter Mike Snitzer
2013-09-13 18:59                   ` [PATCH v2 3/3] dm: add reserved_bio_based_ios " Mike Snitzer
2013-09-13 19:22                   ` [PATCH v2 0/3] dm: allow mempool and bioset reserves to be tuned Mike Snitzer
2013-09-13 20:30                     ` Mike Snitzer
2013-09-13 21:08                       ` [PATCH v3 " Mike Snitzer
2013-09-13 21:08                         ` [PATCH v3 1/3] dm: lower bio-based mempool reservation Mike Snitzer
2013-09-13 21:08                         ` [PATCH v3 2/3] dm: add reserved_rq_based_ios module parameter Mike Snitzer
2013-09-13 21:08                         ` [PATCH v3 3/3] dm: add reserved_bio_based_ios " Mike Snitzer
2013-09-18 15:10                         ` [PATCH v3 0/3] dm: allow mempool and bioset reserves to be tuned Frank Mayhar
2013-08-16 20:51 [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl Frank Mayhar
2013-08-16 22:55 Frank Mayhar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.