linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/3] bdi patches
@ 2007-11-29 11:10 Miklos Szeredi
  2007-11-29 11:14 ` [patch 2/3] mm: bdi: allow to set a minimum to the bdi dirty limit Miklos Szeredi
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Miklos Szeredi @ 2007-11-29 11:10 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: fengguang.wu, linux-kernel

> http://programming.kicks-ass.net/kernel-patches/foo/
> 
> bdi-task-dirty.patch
> bdi-sysfs.patch
> bdi-min.patch
> bdi-max.patch
> 
> 
> Is my current rather experimental stack, I just wrote the max part after
> having slept on it. I'm not fond of the multiplication there, but I
> don't see a way around it.
> 
> Compile tested only.

I've done some testing on these patches and did some changes. So here
they go.

Thanks,
Miklos

---------
Subject: mm: sysfs: expose the BDI object in sysfs

Provide a place in sysfs for the backing_dev_info object.
This allows us to see and set the various BDI specific variables.

In particular this properly exposes the read-ahead window for all
relevant users and /sys/block/<block>/queue/read_ahead_kb should be
deprecated.

With patient help from Kay Sievers and Greg KH

[mszeredi@suse.cz]
Fix fuse: move allocation of the connection ID before the bdi_init_fmt() call.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---

Index: linux/block/genhd.c
===================================================================
--- linux.orig/block/genhd.c	2007-11-22 14:13:20.000000000 +0100
+++ linux/block/genhd.c	2007-11-26 17:24:19.000000000 +0100
@@ -183,6 +183,8 @@ void add_disk(struct gendisk *disk)
 			    disk->minors, NULL, exact_match, exact_lock, disk);
 	register_disk(disk);
 	blk_register_queue(disk);
+	bdi_register(&disk->queue->backing_dev_info, NULL,
+		"blk-%s", disk->disk_name);
 }
 
 EXPORT_SYMBOL(add_disk);
@@ -191,6 +193,7 @@ EXPORT_SYMBOL(del_gendisk);	/* in partit
 void unlink_gendisk(struct gendisk *disk)
 {
 	blk_unregister_queue(disk);
+	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_region(MKDEV(disk->major, disk->first_minor),
 			      disk->minors);
 }
Index: linux/fs/fuse/inode.c
===================================================================
--- linux.orig/fs/fuse/inode.c	2007-11-22 14:13:21.000000000 +0100
+++ linux/fs/fuse/inode.c	2007-11-26 17:24:19.000000000 +0100
@@ -460,6 +460,12 @@ static int fuse_show_options(struct seq_
 	return 0;
 }
 
+static u64 conn_id(void)
+{
+	static u64 ctr = 1;
+	return ctr++;
+}
+
 static struct fuse_conn *new_conn(void)
 {
 	struct fuse_conn *fc;
@@ -480,7 +486,11 @@ static struct fuse_conn *new_conn(void)
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
-		err = bdi_init(&fc->bdi);
+		mutex_lock(&fuse_mutex);
+		fc->id = conn_id();
+		mutex_unlock(&fuse_mutex);
+		err = bdi_init_fmt(&fc->bdi, NULL,
+				"fuse-%llu", (unsigned long long)fc->id);
 		if (err) {
 			kfree(fc);
 			fc = NULL;
@@ -590,12 +600,6 @@ static void fuse_send_init(struct fuse_c
 	request_send_background(fc, req);
 }
 
-static u64 conn_id(void)
-{
-	static u64 ctr = 1;
-	return ctr++;
-}
-
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct fuse_conn *fc;
@@ -675,7 +679,6 @@ static int fuse_fill_super(struct super_
 	if (file->private_data)
 		goto err_unlock;
 
-	fc->id = conn_id();
 	err = fuse_ctl_add_conn(fc);
 	if (err)
 		goto err_unlock;
Index: linux/fs/nfs/client.c
===================================================================
--- linux.orig/fs/nfs/client.c	2007-11-22 14:13:21.000000000 +0100
+++ linux/fs/nfs/client.c	2007-11-26 17:24:19.000000000 +0100
@@ -658,7 +658,8 @@ static void nfs_server_set_fsinfo(struct
 /*
  * Probe filesystem information, including the FSID on v2/v3
  */
-static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh,
+		struct nfs_fattr *fattr, const char *dev_name)
 {
 	struct nfs_fsinfo fsinfo;
 	struct nfs_client *clp = server->nfs_client;
@@ -679,7 +680,15 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, &fsinfo);
+#if 0
+	/*
+	 * sysfs is broken in that it doesn't allow names > 20 chars
+	 * and this format will easily generate in excess of that.
+	 */
+	error = bdi_init_fmt(&server->backing_dev_info, NULL, "nfs-%s", dev_name);
+#else
 	error = bdi_init(&server->backing_dev_info);
+#endif
 	if (error)
 		goto out_error;
 
@@ -776,7 +785,7 @@ void nfs_free_server(struct nfs_server *
  * - keyed on server and FSID
  */
 struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
-				     struct nfs_fh *mntfh)
+				     struct nfs_fh *mntfh, const char *dev_name)
 {
 	struct nfs_server *server;
 	struct nfs_fattr fattr;
@@ -796,7 +805,7 @@ struct nfs_server *nfs_create_server(con
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
 	/* Probe the root fh to retrieve its FSID */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, &fattr, dev_name);
 	if (error < 0)
 		goto error;
 	if (server->nfs_client->rpc_ops->version == 3) {
@@ -953,7 +962,7 @@ static int nfs4_init_server(struct nfs_s
  * - keyed on server and FSID
  */
 struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
-				      struct nfs_fh *mntfh)
+				      struct nfs_fh *mntfh, const char *dev_name)
 {
 	struct nfs_fattr fattr;
 	struct nfs_server *server;
@@ -995,7 +1004,7 @@ struct nfs_server *nfs4_create_server(co
 		(unsigned long long) server->fsid.minor);
 	dprintk("Mount FH: %d\n", mntfh->size);
 
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, &fattr, dev_name);
 	if (error < 0)
 		goto error;
 
@@ -1025,7 +1034,8 @@ error:
  * Create an NFS4 referral server record
  */
 struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
-					       struct nfs_fh *mntfh)
+					       struct nfs_fh *mntfh,
+					       const char *dev_name)
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
@@ -1070,7 +1080,7 @@ struct nfs_server *nfs4_create_referral_
 		goto error;
 
 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, &fattr, dev_name);
 	if (error < 0)
 		goto error;
 
@@ -1104,7 +1114,8 @@ error:
  */
 struct nfs_server *nfs_clone_server(struct nfs_server *source,
 				    struct nfs_fh *fh,
-				    struct nfs_fattr *fattr)
+				    struct nfs_fattr *fattr,
+				    const char *dev_name)
 {
 	struct nfs_server *server;
 	struct nfs_fattr fattr_fsinfo;
@@ -1132,7 +1143,7 @@ struct nfs_server *nfs_clone_server(stru
 		nfs_init_server_aclclient(server);
 
 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
+	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo, dev_name);
 	if (error < 0)
 		goto out_free_server;
 
Index: linux/include/linux/backing-dev.h
===================================================================
--- linux.orig/include/linux/backing-dev.h	2007-11-22 14:13:03.000000000 +0100
+++ linux/include/linux/backing-dev.h	2007-11-26 17:24:19.000000000 +0100
@@ -11,6 +11,8 @@
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
 #include <linux/proportions.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -48,11 +50,28 @@ struct backing_dev_info {
 
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
+
+	struct device *dev;
 };
 
 int bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, ...);
+void bdi_unregister(struct backing_dev_info *bdi);
+
+#define bdi_init_fmt(bdi, parent, fmt...)			\
+	({							\
+	 	int ret = bdi_init(bdi);			\
+	 	if (!ret) {					\
+	 		ret = bdi_register(bdi, parent, ##fmt);	\
+	 		if (ret)				\
+	 			bdi_destroy(bdi);		\
+	 	}						\
+	 	ret;						\
+	 })
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, s64 amount)
 {
Index: linux/include/linux/writeback.h
===================================================================
--- linux.orig/include/linux/writeback.h	2007-11-22 14:13:21.000000000 +0100
+++ linux/include/linux/writeback.h	2007-11-26 17:24:19.000000000 +0100
@@ -113,6 +113,9 @@ struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
 				      void __user *, size_t *, loff_t *);
 
+void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi);
+
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied);
Index: linux/mm/backing-dev.c
===================================================================
--- linux.orig/mm/backing-dev.c	2007-11-22 14:13:03.000000000 +0100
+++ linux/mm/backing-dev.c	2007-11-26 17:24:19.000000000 +0100
@@ -4,12 +4,119 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+
+
+static struct class *bdi_class;
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned long kb = simple_strtoul(buf, &end, 10);
+
+	if (end == buf || (*end && (end[0] != '\n' || end[1])))
+		return -EINVAL;	/* only "<digits>" or "<digits>\n" */
+	bdi->ra_pages = kb >> (PAGE_SHIFT - 10);
+	return count;	/* consume it all: a short count gets "\n" re-sent */
+}
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr)						\
+static ssize_t name##_show(struct device *dev,				\
+			   struct device_attribute *attr, char *page)	\
+{									\
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
+									\
+	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);	\
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+BDI_SHOW(reclaimable_kb, K(bdi_stat(bdi, BDI_RECLAIMABLE)))
+BDI_SHOW(writeback_kb, K(bdi_stat(bdi, BDI_WRITEBACK)))
+
+static inline unsigned long get_dirty(struct backing_dev_info *bdi, int i)
+{
+	long thresh[3];	/* get_dirty_limits() takes long *, not unsigned long * */
+
+	get_dirty_limits(&thresh[0], &thresh[1], &thresh[2], bdi);
+
+	return thresh[i];
+}
+
+BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1)))
+BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2)))
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
+static struct device_attribute bdi_dev_attrs[] = {
+	__ATTR_RW(read_ahead_kb),
+	__ATTR_RO(reclaimable_kb),
+	__ATTR_RO(writeback_kb),
+	__ATTR_RO(dirty_kb),
+	__ATTR_RO(bdi_dirty_kb),
+	__ATTR_NULL,
+};
+
+static __init int bdi_class_init(void)
+{
+	bdi_class = class_create(THIS_MODULE, "bdi");
+	if (!IS_ERR(bdi_class))
+		bdi_class->dev_attrs = bdi_dev_attrs;
+	return IS_ERR(bdi_class) ? PTR_ERR(bdi_class) : 0;
+}
+postcore_initcall(bdi_class_init);	/* ahead of readahead_init() */
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, ...)
+{
+	char *name;
+	va_list args;
+	int ret = 0;
+	struct device *dev;
+
+	va_start(args, fmt);
+	name = kvasprintf(GFP_KERNEL, fmt, args);
+	va_end(args);
+
+	if (!name)
+		return -ENOMEM;
+	/* name is already formatted: pass it as data so '%' is not re-expanded */
+	dev = device_create(bdi_class, parent, MKDEV(0,0), "%s", name);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto exit;
+	}
+
+	bdi->dev = dev;
+	dev_set_drvdata(bdi->dev, bdi);
+
+exit:
+	kfree(name);
+	return ret;
+}
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+	if (bdi->dev) {
+		device_unregister(bdi->dev);
+		bdi->dev = NULL;
+	}
+}
+
+EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_unregister);
 
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int i, j;
 	int err;
 
+	bdi->dev = NULL;
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
 		if (err)
@@ -33,6 +140,8 @@ void bdi_destroy(struct backing_dev_info
 {
 	int i;
 
+	bdi_unregister(bdi);
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
 
Index: linux/mm/page-writeback.c
===================================================================
--- linux.orig/mm/page-writeback.c	2007-11-26 17:24:04.000000000 +0100
+++ linux/mm/page-writeback.c	2007-11-26 17:24:19.000000000 +0100
@@ -295,7 +295,7 @@ static unsigned long determine_dirtyable
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-static void
+void
 get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
 		 struct backing_dev_info *bdi)
 {
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c	2007-11-22 14:13:03.000000000 +0100
+++ linux/lib/percpu_counter.c	2007-11-26 17:24:19.000000000 +0100
@@ -102,6 +102,7 @@ void percpu_counter_destroy(struct percp
 		return;
 
 	free_percpu(fbc->counters);
+	fbc->counters = NULL;
 #ifdef CONFIG_HOTPLUG_CPU
 	mutex_lock(&percpu_counters_lock);
 	list_del(&fbc->list);
Index: linux/fs/nfs/internal.h
===================================================================
--- linux.orig/fs/nfs/internal.h	2007-11-22 14:13:21.000000000 +0100
+++ linux/fs/nfs/internal.h	2007-11-26 17:24:19.000000000 +0100
@@ -65,16 +65,18 @@ extern void nfs_put_client(struct nfs_cl
 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
-					struct nfs_fh *);
+					struct nfs_fh *, const char *);
 extern struct nfs_server *nfs4_create_server(
 					const struct nfs_parsed_mount_data *,
-					struct nfs_fh *);
+					struct nfs_fh *, const char *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
-						      struct nfs_fh *);
+						      struct nfs_fh *,
+						      const char *);
 extern void nfs_free_server(struct nfs_server *server);
 extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
-					   struct nfs_fattr *);
+					   struct nfs_fattr *,
+					   const char *);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
Index: linux/fs/nfs/super.c
===================================================================
--- linux.orig/fs/nfs/super.c	2007-11-22 14:13:21.000000000 +0100
+++ linux/fs/nfs/super.c	2007-11-26 17:24:19.000000000 +0100
@@ -1383,7 +1383,7 @@ static int nfs_get_sb(struct file_system
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs_create_server(&data, &mntfh);
+	server = nfs_create_server(&data, &mntfh, dev_name);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out;
@@ -1466,7 +1466,7 @@ static int nfs_xdev_get_sb(struct file_s
 	dprintk("--> nfs_xdev_get_sb()\n");
 
 	/* create a new volume representation */
-	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
+	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, dev_name);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -1726,7 +1726,7 @@ static int nfs4_get_sb(struct file_syste
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs4_create_server(&data, &mntfh);
+	server = nfs4_create_server(&data, &mntfh, dev_name);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out;
@@ -1811,7 +1811,7 @@ static int nfs4_xdev_get_sb(struct file_
 	dprintk("--> nfs4_xdev_get_sb()\n");
 
 	/* create a new volume representation */
-	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
+	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, dev_name);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -1885,7 +1885,7 @@ static int nfs4_referral_get_sb(struct f
 	dprintk("--> nfs4_referral_get_sb()\n");
 
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, &mntfh);
+	server = nfs4_create_referral_server(data, &mntfh, dev_name);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
Index: linux/mm/readahead.c
===================================================================
--- linux.orig/mm/readahead.c	2007-11-22 14:13:03.000000000 +0100
+++ linux/mm/readahead.c	2007-11-26 17:24:19.000000000 +0100
@@ -235,7 +235,7 @@ unsigned long max_sane_readahead(unsigne
 
 static int __init readahead_init(void)
 {
-	return bdi_init(&default_backing_dev_info);
+	return bdi_init_fmt(&default_backing_dev_info, NULL, "default");
 }
 subsys_initcall(readahead_init);
 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [patch 2/3] mm: bdi: allow to set a minimum to the bdi dirty limit
  2007-11-29 11:10 [patch 1/3] bdi patches Miklos Szeredi
@ 2007-11-29 11:14 ` Miklos Szeredi
  2007-11-29 11:16 ` [patch 3/3] mm: bdi: allow to set a maximum " Miklos Szeredi
  2007-11-30  0:52 ` [patch 1/3] bdi patches Neil Brown
  2 siblings, 0 replies; 7+ messages in thread
From: Miklos Szeredi @ 2007-11-29 11:14 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: fengguang.wu, linux-kernel

[mszeredi@suse.cz]
Fix parsing in min_ratio_store()

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---

Index: linux/include/linux/backing-dev.h
===================================================================
--- linux.orig/include/linux/backing-dev.h	2007-11-26 17:24:19.000000000 +0100
+++ linux/include/linux/backing-dev.h	2007-11-26 17:24:39.000000000 +0100
@@ -51,6 +51,8 @@ struct backing_dev_info {
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
 
+	unsigned int min_ratio;
+
 	struct device *dev;
 };
 
@@ -147,6 +149,8 @@ static inline unsigned long bdi_stat_err
 #endif
 }
 
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
+
 /*
  * Flags in backing_dev_info::capability
  * - The first two flags control whether dirty pages will contribute to the
Index: linux/mm/backing-dev.c
===================================================================
--- linux.orig/mm/backing-dev.c	2007-11-26 17:24:19.000000000 +0100
+++ linux/mm/backing-dev.c	2007-11-26 17:24:39.000000000 +0100
@@ -50,6 +50,24 @@ static inline unsigned long get_dirty(st
 BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1)))
 BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2)))
 
+static ssize_t min_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int ratio;
+	ssize_t ret = -EINVAL;
+
+	ratio = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		ret = bdi_set_min_ratio(bdi, ratio);
+		if (!ret)
+			ret = count;
+	}
+	return ret;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
@@ -58,6 +76,7 @@ static struct device_attribute bdi_dev_a
 	__ATTR_RO(writeback_kb),
 	__ATTR_RO(dirty_kb),
 	__ATTR_RO(bdi_dirty_kb),
+	__ATTR_RW(min_ratio),
 	__ATTR_NULL,
 };
 
@@ -199,3 +218,4 @@ long congestion_wait(int rw, long timeou
 }
 EXPORT_SYMBOL(congestion_wait);
 
+
Index: linux/mm/page-writeback.c
===================================================================
--- linux.orig/mm/page-writeback.c	2007-11-26 17:24:19.000000000 +0100
+++ linux/mm/page-writeback.c	2007-11-26 17:24:40.000000000 +0100
@@ -241,6 +241,29 @@ static void task_dirty_limit(struct task
 }
 
 /*
+ * bdi_min_ratio: running total of the per-bdi min_ratio, under bdi_lock.
+ */
+static DEFINE_SPINLOCK(bdi_lock);
+static unsigned int bdi_min_ratio;
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bdi_lock, flags);
+	min_ratio -= bdi->min_ratio;
+	if (bdi_min_ratio + min_ratio < 100) {
+		bdi_min_ratio += min_ratio;
+		bdi->min_ratio += min_ratio;
+	} else
+		ret = -EINVAL;
+	spin_unlock_irqrestore(&bdi_lock, flags);
+
+	return ret;
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -325,7 +348,7 @@ get_dirty_limits(long *pbackground, long
 	*pdirty = dirty;
 
 	if (bdi) {
-		u64 bdi_dirty = dirty;
+		u64 bdi_dirty;
 		long numerator, denominator;
 
 		/*
@@ -333,8 +356,10 @@ get_dirty_limits(long *pbackground, long
 		 */
 		bdi_writeout_fraction(bdi, &numerator, &denominator);
 
+		bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
 		bdi_dirty *= numerator;
 		do_div(bdi_dirty, denominator);
+		bdi_dirty += (dirty * bdi->min_ratio) / 100;
 
 		*pbdi_dirty = bdi_dirty;
 		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [patch 3/3] mm: bdi: allow to set a maximum to the bdi dirty limit
  2007-11-29 11:10 [patch 1/3] bdi patches Miklos Szeredi
  2007-11-29 11:14 ` [patch 2/3] mm: bdi: allow to set a minimum to the bdi dirty limit Miklos Szeredi
@ 2007-11-29 11:16 ` Miklos Szeredi
  2007-11-30  0:52 ` [patch 1/3] bdi patches Neil Brown
  2 siblings, 0 replies; 7+ messages in thread
From: Miklos Szeredi @ 2007-11-29 11:16 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: fengguang.wu, linux-kernel

[mszeredi@suse.cz]
- Fix parsing in max_ratio_store().
- Export bdi_set_max_ratio() to modules (for fuse)
- Limit bdi_dirty with bdi->max_ratio

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---

Index: linux/include/linux/backing-dev.h
===================================================================
--- linux.orig/include/linux/backing-dev.h	2007-11-27 20:55:54.000000000 +0100
+++ linux/include/linux/backing-dev.h	2007-11-27 20:56:02.000000000 +0100
@@ -52,6 +52,7 @@ struct backing_dev_info {
 	int dirty_exceeded;
 
 	unsigned int min_ratio;
+	unsigned int max_ratio, max_prop_frac;
 
 	struct device *dev;
 };
@@ -150,6 +151,7 @@ static inline unsigned long bdi_stat_err
 }
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
 /*
  * Flags in backing_dev_info::capability
Index: linux/include/linux/proportions.h
===================================================================
--- linux.orig/include/linux/proportions.h	2007-11-27 12:45:32.000000000 +0100
+++ linux/include/linux/proportions.h	2007-11-27 20:55:56.000000000 +0100
@@ -78,6 +78,19 @@ void prop_inc_percpu(struct prop_descrip
 }
 
 /*
+ * Limit the time part in order to ensure there are some bits left for the
+ * cycle counter and fraction multiply.
+ */
+#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
+
+#define PROP_FRAC_SHIFT		(BITS_PER_LONG - PROP_MAX_SHIFT - 1)
+#define PROP_FRAC_BASE		(1UL << PROP_FRAC_SHIFT)
+
+void __prop_inc_percpu_max(struct prop_descriptor *pd,
+			   struct prop_local_percpu *pl, long frac);
+
+
+/*
  * ----- SINGLE ------
  */
 
Index: linux/lib/proportions.c
===================================================================
--- linux.orig/lib/proportions.c	2007-11-27 12:45:32.000000000 +0100
+++ linux/lib/proportions.c	2007-11-27 20:55:56.000000000 +0100
@@ -73,12 +73,6 @@
 #include <linux/proportions.h>
 #include <linux/rcupdate.h>
 
-/*
- * Limit the time part in order to ensure there are some bits left for the
- * cycle counter.
- */
-#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
-
 int prop_descriptor_init(struct prop_descriptor *pd, int shift)
 {
 	int err;
@@ -273,6 +267,38 @@ void __prop_inc_percpu(struct prop_descr
 }
 
 /*
+ * identical to __prop_inc_percpu, except that it limits this pl's fraction to
+ * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded.
+ */
+void __prop_inc_percpu_max(struct prop_descriptor *pd,
+			   struct prop_local_percpu *pl, long frac)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_percpu(pg, pl);
+
+	if (unlikely(frac != PROP_FRAC_BASE)) {
+		unsigned long period_2 = 1UL << (pg->shift - 1);
+		unsigned long counter_mask = period_2 - 1;
+		unsigned long global_count;
+		long numerator, denominator;
+
+		numerator = percpu_counter_read_positive(&pl->events);
+		global_count = percpu_counter_read(&pg->events);
+		denominator = period_2 + (global_count & counter_mask);
+
+		if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT))
+			goto out_put;
+	}
+
+	percpu_counter_add(&pl->events, 1);
+	percpu_counter_add(&pg->events, 1);
+
+out_put:
+	prop_put_global(pd, pg);
+}
+
+/*
  * Obtain a fraction of this proportion
  *
  *   p_{j} = x_{j} / (period/2 + t % period/2)
Index: linux/mm/backing-dev.c
===================================================================
--- linux.orig/mm/backing-dev.c	2007-11-27 20:55:54.000000000 +0100
+++ linux/mm/backing-dev.c	2007-11-27 20:55:56.000000000 +0100
@@ -68,6 +68,24 @@ static ssize_t min_ratio_store(struct de
 }
 BDI_SHOW(min_ratio, bdi->min_ratio)
 
+static ssize_t max_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int ratio;
+	ssize_t ret = -EINVAL;
+
+	ratio = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		ret = bdi_set_max_ratio(bdi, ratio);
+		if (!ret)
+			ret = count;
+	}
+	return ret;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
@@ -77,6 +95,7 @@ static struct device_attribute bdi_dev_a
 	__ATTR_RO(dirty_kb),
 	__ATTR_RO(bdi_dirty_kb),
 	__ATTR_RW(min_ratio),
+	__ATTR_RW(max_ratio),
 	__ATTR_NULL,
 };
 
@@ -136,6 +155,10 @@ int bdi_init(struct backing_dev_info *bd
 
 	bdi->dev = NULL;
 
+	bdi->min_ratio = 0;
+	bdi->max_ratio = 100;
+	bdi->max_prop_frac = PROP_FRAC_BASE;
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
 		if (err)
Index: linux/mm/page-writeback.c
===================================================================
--- linux.orig/mm/page-writeback.c	2007-11-27 20:55:54.000000000 +0100
+++ linux/mm/page-writeback.c	2007-11-27 21:54:43.000000000 +0100
@@ -158,7 +158,7 @@ int dirty_ratio_handler(struct ctl_table
  */
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
-	__prop_inc_percpu(&vm_completions, &bdi->completions);
+	__prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac);
 }
 
 static inline void task_dirty_inc(struct task_struct *tsk)
@@ -252,17 +252,43 @@ int bdi_set_min_ratio(struct backing_dev
 	unsigned long flags;
 
 	spin_lock_irqsave(&bdi_lock, flags);
-	min_ratio -= bdi->min_ratio;
-	if (bdi_min_ratio + min_ratio < 100) {
-		bdi_min_ratio += min_ratio;
-		bdi->min_ratio += min_ratio;
-	} else
+	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
+	} else {
+		min_ratio -= bdi->min_ratio;
+		if (bdi_min_ratio + min_ratio < 100) {
+			bdi_min_ratio += min_ratio;
+			bdi->min_ratio += min_ratio;
+		} else {
+			ret = -EINVAL;
+		}
+	}
 	spin_unlock_irqrestore(&bdi_lock, flags);
 
 	return ret;
 }
 
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (max_ratio > 100)
+		return -EINVAL;
+
+	spin_lock_irqsave(&bdi_lock, flags);
+	if (bdi->min_ratio > max_ratio) {
+		ret = -EINVAL;
+	} else {
+		bdi->max_ratio = max_ratio;
+		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+	}
+	spin_unlock_irqrestore(&bdi_lock, flags);
+	/* report the min_ratio > max_ratio rejection instead of success */
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_ratio);
+
 /*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
@@ -360,6 +386,8 @@ get_dirty_limits(long *pbackground, long
 		bdi_dirty *= numerator;
 		do_div(bdi_dirty, denominator);
 		bdi_dirty += (dirty * bdi->min_ratio) / 100;
+		if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+			bdi_dirty = dirty * bdi->max_ratio / 100;
 
 		*pbdi_dirty = bdi_dirty;
 		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/3] bdi patches
  2007-11-29 11:10 [patch 1/3] bdi patches Miklos Szeredi
  2007-11-29 11:14 ` [patch 2/3] mm: bdi: allow to set a minimum to the bdi dirty limit Miklos Szeredi
  2007-11-29 11:16 ` [patch 3/3] mm: bdi: allow to set a maximum " Miklos Szeredi
@ 2007-11-30  0:52 ` Neil Brown
  2007-11-30  8:06   ` Miklos Szeredi
  2007-12-05 10:22   ` Peter Zijlstra
  2 siblings, 2 replies; 7+ messages in thread
From: Neil Brown @ 2007-11-30  0:52 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: a.p.zijlstra, fengguang.wu, linux-kernel

On Thursday November 29, miklos@szeredi.hu wrote:
> > http://programming.kicks-ass.net/kernel-patches/foo/
> > 
> > bdi-task-dirty.patch
> > bdi-sysfs.patch
> > bdi-min.patch
> > bdi-max.patch
> > 
> > 
> > Is my current rather experimental stack, I just wrote the max part after
> > having slept on it. I'm not fond of the multiplication there, but I
> > don't see a way around it.
> > 
> > Compile tested only.
> 
> I've done some testing on these patches and did some changes. So here
> they go.
> 
> Thanks,
> Miklos
> 
> ---------
> Subject: mm: sysfs: expose the BDI object in sysfs
> 
> Provide a place in sysfs for the backing_dev_info object.
> This allows us to see and set the various BDI specific variables.

You don't say what the place is, and I'm not quite familiar enough
with sysfs internals to figure it out myself.  Help?

And while I was looking I noticed that bdi_register (and bdi_init_fmt)
takes a second argument 'parent', which is always NULL, and which is
undocumented as to purpose.
If no-one would ever add another call to bdi_register, why have the
second arg, and if they might, how would they know what to put there?

Finally, the omission of NFS bothers me - and makes me wonder if the
choice of name in sysfs is appropriate.

Would a program ever want to generate the name (in sysfs) for a
particular bdi?  If so, how would it do it?

It seems to me after a fairly quick look that a bdi is always
associated with a device number.  For block devices the device number
is obvious.  For NFS and FUSE, the device number is an anon device
number allocated at mount time.
Maybe the name of the bdi should be based on that number.  Then it
would be possible to map directly from e.g. a file to the bdi that the
file would be written to. 

NeilBrown

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/3] bdi patches
  2007-11-30  0:52 ` [patch 1/3] bdi patches Neil Brown
@ 2007-11-30  8:06   ` Miklos Szeredi
  2007-11-30  8:36     ` Kay Sievers
  2007-12-05 10:22   ` Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Miklos Szeredi @ 2007-11-30  8:06 UTC (permalink / raw)
  To: neilb; +Cc: a.p.zijlstra, fengguang.wu, linux-kernel

> > Provide a place in sysfs for the backing_dev_info object.
> > This allows us to see and set the various BDI specific variables.
> 
> You don't say what the place is, and I'm not quite familiar enough
> with sysfs internals to figure it out myself.  Help?

/sys/class/bdi  (which sometimes links into /sys/devices/virtual/bdi)

> And while I was looking I noticed that bdi_register (and bdi_init_fmt)
> takes a second argument 'parent', which is always NULL, and which is
> undocumented as to purpose.
> If no-one would ever add another call to bdi_register, why have the
> second arg, and if they might, how would they know what to put there?
> 
> Finally, the omission of NFS bothers me - and makes me wonder if the
> choice of name in sysfs is appropriate.
> 
> Would a program ever want to generate the name (in sysfs) for a
> particular bdi?  If so, how would it do it?

I'll let Peter answer these, when he gets back.

> It seems to me after a fairly quick look that a bdi is always
> associated with a device number.  For block devices the device number
> is obvious.  For NFS and FUSE, the device number is an anon device
> number allocated at mount time.
> Maybe the name of the bdi should be based on that number.  Then it
> would be possible to map directly from e.g. a file to the bdi that the
> file would be written to. 

Sounds like a good idea to me.

Miklos

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/3] bdi patches
  2007-11-30  8:06   ` Miklos Szeredi
@ 2007-11-30  8:36     ` Kay Sievers
  0 siblings, 0 replies; 7+ messages in thread
From: Kay Sievers @ 2007-11-30  8:36 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: neilb, a.p.zijlstra, fengguang.wu, linux-kernel

On Nov 30, 2007 9:06 AM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> > > Provide a place in sysfs for the backing_dev_info object.
> > > This allows us to see and set the various BDI specific variables.
> >
> > You don't say what the place is, and I'm not quite familiar enough
> > with sysfs internals to figure it out myself.  Help?
>
> /sys/class/bdi  (which sometimes links into /sys/devices/virtual/bdi)
>
> > And while I was looking I noticed that bdi_register (and bdi_init_fmt)
> > takes a second argument 'parent', which is always NULL, and which is
> > undocumented as to purpose.
> > If no-one would ever add another call to bdi_register, why have the
> > second arg, and if they might, how would they know what to put there?

It will get the parent after the block devices are converted from raw
kobjects to core devices. The patch is in Greg's tree.

Kay

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/3] bdi patches
  2007-11-30  0:52 ` [patch 1/3] bdi patches Neil Brown
  2007-11-30  8:06   ` Miklos Szeredi
@ 2007-12-05 10:22   ` Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2007-12-05 10:22 UTC (permalink / raw)
  To: Neil Brown; +Cc: Miklos Szeredi, fengguang.wu, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2843 bytes --]


On Fri, 2007-11-30 at 11:52 +1100, Neil Brown wrote:
> On Thursday November 29, miklos@szeredi.hu wrote:
> > > http://programming.kicks-ass.net/kernel-patches/foo/
> > > 
> > > bdi-task-dirty.patch
> > > bdi-sysfs.patch
> > > bdi-min.patch
> > > bdi-max.patch
> > > 
> > > 
> > > Is my current rather experimental stack, I just wrote the max part after
> > > having slept on it. I'm not fond of the multiplication there, but I
> > > don't see a way around it.
> > > 
> > > Compile tested only.
> > 
> > I've done some testing on these patches and did some changes. So here
> > they go.
> > 
> > Thanks,
> > Miklos
> > 
> > ---------
> > Subject: mm: sysfs: expose the BDI object in sysfs
> > 
> > Provide a place in sysfs for the backing_dev_info object.
> > This allows us to see and set the various BDI specific variables.
> 
> You don't say what the place is, and I'm not quite familiar enough
> with sysfs internals to figure it out myself.  Help?

/sys/class/bdi/<name>/

> And while I was looking I noticed that bdi_register (and bdi_init_fmt)
> takes a second argument 'parent', which is always NULL, and which is
> undocumented as to purpose.
> If no-one would ever add another call to bdi_register, why have the
> second arg, and if they might, how would they know what to put there?

As Kay said, that is to be used once there are proper BDI parent
objects. (The block layer conversion from kobject to device should
provide some).

> Finally, the omission of NFS bothers me - and makes me wonder if the
> choice of name in sysfs is appropriate.

That is due to a currently silly filename limit in sysfs that is being
worked on by Greg and Kay.

> Would a program ever want to generate the name (in sysfs) for a
> particular bdi?  If so, how would it do it?

That would be a tad involved I guess, but not impossible.

For block devices you'd need to get hold of the block device name, no
idea on how you would go about doing that from user-space, but I'm sure
it's possible.

For nfs, /proc/mounts seems to contain the proper string. And I'm sure
FUSE has the right info somewhere to construct it as well.

> It seems to me after a fairly quick look that a bdi is always
> associated with a device number.  For block devices the device number
> is obvious.  For NFS and FUSE, the device number is an anon device
> number allocated at mount time.
> Maybe the name of the bdi should be based on that number.  Then it
> would be possible to map directly from e.g. a file to the bdi that the
> file would be written to. 

Sounds like a good idea, except that I don't find these numbers to be
very convenient to use. Currently ls /sys/class/bdi/ shows a human
interpretable list of names, and it's rather clear what device is meant.
Using numbers would lose that.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-12-05 10:22 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-11-29 11:10 [patch 1/3] bdi patches Miklos Szeredi
2007-11-29 11:14 ` [patch 2/3] mm: bdi: allow to set a minimum to the bdi dirty limit Miklos Szeredi
2007-11-29 11:16 ` [patch 3/3] mm: bdi: allow to set a maximum " Miklos Szeredi
2007-11-30  0:52 ` [patch 1/3] bdi patches Neil Brown
2007-11-30  8:06   ` Miklos Szeredi
2007-11-30  8:36     ` Kay Sievers
2007-12-05 10:22   ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).