* [patch] bdi: add a user-tunable cpu_list for the bdi flusher threads
From: Jeff Moyer @ 2012-11-28 23:23 UTC
  To: Jens Axboe, linux-kernel, linux-mm

Hi,

In realtime environments, it may be desirable to keep the per-bdi
flusher threads from running on certain CPUs.  This patch adds a
cpu_list file to /sys/class/bdi/* to enable this.  The default is to tie
the flusher thread to the CPUs of the NUMA node the backing device is
attached to (though I could be convinced to make it a mask of all CPUs
to avoid a change in behaviour).

Comments, as always, are appreciated.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2a9a9ab..68263e0 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/mutex.h>
 
 struct page;
 struct device;
@@ -105,6 +106,9 @@ struct backing_dev_info {
 
 	struct timer_list laptop_mode_wb_timer;
 
+	cpumask_t *flusher_cpumask; /* used for writeback thread scheduling */
+	struct mutex flusher_cpumask_mutex;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3..c4f7dde 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/writeback.h>
 #include <linux/device.h>
+#include <linux/slab.h>
 #include <trace/events/writeback.h>
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -221,12 +222,59 @@ static ssize_t max_ratio_store(struct device *dev,
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+static ssize_t cpu_list_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	struct bdi_writeback *wb = &bdi->wb;
+	cpumask_var_t newmask;
+	ssize_t ret;
+	struct task_struct *task;
+
+	if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpulist_parse(buf, newmask);
+	if (!ret) {
+		spin_lock(&bdi->wb_lock);
+		task = wb->task;
+		get_task_struct(task);
+		spin_unlock(&bdi->wb_lock);
+		if (task)
+			ret = set_cpus_allowed_ptr(task, newmask);
+		put_task_struct(task);
+		if (ret == 0) {
+			mutex_lock(&bdi->flusher_cpumask_mutex);
+			cpumask_copy(bdi->flusher_cpumask, newmask);
+			mutex_unlock(&bdi->flusher_cpumask_mutex);
+			ret = count;
+		}
+	}
+	free_cpumask_var(newmask);
+
+	return ret;
+}
+
+static ssize_t cpu_list_show(struct device *dev,
+		struct device_attribute *attr, char *page)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	ssize_t ret;
+
+	mutex_lock(&bdi->flusher_cpumask_mutex);
+	ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask);
+	mutex_unlock(&bdi->flusher_cpumask_mutex);
+
+	return ret;
+}
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(cpu_list),
 	__ATTR_NULL,
 };
 
@@ -428,6 +476,7 @@ static int bdi_forker_thread(void *ptr)
 				writeback_inodes_wb(&bdi->wb, 1024,
 						    WB_REASON_FORKER_THREAD);
 			} else {
+				int ret;
 				/*
 				 * The spinlock makes sure we do not lose
 				 * wake-ups when racing with 'bdi_queue_work()'.
@@ -437,6 +486,14 @@ static int bdi_forker_thread(void *ptr)
 				spin_lock_bh(&bdi->wb_lock);
 				bdi->wb.task = task;
 				spin_unlock_bh(&bdi->wb_lock);
+				mutex_lock(&bdi->flusher_cpumask_mutex);
+				ret = set_cpus_allowed_ptr(task,
+							bdi->flusher_cpumask);
+				mutex_unlock(&bdi->flusher_cpumask_mutex);
+				if (ret)
+					printk_once("%s: failed to bind flusher"
+						    " thread %s, error %d\n",
+						    __func__, task->comm, ret);
 				wake_up_process(task);
 			}
 			bdi_clear_pending(bdi);
@@ -509,6 +566,17 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 						dev_name(dev));
 		if (IS_ERR(wb->task))
 			return PTR_ERR(wb->task);
+	} else {
+		int node;
+		/*
+		 * Set up a default cpumask for the flusher threads that
+		 * includes all cpus on the same numa node as the device.
+		 * The mask may be overridden via sysfs.
+		 */
+		node = dev_to_node(bdi->dev);
+		if (node != NUMA_NO_NODE)
+			cpumask_copy(bdi->flusher_cpumask,
+				     cpumask_of_node(node));
 	}
 
 	bdi_debug_register(bdi, dev_name(dev));
@@ -634,6 +702,15 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	bdi_wb_init(&bdi->wb, bdi);
 
+	if (!bdi_cap_flush_forker(bdi)) {
+		bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		if (!bdi->flusher_cpumask)
+			return -ENOMEM;
+		cpumask_setall(bdi->flusher_cpumask);
+		mutex_init(&bdi->flusher_cpumask_mutex);
+	} else
+		bdi->flusher_cpumask = NULL;
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
@@ -683,6 +760,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 	bdi_unregister(bdi);
 
+	kfree(bdi->flusher_cpumask);
+
 	/*
 	 * If bdi_unregister() had already been called earlier, the
 	 * wakeup_timer could still be armed because bdi_prune_sb()


* Re: [patch] bdi: add a user-tunable cpu_list for the bdi flusher threads
From: Zach Brown @ 2012-11-30 22:15 UTC
  To: Jeff Moyer; +Cc: Jens Axboe, linux-kernel, linux-mm

> +	ret = cpulist_parse(buf, newmask);
> +	if (!ret) {
> +		spin_lock(&bdi->wb_lock);
> +		task = wb->task;
> +		get_task_struct(task);
> +		spin_unlock(&bdi->wb_lock);
> +		if (task)
> +			ret = set_cpus_allowed_ptr(task, newmask);
> +		put_task_struct(task);

If that test for a non-null task is needed then surely the get and put
need to be similarly protected :).
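
Something like this (a rough, untested sketch using the names from the
patch) is what I'd expect:

	spin_lock(&bdi->wb_lock);
	task = wb->task;
	if (task)
		get_task_struct(task);	/* only take the ref if a task exists */
	spin_unlock(&bdi->wb_lock);

	if (task) {
		ret = set_cpus_allowed_ptr(task, newmask);
		put_task_struct(task);
	}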

> +		bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
> +		if (!bdi->flusher_cpumask)
> +			return -ENOMEM;

The bare GFP_KERNEL raises an eyebrow.  Some bdi_init() callers like
blk_alloc_queue_node() look like they'll want to pass in a gfp_t for the
allocation.

And shouldn't this be freed in the error path of bdi_init()?
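
Roughly, in the existing error block at the end of bdi_init() (an
untested sketch, from memory of that code):

	if (err) {
err:
		while (i--)
			percpu_counter_destroy(&bdi->bdi_stat[i]);
		kfree(bdi->flusher_cpumask);
	}

	return err;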

- z


* Re: [patch] bdi: add a user-tunable cpu_list for the bdi flusher threads
From: Jeff Moyer @ 2012-12-03 15:49 UTC
  To: Zach Brown; +Cc: Jens Axboe, linux-kernel, linux-mm

Zach Brown <zab@redhat.com> writes:

>> +	ret = cpulist_parse(buf, newmask);
>> +	if (!ret) {
>> +		spin_lock(&bdi->wb_lock);
>> +		task = wb->task;
>> +		get_task_struct(task);
>> +		spin_unlock(&bdi->wb_lock);
>> +		if (task)
>> +			ret = set_cpus_allowed_ptr(task, newmask);
>> +		put_task_struct(task);
>
> If that test for a non-null task is needed then surely the get and put
> need to be similarly protected :).

How embarrassing.

>> +		bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
>> +		if (!bdi->flusher_cpumask)
>> +			return -ENOMEM;
>
> The bare GFP_KERNEL raises an eyebrow.  Some bdi_init() callers like
> blk_alloc_queue_node() look like they'll want to pass in a gfp_t for the
> allocation.

I'd be surprised if that were necessary, seeing how every single caller
of blk_alloc_queue_node passes in GFP_KERNEL.  I'll make the change,
though; there aren't too many callers of bdi_init out there.

> And shouldn't this be freed in the error path of bdi_init()?

Yes.  ;-)

Thanks!
Jeff


* Re: [patch] bdi: add a user-tunable cpu_list for the bdi flusher threads
From: Jeff Moyer @ 2012-12-03 16:22 UTC
  To: Zach Brown; +Cc: Jens Axboe, linux-kernel, linux-mm

Jeff Moyer <jmoyer@redhat.com> writes:

>>> +		bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
>>> +		if (!bdi->flusher_cpumask)
>>> +			return -ENOMEM;
>>
>> The bare GFP_KERNEL raises an eyebrow.  Some bdi_init() callers like
>> blk_alloc_queue_node() look like they'll want to pass in a gfp_t for the
>> allocation.
>
> I'd be surprised if that was necessary, seeing how every single caller
> of blk_alloc_queue_node passes in GFP_KERNEL.  I'll make the change,
> though, there aren't too many callers of bdi_init out there.

No other callers of bdi_init want anything but GFP_KERNEL.  In the case
of blk_alloc_queue_node, even *it* doesn't honor the gfp_t passed in!
Have a look at blkcg_init_queue (called from blk_alloc_queue_node) to
see what I mean.  Maybe that's a bug?

I've written the patch to modify bdi_init to take a gfp_t, but I'm
actually not in favor of this change, so I'm not going to post it
(unless, of course, you can provide a compelling argument).  :-)

Cheers,
Jeff


* Re: [patch] bdi: add a user-tunable cpu_list for the bdi flusher threads
From: Zach Brown @ 2012-12-03 19:07 UTC
  To: Jeff Moyer; +Cc: Jens Axboe, linux-kernel, linux-mm

On Mon, Dec 03, 2012 at 11:22:31AM -0500, Jeff Moyer wrote:
> Jeff Moyer <jmoyer@redhat.com> writes:
> 
> >>> +		bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
> >>> +		if (!bdi->flusher_cpumask)
> >>> +			return -ENOMEM;
> >>
> >> The bare GFP_KERNEL raises an eyebrow.  Some bdi_init() callers like
> >> blk_alloc_queue_node() look like they'll want to pass in a gfp_t for the
> >> allocation.
> >
> > I'd be surprised if that was necessary, seeing how every single caller
> > of blk_alloc_queue_node passes in GFP_KERNEL.  I'll make the change,
> > though, there aren't too many callers of bdi_init out there.
> 
> No other callers of bdi_init want anything but GFP_KERNEL.  In the case
> of blk_alloc_queue_node, even *it* doesn't honor the gfp_t passed in!
> Have a look at blkcg_init_queue (called from blk_alloc_queue_node) to
> see what I mean.  Maybe that's a bug?

Heh, indeed.

> I've written the patch to modify bdi_init to take a gfp_t, but I'm
> actually not in favor of this change, so I'm not going to post it
> (unless, of course, you can provide a compelling argument).  :-)

No argument here; it just jumped out at me in the code.  I didn't check
out the callers or the history of why it was that way :).

- z
