* [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
@ 2012-12-03 18:53 Jeff Moyer
  2012-12-04  2:34 ` Dave Chinner
  2012-12-04 20:14 ` Jens Axboe
  0 siblings, 2 replies; 15+ messages in thread
From: Jeff Moyer @ 2012-12-03 18:53 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-mm; +Cc: Zach Brown

Hi,

In realtime environments, it may be desirable to keep the per-bdi
flusher threads from running on certain cpus.  This patch adds a
cpu_list file to /sys/class/bdi/* to enable this.  The default is to tie
the flusher threads to the same numa node as the backing device (though
I could be convinced to make it a mask of all cpus to avoid a change in
behaviour).
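
For example, writing "2-3" to /sys/class/bdi/8:16/cpu_list (device
numbers here are just illustrative) restricts that bdi's flusher thread
to cpus 2 and 3; the file takes the same list syntax that
cpulist_parse() accepts.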

Comments, as always, are appreciated.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>

--
changes from v1->v2:
- fixed missing free in error path of bdi_init
- fixed up unchecked references to task in the store function

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2a9a9ab..68263e0 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/mutex.h>
 
 struct page;
 struct device;
@@ -105,6 +106,9 @@ struct backing_dev_info {
 
 	struct timer_list laptop_mode_wb_timer;
 
+	cpumask_t *flusher_cpumask; /* used for writeback thread scheduling */
+	struct mutex flusher_cpumask_mutex;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3..1cc1ff2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/writeback.h>
 #include <linux/device.h>
+#include <linux/slab.h>
 #include <trace/events/writeback.h>
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -221,12 +222,61 @@ static ssize_t max_ratio_store(struct device *dev,
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+static ssize_t cpu_list_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	struct bdi_writeback *wb = &bdi->wb;
+	cpumask_var_t newmask;
+	ssize_t ret;
+	struct task_struct *task;
+
+	if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpulist_parse(buf, newmask);
+	if (!ret) {
+		spin_lock(&bdi->wb_lock);
+		task = wb->task;
+		if (task)
+			get_task_struct(task);
+		spin_unlock(&bdi->wb_lock);
+		if (task) {
+			ret = set_cpus_allowed_ptr(task, newmask);
+			put_task_struct(task);
+		}
+		if (ret == 0) {
+			mutex_lock(&bdi->flusher_cpumask_mutex);
+			cpumask_copy(bdi->flusher_cpumask, newmask);
+			mutex_unlock(&bdi->flusher_cpumask_mutex);
+			ret = count;
+		}
+	}
+	free_cpumask_var(newmask);
+
+	return ret;
+}
+
+static ssize_t cpu_list_show(struct device *dev,
+		struct device_attribute *attr, char *page)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	ssize_t ret;
+
+	mutex_lock(&bdi->flusher_cpumask_mutex);
+	ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask);
+	mutex_unlock(&bdi->flusher_cpumask_mutex);
+
+	return ret;
+}
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(cpu_list),
 	__ATTR_NULL,
 };
 
@@ -428,6 +478,7 @@ static int bdi_forker_thread(void *ptr)
 				writeback_inodes_wb(&bdi->wb, 1024,
 						    WB_REASON_FORKER_THREAD);
 			} else {
+				int ret;
 				/*
 				 * The spinlock makes sure we do not lose
 				 * wake-ups when racing with 'bdi_queue_work()'.
@@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
 				spin_lock_bh(&bdi->wb_lock);
 				bdi->wb.task = task;
 				spin_unlock_bh(&bdi->wb_lock);
+				mutex_lock(&bdi->flusher_cpumask_mutex);
+				ret = set_cpus_allowed_ptr(task,
+							bdi->flusher_cpumask);
+				mutex_unlock(&bdi->flusher_cpumask_mutex);
+				if (ret)
+					printk_once("%s: failed to bind flusher"
+						    " thread %s, error %d\n",
+						    __func__, task->comm, ret);
 				wake_up_process(task);
 			}
 			bdi_clear_pending(bdi);
@@ -509,6 +568,17 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 						dev_name(dev));
 		if (IS_ERR(wb->task))
 			return PTR_ERR(wb->task);
+	} else {
+		int node;
+		/*
+		 * Set up a default cpumask for the flusher threads that
+		 * includes all cpus on the same numa node as the device.
+		 * The mask may be overridden via sysfs.
+		 */
+		node = dev_to_node(bdi->dev);
+		if (node != NUMA_NO_NODE)
+			cpumask_copy(bdi->flusher_cpumask,
+				     cpumask_of_node(node));
 	}
 
 	bdi_debug_register(bdi, dev_name(dev));
@@ -634,6 +704,15 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	bdi_wb_init(&bdi->wb, bdi);
 
+	if (!bdi_cap_flush_forker(bdi)) {
+		bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		if (!bdi->flusher_cpumask)
+			return -ENOMEM;
+		cpumask_setall(bdi->flusher_cpumask);
+		mutex_init(&bdi->flusher_cpumask_mutex);
+	} else
+		bdi->flusher_cpumask = NULL;
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
@@ -656,6 +735,7 @@ int bdi_init(struct backing_dev_info *bdi)
 err:
 		while (i--)
 			percpu_counter_destroy(&bdi->bdi_stat[i]);
+		kfree(bdi->flusher_cpumask);
 	}
 
 	return err;
@@ -683,6 +763,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 	bdi_unregister(bdi);
 
+	kfree(bdi->flusher_cpumask);
+
 	/*
 	 * If bdi_unregister() had already been called earlier, the
 	 * wakeup_timer could still be armed because bdi_prune_sb()


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-03 18:53 [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads Jeff Moyer
@ 2012-12-04  2:34 ` Dave Chinner
  2012-12-04 14:42   ` Jeff Moyer
  2012-12-04 20:14 ` Jens Axboe
  1 sibling, 1 reply; 15+ messages in thread
From: Dave Chinner @ 2012-12-04  2:34 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown

On Mon, Dec 03, 2012 at 01:53:39PM -0500, Jeff Moyer wrote:
> Hi,
> 
> In realtime environments, it may be desirable to keep the per-bdi
> flusher threads from running on certain cpus.  This patch adds a
> cpu_list file to /sys/class/bdi/* to enable this.  The default is to tie
> the flusher threads to the same numa node as the backing device (though
> I could be convinced to make it a mask of all cpus to avoid a change in
> behaviour).

The default seems reasonable to me.

> Comments, as always, are appreciated.
.....

> +static ssize_t cpu_list_store(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +	struct bdi_writeback *wb = &bdi->wb;
> +	cpumask_var_t newmask;
> +	ssize_t ret;
> +	struct task_struct *task;
> +
> +	if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	ret = cpulist_parse(buf, newmask);
> +	if (!ret) {
> +		spin_lock(&bdi->wb_lock);
> +		task = wb->task;
> +		if (task)
> +			get_task_struct(task);
> +		spin_unlock(&bdi->wb_lock);
> +		if (task) {
> +			ret = set_cpus_allowed_ptr(task, newmask);
> +			put_task_struct(task);
> +		}

Why is this set here outside the bdi->flusher_cpumask_mutex?

Also, I'd prefer it named "..._lock" as that is the normal
convention for such variables. You can tell the type of lock from
the declaration or the use...

....

> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>  				spin_lock_bh(&bdi->wb_lock);
>  				bdi->wb.task = task;
>  				spin_unlock_bh(&bdi->wb_lock);
> +				mutex_lock(&bdi->flusher_cpumask_mutex);
> +				ret = set_cpus_allowed_ptr(task,
> +							bdi->flusher_cpumask);
> +				mutex_unlock(&bdi->flusher_cpumask_mutex);

As it is set under the lock here....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04  2:34 ` Dave Chinner
@ 2012-12-04 14:42   ` Jeff Moyer
  2012-12-04 20:35     ` Dave Chinner
  0 siblings, 1 reply; 15+ messages in thread
From: Jeff Moyer @ 2012-12-04 14:42 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown

Dave Chinner <david@fromorbit.com> writes:

> On Mon, Dec 03, 2012 at 01:53:39PM -0500, Jeff Moyer wrote:
>> +static ssize_t cpu_list_store(struct device *dev,
>> +		struct device_attribute *attr, const char *buf, size_t count)
>> +{
>> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
>> +	struct bdi_writeback *wb = &bdi->wb;
>> +	cpumask_var_t newmask;
>> +	ssize_t ret;
>> +	struct task_struct *task;
>> +
>> +	if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
>> +		return -ENOMEM;
>> +
>> +	ret = cpulist_parse(buf, newmask);
>> +	if (!ret) {
>> +		spin_lock(&bdi->wb_lock);
>> +		task = wb->task;
>> +		if (task)
>> +			get_task_struct(task);
>> +		spin_unlock(&bdi->wb_lock);
>> +		if (task) {
>> +			ret = set_cpus_allowed_ptr(task, newmask);
>> +			put_task_struct(task);
>> +		}
>
> Why is this set here outside the bdi->flusher_cpumask_mutex?

The cpumask mutex protects updates to bdi->flusher_cpumask; it has
nothing to do with the call to set_cpus_allowed.  We are protected from
concurrent calls to cpu_list_store by the sysfs mutex that is taken on
entry.  I understand that this is non-obvious, and it wouldn't be wrong
to hold the mutex here.  If you'd like me to do that for clarity, that
would be ok with me.
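
If it helps, with the mutex held across both operations the store would
look something like this (untested, just to show the shape of it):

        ret = cpulist_parse(buf, newmask);
        if (!ret) {
                spin_lock(&bdi->wb_lock);
                task = wb->task;
                if (task)
                        get_task_struct(task);
                spin_unlock(&bdi->wb_lock);

                mutex_lock(&bdi->flusher_cpumask_mutex);
                if (task) {
                        ret = set_cpus_allowed_ptr(task, newmask);
                        put_task_struct(task);
                }
                if (ret == 0) {
                        cpumask_copy(bdi->flusher_cpumask, newmask);
                        ret = count;
                }
                mutex_unlock(&bdi->flusher_cpumask_mutex);
        }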

> Also, I'd prefer it named "..._lock" as that is the normal
> convention for such variables. You can tell the type of lock from
> the declaration or the use...

I'm sure I can find counter-examples, but it doesn't really matter to
me.  I'll change it.

>> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>>  				spin_lock_bh(&bdi->wb_lock);
>>  				bdi->wb.task = task;
>>  				spin_unlock_bh(&bdi->wb_lock);
>> +				mutex_lock(&bdi->flusher_cpumask_mutex);
>> +				ret = set_cpus_allowed_ptr(task,
>> +							bdi->flusher_cpumask);
>> +				mutex_unlock(&bdi->flusher_cpumask_mutex);
>
> As it is set under the lock here....

It's done under the lock here since we need to keep bdi->flusher_cpumask
from changing during the call to set_cpus_allowed.

Cheers,
Jeff


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-03 18:53 [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads Jeff Moyer
  2012-12-04  2:34 ` Dave Chinner
@ 2012-12-04 20:14 ` Jens Axboe
  2012-12-04 20:23   ` Jeff Moyer
  1 sibling, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2012-12-04 20:14 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-mm, Zach Brown

On 2012-12-03 19:53, Jeff Moyer wrote:
> Hi,
> 
> In realtime environments, it may be desirable to keep the per-bdi
> flusher threads from running on certain cpus.  This patch adds a
> cpu_list file to /sys/class/bdi/* to enable this.  The default is to tie
> the flusher threads to the same numa node as the backing device (though
> I could be convinced to make it a mask of all cpus to avoid a change in
> behaviour).

Looks sane, and I think defaulting to the home node is a sane default.
One comment:

> +	ret = cpulist_parse(buf, newmask);
> +	if (!ret) {
> +		spin_lock(&bdi->wb_lock);
> +		task = wb->task;
> +		if (task)
> +			get_task_struct(task);
> +		spin_unlock(&bdi->wb_lock);

bdi->wb_lock needs to be bh safe. The above should have caused lockdep
warnings for you.
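
IOW, the above should be:

        spin_lock_bh(&bdi->wb_lock);
        task = wb->task;
        if (task)
                get_task_struct(task);
        spin_unlock_bh(&bdi->wb_lock);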

> +		if (task) {
> +			ret = set_cpus_allowed_ptr(task, newmask);
> +			put_task_struct(task);
> +		}
> +		if (ret == 0) {
> +			mutex_lock(&bdi->flusher_cpumask_mutex);
> +			cpumask_copy(bdi->flusher_cpumask, newmask);
> +			mutex_unlock(&bdi->flusher_cpumask_mutex);
> +			ret = count;
> +		}
> +	}

> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>  				spin_lock_bh(&bdi->wb_lock);
>  				bdi->wb.task = task;
>  				spin_unlock_bh(&bdi->wb_lock);
> +				mutex_lock(&bdi->flusher_cpumask_mutex);
> +				ret = set_cpus_allowed_ptr(task,
> +							bdi->flusher_cpumask);
> +				mutex_unlock(&bdi->flusher_cpumask_mutex);

It'd be very useful if we had a kthread_create_on_cpumask() instead
of a _node() variant, since the latter could easily be implemented on
top of the former. But not really a show stopper for the patch...

-- 
Jens Axboe



* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04 20:14 ` Jens Axboe
@ 2012-12-04 20:23   ` Jeff Moyer
  2012-12-04 20:27     ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Jeff Moyer @ 2012-12-04 20:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-mm, Zach Brown

Jens Axboe <jaxboe@fusionio.com> writes:

> On 2012-12-03 19:53, Jeff Moyer wrote:
>> Hi,
>> 
>> In realtime environments, it may be desirable to keep the per-bdi
>> flusher threads from running on certain cpus.  This patch adds a
>> cpu_list file to /sys/class/bdi/* to enable this.  The default is to tie
>> the flusher threads to the same numa node as the backing device (though
>> I could be convinced to make it a mask of all cpus to avoid a change in
>> behaviour).
>
> Looks sane, and I think defaulting to the home node is a sane default.
> One comment:
>
>> +	ret = cpulist_parse(buf, newmask);
>> +	if (!ret) {
>> +		spin_lock(&bdi->wb_lock);
>> +		task = wb->task;
>> +		if (task)
>> +			get_task_struct(task);
>> +		spin_unlock(&bdi->wb_lock);
>
> bdi->wb_lock needs to be bh safe. The above should have caused lockdep
> warnings for you.

No lockdep complaints.  I'll double-check that's enabled (but I usually
have it enabled...).

>> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>>  				spin_lock_bh(&bdi->wb_lock);
>>  				bdi->wb.task = task;
>>  				spin_unlock_bh(&bdi->wb_lock);
>> +				mutex_lock(&bdi->flusher_cpumask_mutex);
>> +				ret = set_cpus_allowed_ptr(task,
>> +							bdi->flusher_cpumask);
>> +				mutex_unlock(&bdi->flusher_cpumask_mutex);
>
>> It'd be very useful if we had a kthread_create_on_cpumask() instead
> of a _node() variant, since the latter could easily be implemented on
> top of the former. But not really a show stopper for the patch...

Hmm, if it isn't too scary, I might give this a try.

Thanks!
Jeff


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04 20:23   ` Jeff Moyer
@ 2012-12-04 20:27     ` Jens Axboe
  2012-12-04 22:26       ` Jeff Moyer
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2012-12-04 20:27 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-mm, Zach Brown

On 2012-12-04 21:23, Jeff Moyer wrote:
> Jens Axboe <jaxboe@fusionio.com> writes:
> 
>> On 2012-12-03 19:53, Jeff Moyer wrote:
>>> Hi,
>>>
>>> In realtime environments, it may be desirable to keep the per-bdi
>>> flusher threads from running on certain cpus.  This patch adds a
>>> cpu_list file to /sys/class/bdi/* to enable this.  The default is to tie
>>> the flusher threads to the same numa node as the backing device (though
>>> I could be convinced to make it a mask of all cpus to avoid a change in
>>> behaviour).
>>
>> Looks sane, and I think defaulting to the home node is a sane default.
>> One comment:
>>
>>> +	ret = cpulist_parse(buf, newmask);
>>> +	if (!ret) {
>>> +		spin_lock(&bdi->wb_lock);
>>> +		task = wb->task;
>>> +		if (task)
>>> +			get_task_struct(task);
>>> +		spin_unlock(&bdi->wb_lock);
>>
>> bdi->wb_lock needs to be bh safe. The above should have caused lockdep
>> warnings for you.
> 
> No lockdep complaints.  I'll double check that's enabled (but I usually
> have it enabled...).
> 
>>> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>>>  				spin_lock_bh(&bdi->wb_lock);
>>>  				bdi->wb.task = task;
>>>  				spin_unlock_bh(&bdi->wb_lock);
>>> +				mutex_lock(&bdi->flusher_cpumask_mutex);
>>> +				ret = set_cpus_allowed_ptr(task,
>>> +							bdi->flusher_cpumask);
>>> +				mutex_unlock(&bdi->flusher_cpumask_mutex);
>>
>> It'd be very useful if we had a kthread_create_on_cpumask() instead
>> of a _node() variant, since the latter could easily be implemented on
>> top of the former. But not really a show stopper for the patch...
> 
> Hmm, if it isn't too scary, I might give this a try.

Should not be, pretty much just removing the node part of the create
struct passed in and making it a cpumask. And for the on_node() case,
cpumask_of_node() will do the trick.

-- 
Jens Axboe



* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04 14:42   ` Jeff Moyer
@ 2012-12-04 20:35     ` Dave Chinner
  0 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2012-12-04 20:35 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown

On Tue, Dec 04, 2012 at 09:42:55AM -0500, Jeff Moyer wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > On Mon, Dec 03, 2012 at 01:53:39PM -0500, Jeff Moyer wrote:
> >> +static ssize_t cpu_list_store(struct device *dev,
> >> +		struct device_attribute *attr, const char *buf, size_t count)
> >> +{
> >> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> >> +	struct bdi_writeback *wb = &bdi->wb;
> >> +	cpumask_var_t newmask;
> >> +	ssize_t ret;
> >> +	struct task_struct *task;
> >> +
> >> +	if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
> >> +		return -ENOMEM;
> >> +
> >> +	ret = cpulist_parse(buf, newmask);
> >> +	if (!ret) {
> >> +		spin_lock(&bdi->wb_lock);
> >> +		task = wb->task;
> >> +		if (task)
> >> +			get_task_struct(task);
> >> +		spin_unlock(&bdi->wb_lock);
> >> +		if (task) {
> >> +			ret = set_cpus_allowed_ptr(task, newmask);
> >> +			put_task_struct(task);
> >> +		}
> >
> > Why is this set here outside the bdi->flusher_cpumask_mutex?
> 
> The cpumask mutex protects updates to bdi->flusher_cpumask; it has
> nothing to do with the call to set_cpus_allowed.  We are protected from
> concurrent calls to cpu_list_store by the sysfs mutex that is taken on
> entry.  I understand that this is non-obvious, and it wouldn't be wrong
> to hold the mutex here.  If you'd like me to do that for clarity, that
> would be ok with me.

At minimum it needs a comment like this otherwise someone is going
to come along and ask "why is that safe?" like I just did. I'd
prefer the code to be obviously consistent to avoid the need for
commenting about the special case, especially when the obviously
correct code is simpler ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04 20:27     ` Jens Axboe
@ 2012-12-04 22:26       ` Jeff Moyer
  2012-12-05  7:43         ` Jens Axboe
  2012-12-06 18:01         ` Tejun Heo
  0 siblings, 2 replies; 15+ messages in thread
From: Jeff Moyer @ 2012-12-04 22:26 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-mm, Zach Brown, tj, Peter Zijlstra, Ingo

Jens Axboe <jaxboe@fusionio.com> writes:

>>>> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>>>>  				spin_lock_bh(&bdi->wb_lock);
>>>>  				bdi->wb.task = task;
>>>>  				spin_unlock_bh(&bdi->wb_lock);
>>>> +				mutex_lock(&bdi->flusher_cpumask_mutex);
>>>> +				ret = set_cpus_allowed_ptr(task,
>>>> +							bdi->flusher_cpumask);
>>>> +				mutex_unlock(&bdi->flusher_cpumask_mutex);
>>>
>>> It'd be very useful if we had a kthread_create_on_cpumask() instead
>>> of a _node() variant, since the latter could easily be implemented on
>>> top of the former. But not really a show stopper for the patch...
>> 
>> Hmm, if it isn't too scary, I might give this a try.
>
> Should not be, pretty much just removing the node part of the create
> struct passed in and making it a cpumask. And for the on_node() case,
> cpumask_of_node() will do the trick.

I think it's a bit more involved than that.  If you look at
kthread_create_on_node, the node portion only applies to where the
memory comes from; it says nothing of scheduling.  To wit:

                /*                                                              
                 * root may have changed our (kthreadd's) priority or CPU mask.
                 * The kernel thread should not inherit these properties.       
                 */
                sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
                set_cpus_allowed_ptr(create.result, cpu_all_mask);

So, if I were to make the change you suggested, I would be modifying the
existing behaviour.  The way things stand, I think
kthread_create_on_node violates the principle of least surprise.  ;-)  I
would prefer a variant that affected scheduling behaviour as well as
memory placement.  Tejun, Peter, Ingo, what are your opinions?
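
Something along these lines is what I have in mind (untested sketch,
name hypothetical, error handling elided):

        static struct task_struct *
        kthread_create_on_cpumask(int (*threadfn)(void *data), void *data,
                                  const struct cpumask *mask,
                                  const char namefmt[])
        {
                /* allocate the thread's stack near the cpus it will run on */
                int node = cpu_to_node(cpumask_first(mask));
                struct task_struct *tsk;

                tsk = kthread_create_on_node(threadfn, data, node, "%s",
                                             namefmt);
                /* bind to @mask instead of resetting to cpu_all_mask */
                if (!IS_ERR(tsk))
                        set_cpus_allowed_ptr(tsk, mask);
                return tsk;
        }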

Cheers,
Jeff


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04 22:26       ` Jeff Moyer
@ 2012-12-05  7:43         ` Jens Axboe
  2012-12-06 18:01         ` Tejun Heo
  1 sibling, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2012-12-05  7:43 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-mm, Zach Brown, tj, Peter Zijlstra, Ingo

On 2012-12-04 23:26, Jeff Moyer wrote:
> Jens Axboe <jaxboe@fusionio.com> writes:
> 
>>>>> @@ -437,6 +488,14 @@ static int bdi_forker_thread(void *ptr)
>>>>>  				spin_lock_bh(&bdi->wb_lock);
>>>>>  				bdi->wb.task = task;
>>>>>  				spin_unlock_bh(&bdi->wb_lock);
>>>>> +				mutex_lock(&bdi->flusher_cpumask_mutex);
>>>>> +				ret = set_cpus_allowed_ptr(task,
>>>>> +							bdi->flusher_cpumask);
>>>>> +				mutex_unlock(&bdi->flusher_cpumask_mutex);
>>>>
>>>> It'd be very useful if we had a kthread_create_on_cpumask() instead
>>>> of a _node() variant, since the latter could easily be implemented on
>>>> top of the former. But not really a show stopper for the patch...
>>>
>>> Hmm, if it isn't too scary, I might give this a try.
>>
>> Should not be, pretty much just removing the node part of the create
>> struct passed in and making it a cpumask. And for the on_node() case,
>> cpumask_of_node() will do the trick.
> 
> I think it's a bit more involved than that.  If you look at
> kthread_create_on_node, the node portion only applies to where the
> memory comes from, it says nothing of scheduling.  To whit:
> 
>                 /*                                                              
>                  * root may have changed our (kthreadd's) priority or CPU mask.
>                  * The kernel thread should not inherit these properties.       
>                  */
>                 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
>                 set_cpus_allowed_ptr(create.result, cpu_all_mask);
> 
> So, if I were to make the change you suggested, I would be modifying the
> existing behaviour.  The way things stand, I think
> kthread_create_on_node violates the principal of least surprise.  ;-)  I
> would prefer a variant that affected scheduling behaviour as well as
> memory placement.  Tejun, Peter, Ingo, what are your opinions?

Huh, you are right, I completely missed that kthread_create_on_node()
sets the affinity to cpu_all_mask and not cpumask_of_node(node).
Doesn't make a lot of sense to
me... And yes, in any case, it definitely is a bad API, not very
logical.

-- 
Jens Axboe



* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-04 22:26       ` Jeff Moyer
  2012-12-05  7:43         ` Jens Axboe
@ 2012-12-06 18:01         ` Tejun Heo
  2012-12-06 18:08           ` Jeff Moyer
  2012-12-06 18:19           ` Jens Axboe
  1 sibling, 2 replies; 15+ messages in thread
From: Tejun Heo @ 2012-12-06 18:01 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown, Peter Zijlstra, Ingo

Hello,

On Tue, Dec 04, 2012 at 05:26:26PM -0500, Jeff Moyer wrote:
> I think it's a bit more involved than that.  If you look at
> kthread_create_on_node, the node portion only applies to where the
> memory comes from; it says nothing of scheduling.  To wit:
> 
>                 /*                                                              
>                  * root may have changed our (kthreadd's) priority or CPU mask.
>                  * The kernel thread should not inherit these properties.       
>                  */
>                 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
>                 set_cpus_allowed_ptr(create.result, cpu_all_mask);
> 
> So, if I were to make the change you suggested, I would be modifying the
> existing behaviour.  The way things stand, I think
> kthread_create_on_node violates the principle of least surprise.  ;-)  I
> would prefer a variant that affected scheduling behaviour as well as
> memory placement.  Tejun, Peter, Ingo, what are your opinions?

Hmmm... cpu binding usually is done by kthread_bind() or explicit
set_cpus_allowed_ptr() by the kthread itself.  The node part of the
API was added later because there was no way to control where the
stack is allocated and we often ended up with kthreads which are bound
to a CPU with stack on a remote node.

I don't know.  @node usually controls memory allocation and it could
be surprising for it to control cpu binding, especially because most
kthreads which are bound to CPU[s] require explicit affinity
management as CPUs go up and down.  I don't know.  Maybe I'm just too
used to the existing interface.

As for the original patch, I think it's a bit too much to expose to
userland.  It's probably a good idea to bind the flusher to the local
node but do we really need to expose an interface to let userland
control the affinity directly?  Do we actually have a use case at
hand?

Thanks.

-- 
tejun


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-06 18:01         ` Tejun Heo
@ 2012-12-06 18:08           ` Jeff Moyer
  2012-12-06 18:13             ` Tejun Heo
  2012-12-06 18:19           ` Jens Axboe
  1 sibling, 1 reply; 15+ messages in thread
From: Jeff Moyer @ 2012-12-06 18:08 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown, Peter Zijlstra, Ingo

Tejun Heo <tj@kernel.org> writes:

> Hmmm... cpu binding usually is done by kthread_bind() or explicit
> set_cpus_allowed_ptr() by the kthread itself.  The node part of the
> API was added later because there was no way to control where the
> stack is allocated and we often ended up with kthreads which are bound
> to a CPU with stack on a remote node.
>
> I don't know.  @node usually controls memory allocation and it could
> be surprising for it to control cpu binding, especially because most
> kthreads which are bound to CPU[s] require explicit affinity
> management as CPUs go up and down.  I don't know.  Maybe I'm just too
> used to the existing interface.

OK, I can understand this line of reasoning.

> As for the original patch, I think it's a bit too much to expose to
> userland.  It's probably a good idea to bind the flusher to the local
> node but do we really need to expose an interface to let userland
> control the affinity directly?  Do we actually have a use case at
> hand?

Yeah, folks pinning realtime processes to a particular cpu don't want
the flusher threads interfering with their latency.  I don't have any
performance numbers on hand to convince you of the benefit, though.

Cheers,
Jeff


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-06 18:08           ` Jeff Moyer
@ 2012-12-06 18:13             ` Tejun Heo
  0 siblings, 0 replies; 15+ messages in thread
From: Tejun Heo @ 2012-12-06 18:13 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown, Peter Zijlstra, Ingo

Hello,

On Thu, Dec 06, 2012 at 01:08:18PM -0500, Jeff Moyer wrote:
> > As for the original patch, I think it's a bit too much to expose to
> > userland.  It's probably a good idea to bind the flusher to the local
> > node but do we really need to expose an interface to let userland
> > control the affinity directly?  Do we actually have a use case at
> > hand?
> 
> Yeah, folks pinning realtime processes to a particular cpu don't want
> the flusher threads interfering with their latency.  I don't have any
> performance numbers on hand to convince you of the benefit, though.

What I don't get is, RT tasks win over bdi flushers every time, and I'm
skeptical that allowing or disallowing the bdi flusher on a particular
CPU would make a big difference on non-RT kernels anyway.  If the use
case calls for
stricter isolation, there's isolcpus.  While I can see why someone
might think that they need something like this, I'm not sure it's
actually something necessary.
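
(e.g. booting with isolcpus=2-3 takes those cpus out of the general
scheduler domains entirely, which is a far stronger guarantee than
steering one kthread away from them)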

And, even if it's actually something necessary, I think we'll probably
be better off with adding a mechanism to notify userland of new
kthreads and let userland adjust affinity using the usual mechanism
rather than adding dedicated knobs for each kthread user.

Thanks.

-- 
tejun


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-06 18:01         ` Tejun Heo
  2012-12-06 18:08           ` Jeff Moyer
@ 2012-12-06 18:19           ` Jens Axboe
  2012-12-06 18:22             ` Tejun Heo
  1 sibling, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2012-12-06 18:19 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jeff Moyer, linux-kernel, linux-mm, Zach Brown, Peter Zijlstra, Ingo

On 2012-12-06 19:01, Tejun Heo wrote:
> As for the original patch, I think it's a bit too much to expose to
> userland.  It's probably a good idea to bind the flusher to the local
> node but do we really need to expose an interface to let userland
> control the affinity directly?  Do we actually have a use case at
> hand?

We need to expose it. Once the binding is set from the kernel side on a
kernel thread, it can't be modified.

Binding either for performance reasons or for ensuring that we
explicitly don't run in some places is a very useful feature.

-- 
Jens Axboe



* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-06 18:19           ` Jens Axboe
@ 2012-12-06 18:22             ` Tejun Heo
  2012-12-06 18:33               ` Jeff Moyer
  0 siblings, 1 reply; 15+ messages in thread
From: Tejun Heo @ 2012-12-06 18:22 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jeff Moyer, linux-kernel, linux-mm, Zach Brown, Peter Zijlstra, Ingo

Hello, Jens.

On Thu, Dec 06, 2012 at 07:19:34PM +0100, Jens Axboe wrote:
> We need to expose it. Once the binding is set from the kernel side on a
> kernel thread, it can't be modified.

That's only if kthread_bind() is used.  Calling set_cpus_allowed_ptr()
doesn't set PF_THREAD_BOUND and userland can adjust affinity like any
other tasks.
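
Roughly (simplified, for illustration only):

        kthread_bind(tsk, cpu);           /* PF_THREAD_BOUND is set; a later
                                           * sched_setaffinity() from userland
                                           * fails with -EINVAL */
        set_cpus_allowed_ptr(tsk, mask);  /* plain affinity; taskset and
                                           * sched_setaffinity() still work */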

> Binding either for performance reasons or for ensuring that we
> explicitly don't run in some places is a very useful feature.

Sure, but I think this is too specific.  Something more generic would
be much better.  It can be as simple as generating a uevent.
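
Hand-wavy sketch of what I mean (hypothetical, nothing like this exists
today):

        /* on flusher creation, tell userland so that udev/cpuset policy
         * can apply the usual affinity tools to the new thread */
        char *envp[] = { "WRITEBACK_THREAD=spawned", NULL };

        kobject_uevent_env(&bdi->dev->kobj, KOBJ_CHANGE, envp);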

Thanks.

-- 
tejun


* Re: [patch,v2] bdi: add a user-tunable cpu_list for the bdi flusher threads
  2012-12-06 18:22             ` Tejun Heo
@ 2012-12-06 18:33               ` Jeff Moyer
  0 siblings, 0 replies; 15+ messages in thread
From: Jeff Moyer @ 2012-12-06 18:33 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jens Axboe, linux-kernel, linux-mm, Zach Brown, Peter Zijlstra, Ingo

Tejun Heo <tj@kernel.org> writes:

> Hello, Jens.
>
> On Thu, Dec 06, 2012 at 07:19:34PM +0100, Jens Axboe wrote:
>> We need to expose it. Once the binding is set from the kernel side on a
>> kernel thread, it can't be modified.
>
> That's only if kthread_bind() is used.  Calling set_cpus_allowed_ptr()
> doesn't set PF_THREAD_BOUND and userland can adjust affinity like any
> other tasks.
>
>> Binding either for performance reasons or for ensuring that we
>> explicitly don't run in some places is a very useful feature.
>
> Sure, but I think this is too specific.  Something more generic would
> be much better.  It can be as simple as generating a uevent.

I'm in favor of a more general approach.  For now, I'm still going to
push a patch that at least binds to the proper numa node.  I'll post
that after I've tested it.

Cheers,
Jeff

