* [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
@ 2018-06-14 13:38 Hannes Reinecke
  2018-06-14 14:47 ` Jens Axboe
  2018-06-15 14:07 ` Bart Van Assche
  0 siblings, 2 replies; 18+ messages in thread
From: Hannes Reinecke @ 2018-06-14 13:38 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Mel Gorman, Hannes Reinecke, Hannes Reinecke

For performance reasons we should be able to allocate all memory
from a given NUMA node, so this patch adds a new parameter
'rd_numa_node' to allow the user to specify the NUMA node id.
When restricting fio to use the same NUMA node I'm seeing a performance
boost of more than 200%.

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/block/brd.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bb976598ee43..7142d836539e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -36,6 +36,7 @@
  */
 struct brd_device {
 	int		brd_number;
+	int		brd_numa_node;
 
 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
@@ -103,7 +104,7 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	 * restriction might be able to be lifted.
 	 */
 	gfp_flags = GFP_NOIO | __GFP_ZERO;
-	page = alloc_page(gfp_flags);
+	page = alloc_pages_node(brd->brd_numa_node, gfp_flags, 0);
 	if (!page)
 		return NULL;
 
@@ -342,6 +343,10 @@ static int max_part = 1;
 module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
 
+static int rd_numa_node = NUMA_NO_NODE;
+module_param(rd_numa_node, int, 0444);
+MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM disk on.");
+
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 MODULE_ALIAS("rd");
@@ -363,7 +368,7 @@ __setup("ramdisk_size=", ramdisk_size);
 static LIST_HEAD(brd_devices);
 static DEFINE_MUTEX(brd_devices_mutex);
 
-static struct brd_device *brd_alloc(int i)
+static struct brd_device *brd_alloc(int i, int node)
 {
 	struct brd_device *brd;
 	struct gendisk *disk;
@@ -372,10 +377,11 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd)
 		goto out;
 	brd->brd_number		= i;
+	brd->brd_numa_node = node;
 	spin_lock_init(&brd->brd_lock);
 	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
 
-	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+	brd->brd_queue = blk_alloc_queue_node(GFP_KERNEL, node, NULL);
 	if (!brd->brd_queue)
 		goto out_free_dev;
 
@@ -434,7 +440,7 @@ static struct brd_device *brd_init_one(int i, bool *new)
 			goto out;
 	}
 
-	brd = brd_alloc(i);
+	brd = brd_alloc(i, rd_numa_node);
 	if (brd) {
 		add_disk(brd->brd_disk);
 		list_add_tail(&brd->brd_list, &brd_devices);
@@ -495,7 +501,7 @@ static int __init brd_init(void)
 		max_part = 1;
 
 	for (i = 0; i < rd_nr; i++) {
-		brd = brd_alloc(i);
+		brd = brd_alloc(i, rd_numa_node);
 		if (!brd)
 			goto out_free;
 		list_add_tail(&brd->brd_list, &brd_devices);
-- 
2.12.3
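
A minimal usage sketch (hypothetical values; assumes this patch is applied
and that fio is pinned to the same node, as in the commit message):

  # one 1 GiB ramdisk whose backing pages come from NUMA node 1
  modprobe brd rd_nr=1 rd_size=1048576 rd_numa_node=1
  # keep the I/O-issuing tasks and their memory on that node as well
  numactl --cpunodebind=1 --membind=1 \
      fio --name=ramtest --filename=/dev/ram0 --rw=randread \
          --bs=4k --direct=1 --runtime=30 --time_based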


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 13:38 [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node Hannes Reinecke
@ 2018-06-14 14:47 ` Jens Axboe
  2018-06-14 15:29   ` Hannes Reinecke
  2018-06-15 14:07 ` Bart Van Assche
  1 sibling, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2018-06-14 14:47 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 7:38 AM, Hannes Reinecke wrote:
> For performance reasons we should be able to allocate all memory
> from a given NUMA node, so this patch adds a new parameter
> 'rd_numa_node' to allow the user to specify the NUMA node id.
> When restricting fio to use the same NUMA node I'm seeing a performance
> boost of more than 200%.

Looks fine to me. One comment.

> @@ -342,6 +343,10 @@ static int max_part = 1;
>  module_param(max_part, int, 0444);
>  MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
>  
> +static int rd_numa_node = NUMA_NO_NODE;
> +module_param(rd_numa_node, int, 0444);
> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM disk on.");

This could feasibly be 0644, as there would be nothing wrong with altering
this at runtime.

-- 
Jens Axboe
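
If the mode were relaxed to 0644 as suggested, the node could also be
changed at runtime through the module's sysfs parameter; a sketch
(hypothetical, and per the discussion below it would only affect pages
allocated after the change):

  echo 1 > /sys/module/brd/parameters/rd_numa_node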


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 14:47 ` Jens Axboe
@ 2018-06-14 15:29   ` Hannes Reinecke
  2018-06-14 15:33     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Hannes Reinecke @ 2018-06-14 15:29 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On Thu, 14 Jun 2018 08:47:33 -0600
Jens Axboe <axboe@kernel.dk> wrote:

> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
> > For performance reasons we should be able to allocate all memory
> > from a given NUMA node, so this patch adds a new parameter
> > 'rd_numa_node' to allow the user to specify the NUMA node id.
> > When restricting fio to use the same NUMA node I'm seeing a
> > performance boost of more than 200%.  
> 
> Looks fine to me. One comment.
> 
> > @@ -342,6 +343,10 @@ static int max_part = 1;
> >  module_param(max_part, int, 0444);
> >  MODULE_PARM_DESC(max_part, "Num Minors to reserve between
> > devices"); 
> > +static int rd_numa_node = NUMA_NO_NODE;
> > +module_param(rd_numa_node, int, 0444);
> > +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
> > disk on.");  
> 
> This could feasibly be 0644, as there would be nothing wrong with
> altering this at runtime.
> 

While we could, it would not change the allocation of _existing_ ram
devices, making the behaviour rather unpredictable.
Hence I decided against it (and yes, I actually thought about it).

But if you insist ...

Cheers,

Hannes


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 15:29   ` Hannes Reinecke
@ 2018-06-14 15:33     ` Jens Axboe
  2018-06-14 16:09       ` Hannes Reinecke
  2018-06-15  7:30       ` Christoph Hellwig
  0 siblings, 2 replies; 18+ messages in thread
From: Jens Axboe @ 2018-06-14 15:33 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 9:29 AM, Hannes Reinecke wrote:
> On Thu, 14 Jun 2018 08:47:33 -0600
> Jens Axboe <axboe@kernel.dk> wrote:
> 
>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>> For performance reasons we should be able to allocate all memory
>>> from a given NUMA node, so this patch adds a new parameter
>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>> When restricting fio to use the same NUMA node I'm seeing a
>>> performance boost of more than 200%.  
>>
>> Looks fine to me. One comment.
>>
>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>  module_param(max_part, int, 0444);
>>>  MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>> devices"); 
>>> +static int rd_numa_node = NUMA_NO_NODE;
>>> +module_param(rd_numa_node, int, 0444);
>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>> disk on.");  
>>
>> This could feasibly be 0644, as there would be nothing wrong with
>> altering this at runtime.
>>
> 
> While we could, it would not change the allocation of _existing_ ram
> devices, making the behaviour rather unpredictable.
> Hence I decided against it (and yes, I actually thought about it).
> 
> But if you insist ...

Right, it would just change new allocations. Probably not a common use
case, but there's really nothing that prevents it from being feasible.

Next question - what does the memory allocator do if we run out of
memory on the given node? Should we punt to a different node if that
happens? Slower, but functional, seems preferable to not being able
to get memory.

-- 
Jens Axboe


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 15:33     ` Jens Axboe
@ 2018-06-14 16:09       ` Hannes Reinecke
  2018-06-14 20:32         ` Adam Manzanares
  2018-06-15  7:30       ` Christoph Hellwig
  1 sibling, 1 reply; 18+ messages in thread
From: Hannes Reinecke @ 2018-06-14 16:09 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On Thu, 14 Jun 2018 09:33:35 -0600
Jens Axboe <axboe@kernel.dk> wrote:

> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
> > On Thu, 14 Jun 2018 08:47:33 -0600
> > Jens Axboe <axboe@kernel.dk> wrote:
> >   
> >> On 6/14/18 7:38 AM, Hannes Reinecke wrote:  
> >>> For performance reasons we should be able to allocate all memory
> >>> from a given NUMA node, so this patch adds a new parameter
> >>> 'rd_numa_node' to allow the user to specify the NUMA node id.
> >>> When restricting fio to use the same NUMA node I'm seeing a
> >>> performance boost of more than 200%.    
> >>
> >> Looks fine to me. One comment.
> >>  
> >>> @@ -342,6 +343,10 @@ static int max_part = 1;
> >>>  module_param(max_part, int, 0444);
> >>>  MODULE_PARM_DESC(max_part, "Num Minors to reserve between
> >>> devices"); 
> >>> +static int rd_numa_node = NUMA_NO_NODE;
> >>> +module_param(rd_numa_node, int, 0444);
> >>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
> >>> disk on.");    
> >>
> >> This could feasibly be 0644, as there would be nothing wrong with
> >> altering this at runtime.
> >>  
> > 
> > While we could, it would not change the allocation of _existing_ ram
> > devices, making the behaviour rather unpredictable.
> > Hence I decided against it (and yes, I actually thought about
> > it).
> > 
> > But if you insist ...  
> 
> Right, it would just change new allocations. Probably not a common use
> case, but there's really nothing that prevents it from being feasible.
> 
> Next question - what does the memory allocator do if we run out of
> memory on the given node? Should we punt to a different node if that
> happens? Slower, but functional, seems preferable to not being able
> to get memory.
> 

Hmm. That I haven't considered; yes, that really sounds like an idea.
Will be sending an updated patch.

Cheers,

Hannes


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 16:09       ` Hannes Reinecke
@ 2018-06-14 20:32         ` Adam Manzanares
  2018-06-14 20:37           ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Adam Manzanares @ 2018-06-14 20:32 UTC (permalink / raw)
  To: Hannes Reinecke, Jens Axboe; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 9:09 AM, Hannes Reinecke wrote:
[ .. ]
>> Next question - what does the memory allocator do if we run out of
>> memory on the given node? Should we punt to a different node if that
>> happens? Slower, but functional, seems preferable to not being able
>> to get memory.
>>
> 
> Hmm. That I haven't considered; yes, that really sounds like an idea.
> Will be sending an updated patch.

Will numactl ... modprobe brd ... solve this problem?


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 20:32         ` Adam Manzanares
@ 2018-06-14 20:37           ` Jens Axboe
  2018-06-14 20:41             ` Adam Manzanares
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2018-06-14 20:37 UTC (permalink / raw)
  To: Adam Manzanares, Hannes Reinecke; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 2:32 PM, Adam Manzanares wrote:
> 
> 
> On 6/14/18 9:09 AM, Hannes Reinecke wrote:
>> On Thu, 14 Jun 2018 09:33:35 -0600
>> Jens Axboe <axboe@kernel.dk> wrote:
>>
>>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>    
>>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>>> For performance reasons we should be able to allocate all memory
>>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>>> When restricting fio to use the same NUMA node I'm seeing a
>>>>>> performance boost of more than 200%.
>>>>>
>>>>> Looks fine to me. One comment.
>>>>>   
>>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>>   module_param(max_part, int, 0444);
>>>>>>   MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>>> devices");
>>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>>> +module_param(rd_numa_node, int, 0444);
>>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>>> disk on.");
>>>>>
>>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>>> altering this at runtime.
>>>>>   
>>>>
>>>> While we could, it would not change the allocation of _existing_ ram
>>>> devices, making the behaviour rather unpredictable.
>>>> Hence I decided against it (and yes, I actually thought about
>>>> it).
>>>>
>>>> But if you insist ...
>>>
>>> Right, it would just change new allocations. Probably not a common use
>>> case, but there's really nothing that prevents it from being feasible.
>>>
>>> Next question - what does the memory allocator do if we run out of
>>> memory on the given node? Should we punt to a different node if that
>>> happens? Slower, but functional, seems preferable to not being able
>>> to get memory.
>>>
>>
>> Hmm. That I haven't considered; yes, that really sounds like an idea.
>> Will be sending an updated patch.
> 
> Will numactl ... modprobe brd ... solve this problem?

It won't, pages are allocated as needed.

-- 
Jens Axboe


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 20:37           ` Jens Axboe
@ 2018-06-14 20:41             ` Adam Manzanares
  2018-06-14 20:47               ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Adam Manzanares @ 2018-06-14 20:41 UTC (permalink / raw)
  To: Jens Axboe, Hannes Reinecke; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 1:37 PM, Jens Axboe wrote:
> On 6/14/18 2:32 PM, Adam Manzanares wrote:
[ .. ]
>> Will numactl ... modprobe brd ... solve this problem?
> 
> It won't, pages are allocated as needed.

Then how about a numactl ... dd /dev/ram ... after the modprobe.


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 20:41             ` Adam Manzanares
@ 2018-06-14 20:47               ` Jens Axboe
  2018-06-14 20:53                 ` Adam Manzanares
  2018-06-15  9:23                 ` Mel Gorman
  0 siblings, 2 replies; 18+ messages in thread
From: Jens Axboe @ 2018-06-14 20:47 UTC (permalink / raw)
  To: Adam Manzanares, Hannes Reinecke; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 2:41 PM, Adam Manzanares wrote:
> 
> 
> On 6/14/18 1:37 PM, Jens Axboe wrote:
>> On 6/14/18 2:32 PM, Adam Manzanares wrote:
>>>
>>>
>>> On 6/14/18 9:09 AM, Hannes Reinecke wrote:
>>>> On Thu, 14 Jun 2018 09:33:35 -0600
>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>>>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>     
>>>>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>>>>> For performance reasons we should be able to allocate all memory
>>>>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>>>>> When restricting fio to use the same NUMA node I'm seeing a
>>>>>>>> performance boost of more than 200%.
>>>>>>>
>>>>>>> Looks fine to me. One comment.
>>>>>>>    
>>>>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>>>>    module_param(max_part, int, 0444);
>>>>>>>>    MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>>>>> devices");
>>>>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>>>>> +module_param(rd_numa_node, int, 0444);
>>>>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>>>>> disk on.");
>>>>>>>
>>>>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>>>>> altering this at runtime.
>>>>>>>    
>>>>>>
>>>>>> While we could, it would not change the allocation of _existing_ ram
>>>>>> devices, making the behaviour rather unpredictable.
>>>>>> Hence I decided against it (and yes, I actually thought about
>>>>>> it).
>>>>>>
>>>>>> But if you insist ...
>>>>>
>>>>> Right, it would just change new allocations. Probably not a common use
>>>>> case, but there's really nothing that prevents it from being feasible.
>>>>>
>>>>> Next question - what does the memory allocator do if we run out of
>>>>> memory on the given node? Should we punt to a different node if that
>>>>> happens? Slower, but functional, seems preferable to not being able
>>>>> to get memory.
>>>>>
>>>>
>>>> Hmm. That I haven't considered; yes, that really sounds like an idea.
>>>> Will be sending an updated patch.
>>>
>>> Will numactl ... modprobe brd ... solve this problem?
>>
>> It won't, pages are allocated as needed.
>>
> 
> Then how about a numactl ... dd /dev/ram ... after the modprobe.

Yes of course, or you could do that for every application that ends
up in the path of doing IO to it. The point of the option is to
just make it explicit, and not have to either NUMA pin each task,
or prefill all possible pages.

-- 
Jens Axboe


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 20:47               ` Jens Axboe
@ 2018-06-14 20:53                 ` Adam Manzanares
  2018-06-15  6:06                   ` Hannes Reinecke
  2018-06-15  9:23                 ` Mel Gorman
  1 sibling, 1 reply; 18+ messages in thread
From: Adam Manzanares @ 2018-06-14 20:53 UTC (permalink / raw)
  To: Jens Axboe, Hannes Reinecke; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 6/14/18 1:47 PM, Jens Axboe wrote:
> On 6/14/18 2:41 PM, Adam Manzanares wrote:
[ .. ]
>> Then how about a numactl ... dd /dev/ram ... after the modprobe.
> 
> Yes of course, or you could do that for every application that ends
> up in the path of doing IO to it. The point of the option is to
> just make it explicit, and not have to either NUMA pin each task,
> or prefill all possible pages.

Makes sense. I have done some similar benchmarking and had to worry
about NUMA awareness, and the numactl + dd approach seemed to work
because I did not want to take a performance hit for page allocation
during the benchmarking.

Would anyone be interested in forcing the allocations to occur during
module initialization?


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 20:53                 ` Adam Manzanares
@ 2018-06-15  6:06                   ` Hannes Reinecke
  0 siblings, 0 replies; 18+ messages in thread
From: Hannes Reinecke @ 2018-06-15  6:06 UTC (permalink / raw)
  To: Adam Manzanares, Jens Axboe; +Cc: linux-block, Mel Gorman, Hannes Reinecke

On 06/14/2018 10:53 PM, Adam Manzanares wrote:
[ .. ]
>>>
>>> Then how about a numactl ... dd /dev/ram ... after the modprobe.
>>
>> Yes of course, or you could do that for every application that ends
>> up in the path of doing IO to it. The point of the option is to
>> just make it explicit, and not have to either NUMA pin each task,
>> or prefill all possible pages.
>>
> 
> Makes sense. I have done some similar benchmarking and had to worry
> about NUMA awareness, and the numactl + dd approach seemed to work
> because I did not want to take a performance hit for page allocation
> during the benchmarking.
> 
> Would anyone be interested in forcing the allocations to occur during
> module initialization?
> 
YES.

Cheers,

Hannes
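
A rough sketch of what such up-front allocation could look like, reusing
brd_insert_page() from the driver (hypothetical helper, never part of this
series; error unwinding omitted):

  /* hypothetical: touch every sector once so all pages exist at init */
  static int brd_prealloc_pages(struct brd_device *brd, unsigned long size_kb)
  {
          sector_t sector, nr_sects = size_kb << 1;  /* 512-byte sectors */

          for (sector = 0; sector < nr_sects; sector += PAGE_SIZE >> 9)
                  if (!brd_insert_page(brd, sector))
                          return -ENOMEM;
          return 0;
  }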


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 15:33     ` Jens Axboe
  2018-06-14 16:09       ` Hannes Reinecke
@ 2018-06-15  7:30       ` Christoph Hellwig
  2018-06-15 14:12         ` Jens Axboe
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2018-06-15  7:30 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Hannes Reinecke, linux-block, Mel Gorman, Hannes Reinecke

On Thu, Jun 14, 2018 at 09:33:35AM -0600, Jens Axboe wrote:
> Next question - what does the memory allocator do if we run out of
> memory on the given node? Should we punt to a different node if that
> happens? Slower, but functional, seems preferable to not being able
> to get memory.

When using alloc_pages_node the passed-in node id is just a hint; the
allocator will use all available memory if needed.
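
In other words, the gfp mask decides how strict the node binding is; a
sketch of the distinction (illustrative only, not part of the patch):

  /* hint: prefer 'node', fall back to other nodes when it is exhausted */
  page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, 0);

  /* strict: return NULL rather than allocate on another node */
  page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO | __GFP_THISNODE, 0);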


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 20:47               ` Jens Axboe
  2018-06-14 20:53                 ` Adam Manzanares
@ 2018-06-15  9:23                 ` Mel Gorman
  2018-06-15 14:28                   ` Jens Axboe
  1 sibling, 1 reply; 18+ messages in thread
From: Mel Gorman @ 2018-06-15  9:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Adam Manzanares, Hannes Reinecke, linux-block, Hannes Reinecke

On Thu, Jun 14, 2018 at 02:47:39PM -0600, Jens Axboe wrote:
> >>> Will numactl ... modprobe brd ... solve this problem?
> >>
> >> It won't, pages are allocated as needed.
> >>
> > 
> > Then how about a numactl ... dd /dev/ram ... after the modprobe.
> 
> Yes of course, or you could do that for every application that ends
> >> up in the path of doing IO to it. The point of the option is to
> just make it explicit, and not have to either NUMA pin each task,
> or prefill all possible pages.
> 

It's certainly possible from userspace using dd and numactl to set the
desired memory policy. mmtests has the following snippet when setting
up a benchmark using brd to deal with both NUMA artifacts and variable
performance due to first faults early in the lifetime of a benchmark.

                modprobe brd rd_size=$((TESTDISK_RD_SIZE/1024))
                if [ "$TESTDISK_RD_PREALLOC" == "yes" ]; then
                        if [ "$TESTDISK_RD_PREALLOC_NODE" != "" ]; then
                                tmp_prealloc_cmd="numactl -N $TESTDISK_RD_PREALLOC_NODE"
                        else
                                tmp_prealloc_cmd="numactl -i all"
                        fi
                        $tmp_prealloc_cmd dd if=/dev/zero of=/dev/ram0 bs=1M &>/dev/null
                fi

(Haven't actually validated this in a long time but it worked at some point)

First option allocates just from one node, the other interleaves between
everything. Any combination of nodes or policies can be used and this was
very simple, but it's what was needed at the time. The question is how
far do you want to go with supporting policies within the module?

One option would be to keep this very simple like the patch suggests so users
get the hint that it's even worth considering and then point at a document
on how to do more complex policies from userspace at device creation time.
Another is simply to document the hazard that the locality of memory is
controlled by the memory policy of the first task that touches it.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-14 13:38 [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node Hannes Reinecke
  2018-06-14 14:47 ` Jens Axboe
@ 2018-06-15 14:07 ` Bart Van Assche
  2018-06-15 16:55   ` Hannes Reinecke
  1 sibling, 1 reply; 18+ messages in thread
From: Bart Van Assche @ 2018-06-15 14:07 UTC (permalink / raw)
  To: hare, axboe; +Cc: mgorman, linux-block, hare

On Thu, 2018-06-14 at 15:38 +0200, Hannes Reinecke wrote:
> For performance reasons we should be able to allocate all memory
> from a given NUMA node, so this patch adds a new parameter
> 'rd_numa_node' to allow the user to specify the NUMA node id.
> When restricting fio to use the same NUMA node I'm seeing a performance
> boost of more than 200%.

Passing this information through a kernel module parameter to the brd kernel
module seems wrong to me. There can be multiple brd instances. Using a kernel
module parameter makes it impossible to specify a different NUMA node for
different brd instances.

Bart.


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-15  7:30       ` Christoph Hellwig
@ 2018-06-15 14:12         ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2018-06-15 14:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, linux-block, Mel Gorman, Hannes Reinecke

On 6/15/18 1:30 AM, Christoph Hellwig wrote:
> On Thu, Jun 14, 2018 at 09:33:35AM -0600, Jens Axboe wrote:
>> Next question - what does the memory allocator do if we run out of
>> memory on the given node? Should we punt to a different node if that
>> happens? Slower, but functional, seems preferable to not being able
>> to get memory.
> 
> When using alloc_pages_node the passed-in node id is just a hint; the
> allocator will use all available memory if needed.

OK good, that's not a concern then.

-- 
Jens Axboe


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-15  9:23                 ` Mel Gorman
@ 2018-06-15 14:28                   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2018-06-15 14:28 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Adam Manzanares, Hannes Reinecke, linux-block, Hannes Reinecke

On 6/15/18 3:23 AM, Mel Gorman wrote:
> On Thu, Jun 14, 2018 at 02:47:39PM -0600, Jens Axboe wrote:
>>>>> Will numactl ... modprobe brd ... solve this problem?
>>>>
>>>> It won't, pages are allocated as needed.
>>>>
>>>
>>> Then how about a numactl ... dd /dev/ram ... after the modprobe.
>>
>> Yes of course, or you could do that for every application that ends
>> up in the path of the doing IO to it. The point of the option is to
>> just make it explicit, and not have to either NUMA pin each task,
>> or prefill all possible pages.
>>
> 
> It's certainly possible from userspace using dd and numactl setting the
> desired memory policy. mmtests has the following snippet when setting
> up a benchmark using brd to deal with both NUMA artifacts and variable
> performance due to first faults early in the lifetime of a benchmark.
> 
>                 modprobe brd rd_size=$((TESTDISK_RD_SIZE/1024))
>                 if [ "$TESTDISK_RD_PREALLOC" == "yes" ]; then
>                         if [ "$TESTDISK_RD_PREALLOC_NODE" != "" ]; then
>                                 tmp_prealloc_cmd="numactl -N $TESTDISK_RD_PREALLOC_NODE"
>                         else
>                                 tmp_prealloc_cmd="numactl -i all"
>                         fi
>                         $tmp_prealloc_cmd dd if=/dev/zero of=/dev/ram0 bs=1M &>/dev/null
>                 fi
> 
> (Haven't actually validated this in a long time but it worked at some point)

You'd want to make this oflag=direct as well (this goes for Adam, too), or
you could end up with page writes that are NOT issued by dd itself.

> First option allocates just from one node, the other interleaves between
> everything. Any combination of nodes or policies can be used and this was
> very simple, but it's what was needed at the time. The question is how
> far do you want to go with supporting policies within the module?

Not far, imho :-)

> One option would be to keep this very simple like the patch suggests so users
> get the hint that it's even worth considering and then point at a document
> on how to do more complex policies from userspace at device creation time.
> Another is simply to document the hazard that the locality of memory is
> controlled by the memory policy of the first task that touches it.

I like the simple option, especially since (as Christoph pointed out) that
if we fail allocating from the given node, then we'll just go elsewhere.

-- 
Jens Axboe
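
With that fix applied, the prealloc line from the snippet above would read
(same variables as in mmtests):

  $tmp_prealloc_cmd dd if=/dev/zero of=/dev/ram0 bs=1M oflag=direct &>/dev/null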


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-15 14:07 ` Bart Van Assche
@ 2018-06-15 16:55   ` Hannes Reinecke
  2018-06-15 16:58     ` Bart Van Assche
  0 siblings, 1 reply; 18+ messages in thread
From: Hannes Reinecke @ 2018-06-15 16:55 UTC (permalink / raw)
  To: Bart Van Assche, axboe; +Cc: mgorman, linux-block, hare

On 06/15/2018 04:07 PM, Bart Van Assche wrote:
> On Thu, 2018-06-14 at 15:38 +0200, Hannes Reinecke wrote:
>> For performance reasons we should be able to allocate all memory
>> from a given NUMA node, so this patch adds a new parameter
>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>> When restricting fio to use the same NUMA node I'm seeing a performance
>> boost of more than 200%.
> 
> Passing this information through a kernel module parameter to the brd kernel
> module seems wrong to me. There can be multiple brd instances. Using a kernel
> module parameter makes it impossible to specify a different NUMA node for
> different brd instances.
> 
This patch was primarily done for simplicity; all the existing brd
parameters affect _all_ ramdisks, so this patch keeps that style.

If you want something more fine-grained you could use the approach
suggested by Mel Gorman and use 'numactl' to pre-fill the ramdisk via 'dd'.

Cheers,

Hannes


* Re: [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node
  2018-06-15 16:55   ` Hannes Reinecke
@ 2018-06-15 16:58     ` Bart Van Assche
  0 siblings, 0 replies; 18+ messages in thread
From: Bart Van Assche @ 2018-06-15 16:58 UTC (permalink / raw)
  To: hare, axboe; +Cc: mgorman, linux-block, hare

On Fri, 2018-06-15 at 18:55 +0200, Hannes Reinecke wrote:
> On 06/15/2018 04:07 PM, Bart Van Assche wrote:
[ .. ]
> This patch was primarily done for simplicity; all the existing brd
> parameters affect _all_ ramdisks, so this patch keeps that style.
> 
> If you want something more fine-grained you could use the approach
> suggested by Mel Gorman and use 'numactl' to pre-fill the ramdisk via 'dd'.

That's a cumbersome approach, illustrated by the fact that Mel forgot to use
direct writes in his examples. If Mel overlooked that, more people will
overlook direct writes too.

Bart.


end of thread

Thread overview: 18+ messages
2018-06-14 13:38 [PATCH] brd: Allow ramdisk to be allocated on selected NUMA node Hannes Reinecke
2018-06-14 14:47 ` Jens Axboe
2018-06-14 15:29   ` Hannes Reinecke
2018-06-14 15:33     ` Jens Axboe
2018-06-14 16:09       ` Hannes Reinecke
2018-06-14 20:32         ` Adam Manzanares
2018-06-14 20:37           ` Jens Axboe
2018-06-14 20:41             ` Adam Manzanares
2018-06-14 20:47               ` Jens Axboe
2018-06-14 20:53                 ` Adam Manzanares
2018-06-15  6:06                   ` Hannes Reinecke
2018-06-15  9:23                 ` Mel Gorman
2018-06-15 14:28                   ` Jens Axboe
2018-06-15  7:30       ` Christoph Hellwig
2018-06-15 14:12         ` Jens Axboe
2018-06-15 14:07 ` Bart Van Assche
2018-06-15 16:55   ` Hannes Reinecke
2018-06-15 16:58     ` Bart Van Assche
