* Re: bpf: use-after-free in array_map_alloc
@ 2016-05-23 21:35 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-23 21:35 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout
Hello,
Can you please test whether this patch resolves the issue? While
adding support for atomic allocations, I reduced alloc_mutex covered
region too much.
Thanks.
diff --git a/mm/percpu.c b/mm/percpu.c
index 0c59684..bd2df70 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;
static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map extension */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
@@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
+ lockdep_assert_held(&pcpu_alloc_mutex);
+
new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
+ if (!is_atomic)
+ mutex_lock(&pcpu_alloc_mutex);
+
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
@@ -967,12 +972,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
if (is_atomic)
goto fail;
- mutex_lock(&pcpu_alloc_mutex);
+ lockdep_assert_held(&pcpu_alloc_mutex);
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
if (!chunk) {
- mutex_unlock(&pcpu_alloc_mutex);
err = "failed to allocate new chunk";
goto fail;
}
@@ -983,7 +987,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
spin_lock_irqsave(&pcpu_lock, flags);
}
- mutex_unlock(&pcpu_alloc_mutex);
goto restart;
area_found:
@@ -993,8 +996,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
if (!is_atomic) {
int page_start, page_end, rs, re;
- mutex_lock(&pcpu_alloc_mutex);
-
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
@@ -1005,7 +1006,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- mutex_unlock(&pcpu_alloc_mutex);
pcpu_free_area(chunk, off, &occ_pages);
err = "failed to populate";
goto fail_unlock;
@@ -1045,6 +1045,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
/* see the flag handling in pcpu_blance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
}
return NULL;
}
@@ -1137,6 +1139,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
list_for_each_entry_safe(chunk, next, &to_free, list) {
int rs, re;
+ cancel_work_sync(&chunk->map_extend_work);
+
pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
2016-05-23 21:35 ` Tejun Heo
@ 2016-05-23 22:13 ` Alexei Starovoitov
-1 siblings, 0 replies; 31+ messages in thread
From: Alexei Starovoitov @ 2016-05-23 22:13 UTC (permalink / raw)
To: Tejun Heo
Cc: Vlastimil Babka, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi,
Daniel Borkmann
On Mon, May 23, 2016 at 05:35:01PM -0400, Tejun Heo wrote:
> Hello,
>
> Can you please test whether this patch resolves the issue? While
> adding support for atomic allocations, I reduced alloc_mutex covered
> region too much.
after the patch the use-after-free is no longer seen.
Tested-by: Alexei Starovoitov <ast@kernel.org>
>
> Thanks.
>
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 0c59684..bd2df70 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
> static int pcpu_reserved_chunk_limit;
>
> static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
> -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
> +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map extension */
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> @@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
> size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> unsigned long flags;
>
> + lockdep_assert_held(&pcpu_alloc_mutex);
> +
> new = pcpu_mem_zalloc(new_size);
> if (!new)
> return -ENOMEM;
> @@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> return NULL;
> }
>
> + if (!is_atomic)
> + mutex_lock(&pcpu_alloc_mutex);
> +
> spin_lock_irqsave(&pcpu_lock, flags);
>
> /* serve reserved allocations from the reserved chunk if available */
> @@ -967,12 +972,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (is_atomic)
> goto fail;
>
> - mutex_lock(&pcpu_alloc_mutex);
> + lockdep_assert_held(&pcpu_alloc_mutex);
>
> if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
> chunk = pcpu_create_chunk();
> if (!chunk) {
> - mutex_unlock(&pcpu_alloc_mutex);
> err = "failed to allocate new chunk";
> goto fail;
> }
> @@ -983,7 +987,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> spin_lock_irqsave(&pcpu_lock, flags);
> }
>
> - mutex_unlock(&pcpu_alloc_mutex);
> goto restart;
>
> area_found:
> @@ -993,8 +996,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (!is_atomic) {
> int page_start, page_end, rs, re;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> page_start = PFN_DOWN(off);
> page_end = PFN_UP(off + size);
>
> @@ -1005,7 +1006,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>
> spin_lock_irqsave(&pcpu_lock, flags);
> if (ret) {
> - mutex_unlock(&pcpu_alloc_mutex);
> pcpu_free_area(chunk, off, &occ_pages);
> err = "failed to populate";
> goto fail_unlock;
> @@ -1045,6 +1045,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> /* see the flag handling in pcpu_blance_workfn() */
> pcpu_atomic_alloc_failed = true;
> pcpu_schedule_balance_work();
> + } else {
> + mutex_unlock(&pcpu_alloc_mutex);
> }
> return NULL;
> }
> @@ -1137,6 +1139,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
> list_for_each_entry_safe(chunk, next, &to_free, list) {
> int rs, re;
>
> + cancel_work_sync(&chunk->map_extend_work);
> +
> pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
> pcpu_depopulate_chunk(chunk, rs, re);
> spin_lock_irq(&pcpu_lock);
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
@ 2016-05-23 22:13 ` Alexei Starovoitov
0 siblings, 0 replies; 31+ messages in thread
From: Alexei Starovoitov @ 2016-05-23 22:13 UTC (permalink / raw)
To: Tejun Heo
Cc: Vlastimil Babka, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi,
Daniel Borkmann
On Mon, May 23, 2016 at 05:35:01PM -0400, Tejun Heo wrote:
> Hello,
>
> Can you please test whether this patch resolves the issue? While
> adding support for atomic allocations, I reduced alloc_mutex covered
> region too much.
after the patch the use-after-free is no longer seen.
Tested-by: Alexei Starovoitov <ast@kernel.org>
>
> Thanks.
>
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 0c59684..bd2df70 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
> static int pcpu_reserved_chunk_limit;
>
> static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
> -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
> +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map extension */
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> @@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
> size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> unsigned long flags;
>
> + lockdep_assert_held(&pcpu_alloc_mutex);
> +
> new = pcpu_mem_zalloc(new_size);
> if (!new)
> return -ENOMEM;
> @@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> return NULL;
> }
>
> + if (!is_atomic)
> + mutex_lock(&pcpu_alloc_mutex);
> +
> spin_lock_irqsave(&pcpu_lock, flags);
>
> /* serve reserved allocations from the reserved chunk if available */
> @@ -967,12 +972,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (is_atomic)
> goto fail;
>
> - mutex_lock(&pcpu_alloc_mutex);
> + lockdep_assert_held(&pcpu_alloc_mutex);
>
> if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
> chunk = pcpu_create_chunk();
> if (!chunk) {
> - mutex_unlock(&pcpu_alloc_mutex);
> err = "failed to allocate new chunk";
> goto fail;
> }
> @@ -983,7 +987,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> spin_lock_irqsave(&pcpu_lock, flags);
> }
>
> - mutex_unlock(&pcpu_alloc_mutex);
> goto restart;
>
> area_found:
> @@ -993,8 +996,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (!is_atomic) {
> int page_start, page_end, rs, re;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> page_start = PFN_DOWN(off);
> page_end = PFN_UP(off + size);
>
> @@ -1005,7 +1006,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>
> spin_lock_irqsave(&pcpu_lock, flags);
> if (ret) {
> - mutex_unlock(&pcpu_alloc_mutex);
> pcpu_free_area(chunk, off, &occ_pages);
> err = "failed to populate";
> goto fail_unlock;
> @@ -1045,6 +1045,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> /* see the flag handling in pcpu_blance_workfn() */
> pcpu_atomic_alloc_failed = true;
> pcpu_schedule_balance_work();
> + } else {
> + mutex_unlock(&pcpu_alloc_mutex);
> }
> return NULL;
> }
> @@ -1137,6 +1139,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
> list_for_each_entry_safe(chunk, next, &to_free, list) {
> int rs, re;
>
> + cancel_work_sync(&chunk->map_extend_work);
> +
> pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
> pcpu_depopulate_chunk(chunk, rs, re);
> spin_lock_irq(&pcpu_lock);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
2016-05-23 21:35 ` Tejun Heo
@ 2016-05-24 8:40 ` Vlastimil Babka
-1 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-24 8:40 UTC (permalink / raw)
To: Tejun Heo
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, marco.gra
[+CC Marco who reported the CVE, forgot that earlier]
On 05/23/2016 11:35 PM, Tejun Heo wrote:
> Hello,
>
> Can you please test whether this patch resolves the issue? While
> adding support for atomic allocations, I reduced alloc_mutex covered
> region too much.
>
> Thanks.
Ugh, this makes the code even more head-spinning than it was.
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 0c59684..bd2df70 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
> static int pcpu_reserved_chunk_limit;
>
> static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
> -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
> +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map extension */
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> @@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
> size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> unsigned long flags;
>
> + lockdep_assert_held(&pcpu_alloc_mutex);
I don't see where the mutex gets locked when called via
pcpu_map_extend_workfn? (except via the new cancel_work_sync() call below?)
Also what protects chunks with scheduled work items from being removed?
> +
> new = pcpu_mem_zalloc(new_size);
> if (!new)
> return -ENOMEM;
> @@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> return NULL;
> }
>
> + if (!is_atomic)
> + mutex_lock(&pcpu_alloc_mutex);
BTW I noticed that
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
this is too pessimistic IMHO. Reclaim is possible even without __GFP_FS
and __GFP_IO. Could you just use gfpflags_allow_blocking(gfp) here?
> +
> spin_lock_irqsave(&pcpu_lock, flags);
>
> /* serve reserved allocations from the reserved chunk if available */
> @@ -967,12 +972,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (is_atomic)
> goto fail;
>
> - mutex_lock(&pcpu_alloc_mutex);
> + lockdep_assert_held(&pcpu_alloc_mutex);
>
> if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
> chunk = pcpu_create_chunk();
> if (!chunk) {
> - mutex_unlock(&pcpu_alloc_mutex);
> err = "failed to allocate new chunk";
> goto fail;
> }
> @@ -983,7 +987,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> spin_lock_irqsave(&pcpu_lock, flags);
> }
>
> - mutex_unlock(&pcpu_alloc_mutex);
> goto restart;
>
> area_found:
> @@ -993,8 +996,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (!is_atomic) {
> int page_start, page_end, rs, re;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> page_start = PFN_DOWN(off);
> page_end = PFN_UP(off + size);
>
> @@ -1005,7 +1006,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>
> spin_lock_irqsave(&pcpu_lock, flags);
> if (ret) {
> - mutex_unlock(&pcpu_alloc_mutex);
> pcpu_free_area(chunk, off, &occ_pages);
> err = "failed to populate";
> goto fail_unlock;
> @@ -1045,6 +1045,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> /* see the flag handling in pcpu_blance_workfn() */
> pcpu_atomic_alloc_failed = true;
> pcpu_schedule_balance_work();
> + } else {
> + mutex_unlock(&pcpu_alloc_mutex);
> }
> return NULL;
> }
> @@ -1137,6 +1139,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
> list_for_each_entry_safe(chunk, next, &to_free, list) {
> int rs, re;
>
> + cancel_work_sync(&chunk->map_extend_work);
This deserves some comment?
> +
> pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
> pcpu_depopulate_chunk(chunk, rs, re);
> spin_lock_irq(&pcpu_lock);
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
@ 2016-05-24 8:40 ` Vlastimil Babka
0 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-24 8:40 UTC (permalink / raw)
To: Tejun Heo
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, marco.gra
[+CC Marco who reported the CVE, forgot that earlier]
On 05/23/2016 11:35 PM, Tejun Heo wrote:
> Hello,
>
> Can you please test whether this patch resolves the issue? While
> adding support for atomic allocations, I reduced alloc_mutex covered
> region too much.
>
> Thanks.
Ugh, this makes the code even more head-spinning than it was.
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 0c59684..bd2df70 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
> static int pcpu_reserved_chunk_limit;
>
> static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
> -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
> +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map extension */
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> @@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
> size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> unsigned long flags;
>
> + lockdep_assert_held(&pcpu_alloc_mutex);
I don't see where the mutex gets locked when called via
pcpu_map_extend_workfn? (except via the new cancel_work_sync() call below?)
Also what protects chunks with scheduled work items from being removed?
> +
> new = pcpu_mem_zalloc(new_size);
> if (!new)
> return -ENOMEM;
> @@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> return NULL;
> }
>
> + if (!is_atomic)
> + mutex_lock(&pcpu_alloc_mutex);
BTW I noticed that
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
this is too pessimistic IMHO. Reclaim is possible even without __GFP_FS
and __GFP_IO. Could you just use gfpflags_allow_blocking(gfp) here?
> +
> spin_lock_irqsave(&pcpu_lock, flags);
>
> /* serve reserved allocations from the reserved chunk if available */
> @@ -967,12 +972,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (is_atomic)
> goto fail;
>
> - mutex_lock(&pcpu_alloc_mutex);
> + lockdep_assert_held(&pcpu_alloc_mutex);
>
> if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
> chunk = pcpu_create_chunk();
> if (!chunk) {
> - mutex_unlock(&pcpu_alloc_mutex);
> err = "failed to allocate new chunk";
> goto fail;
> }
> @@ -983,7 +987,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> spin_lock_irqsave(&pcpu_lock, flags);
> }
>
> - mutex_unlock(&pcpu_alloc_mutex);
> goto restart;
>
> area_found:
> @@ -993,8 +996,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> if (!is_atomic) {
> int page_start, page_end, rs, re;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> page_start = PFN_DOWN(off);
> page_end = PFN_UP(off + size);
>
> @@ -1005,7 +1006,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>
> spin_lock_irqsave(&pcpu_lock, flags);
> if (ret) {
> - mutex_unlock(&pcpu_alloc_mutex);
> pcpu_free_area(chunk, off, &occ_pages);
> err = "failed to populate";
> goto fail_unlock;
> @@ -1045,6 +1045,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> /* see the flag handling in pcpu_blance_workfn() */
> pcpu_atomic_alloc_failed = true;
> pcpu_schedule_balance_work();
> + } else {
> + mutex_unlock(&pcpu_alloc_mutex);
> }
> return NULL;
> }
> @@ -1137,6 +1139,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
> list_for_each_entry_safe(chunk, next, &to_free, list) {
> int rs, re;
>
> + cancel_work_sync(&chunk->map_extend_work);
This deserves some comment?
> +
> pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
> pcpu_depopulate_chunk(chunk, rs, re);
> spin_lock_irq(&pcpu_lock);
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
2016-05-24 8:40 ` Vlastimil Babka
@ 2016-05-24 15:30 ` Tejun Heo
-1 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-24 15:30 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, marco.gra
Hello,
On Tue, May 24, 2016 at 10:40:54AM +0200, Vlastimil Babka wrote:
> [+CC Marco who reported the CVE, forgot that earlier]
>
> On 05/23/2016 11:35 PM, Tejun Heo wrote:
> > Hello,
> >
> > Can you please test whether this patch resolves the issue? While
> > adding support for atomic allocations, I reduced alloc_mutex covered
> > region too much.
> >
> > Thanks.
>
> Ugh, this makes the code even more head-spinning than it was.
Locking-wise, it isn't complicated. It used to be a single mutex
protecting everything. Atomic alloc support required putting core
allocation parts under spinlock. It is messy because the two paths
are mixed in the same function. If we break out the core part to a
separate function and let the sleepable path call into that, it should
look okay, but that's for another patch.
Also, I think protecting chunk's lifetime w/ alloc_mutex is making it
a bit nasty. Maybe we should do per-chunk "extending" completion and
let pcpu_alloc_mutex just protect populating chunks.
> > @@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
> > size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> > unsigned long flags;
> >
> > + lockdep_assert_held(&pcpu_alloc_mutex);
>
> I don't see where the mutex gets locked when called via
> pcpu_map_extend_workfn? (except via the new cancel_work_sync() call below?)
Ah, right.
> Also what protects chunks with scheduled work items from being removed?
cancel_work_sync(), which now obviously should be called outside
alloc_mutex.
> > @@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> > return NULL;
> > }
> >
> > + if (!is_atomic)
> > + mutex_lock(&pcpu_alloc_mutex);
>
> BTW I noticed that
> bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
>
> this is too pessimistic IMHO. Reclaim is possible even without __GFP_FS and
> __GFP_IO. Could you just use gfpflags_allow_blocking(gfp) here?
vmalloc hardcodes GFP_KERNEL, so getting more relaxed doesn't buy us
much.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
@ 2016-05-24 15:30 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-24 15:30 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, marco.gra
Hello,
On Tue, May 24, 2016 at 10:40:54AM +0200, Vlastimil Babka wrote:
> [+CC Marco who reported the CVE, forgot that earlier]
>
> On 05/23/2016 11:35 PM, Tejun Heo wrote:
> > Hello,
> >
> > Can you please test whether this patch resolves the issue? While
> > adding support for atomic allocations, I reduced alloc_mutex covered
> > region too much.
> >
> > Thanks.
>
> Ugh, this makes the code even more head-spinning than it was.
Locking-wise, it isn't complicated. It used to be a single mutex
protecting everything. Atomic alloc support required putting core
allocation parts under spinlock. It is messy because the two paths
are mixed in the same function. If we break out the core part to a
separate function and let the sleepable path call into that, it should
look okay, but that's for another patch.
Also, I think protecting chunk's lifetime w/ alloc_mutex is making it
a bit nasty. Maybe we should do per-chunk "extending" completion and
let pcpu_alloc_mutex just protect populating chunks.
> > @@ -435,6 +435,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
> > size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> > unsigned long flags;
> >
> > + lockdep_assert_held(&pcpu_alloc_mutex);
>
> I don't see where the mutex gets locked when called via
> pcpu_map_extend_workfn? (except via the new cancel_work_sync() call below?)
Ah, right.
> Also what protects chunks with scheduled work items from being removed?
cancel_work_sync(), which now obviously should be called outside
alloc_mutex.
> > @@ -895,6 +897,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
> > return NULL;
> > }
> >
> > + if (!is_atomic)
> > + mutex_lock(&pcpu_alloc_mutex);
>
> BTW I noticed that
> bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
>
> this is too pessimistic IMHO. Reclaim is possible even without __GFP_FS and
> __GFP_IO. Could you just use gfpflags_allow_blocking(gfp) here?
vmalloc hardcodes GFP_KERNEL, so getting more relaxed doesn't buy us
much.
Thanks.
--
tejun
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
2016-05-24 15:30 ` Tejun Heo
@ 2016-05-24 19:04 ` Tejun Heo
-1 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-24 19:04 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, marco.gra
Hello,
Alexei, can you please verify this patch? Map extension got rolled
into balance work so that there's no sync issues between the two async
operations.
Thanks.
Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -112,7 +112,7 @@ struct pcpu_chunk {
int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
- struct work_struct map_extend_work;/* async ->map[] extension */
+ struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
void *data; /* chunk data */
int first_free; /* no free below this */
@@ -162,10 +162,13 @@ static struct pcpu_chunk *pcpu_reserved_
static int pcpu_reserved_chunk_limit;
static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/* chunks which need their map areas extended, protected by pcpu_lock */
+static LIST_HEAD(pcpu_map_extend_chunks);
+
/*
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
@@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
{
int margin, new_alloc;
+ lockdep_assert_held(&pcpu_lock);
+
if (is_atomic) {
margin = 3;
if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
- pcpu_async_enabled)
- schedule_work(&chunk->map_extend_work);
+ chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
+ if (list_empty(&chunk->map_extend_list)) {
+ list_add_tail(&chunk->map_extend_list,
+ &pcpu_map_extend_chunks);
+ pcpu_schedule_balance_work();
+ }
+ }
} else {
margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
}
@@ -435,6 +444,8 @@ static int pcpu_extend_area_map(struct p
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
+ lockdep_assert_held(&pcpu_alloc_mutex);
+
new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -467,20 +478,6 @@ out_unlock:
return 0;
}
-static void pcpu_map_extend_workfn(struct work_struct *work)
-{
- struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
- map_extend_work);
- int new_alloc;
-
- spin_lock_irq(&pcpu_lock);
- new_alloc = pcpu_need_to_extend(chunk, false);
- spin_unlock_irq(&pcpu_lock);
-
- if (new_alloc)
- pcpu_extend_area_map(chunk, new_alloc);
-}
-
/**
* pcpu_fit_in_area - try to fit the requested allocation in a candidate area
* @chunk: chunk the candidate area belongs to
@@ -740,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
chunk->map_used = 1;
INIT_LIST_HEAD(&chunk->list);
- INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&chunk->map_extend_list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
@@ -895,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t
return NULL;
}
+ if (!is_atomic)
+ mutex_lock(&pcpu_alloc_mutex);
+
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
@@ -967,12 +967,9 @@ restart:
if (is_atomic)
goto fail;
- mutex_lock(&pcpu_alloc_mutex);
-
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
if (!chunk) {
- mutex_unlock(&pcpu_alloc_mutex);
err = "failed to allocate new chunk";
goto fail;
}
@@ -983,7 +980,6 @@ restart:
spin_lock_irqsave(&pcpu_lock, flags);
}
- mutex_unlock(&pcpu_alloc_mutex);
goto restart;
area_found:
@@ -993,8 +989,6 @@ area_found:
if (!is_atomic) {
int page_start, page_end, rs, re;
- mutex_lock(&pcpu_alloc_mutex);
-
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
@@ -1005,7 +999,6 @@ area_found:
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- mutex_unlock(&pcpu_alloc_mutex);
pcpu_free_area(chunk, off, &occ_pages);
err = "failed to populate";
goto fail_unlock;
@@ -1045,6 +1038,8 @@ fail:
/* see the flag handling in pcpu_blance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
}
return NULL;
}
@@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct w
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
+ list_del_init(&chunk->map_extend_list);
list_move(&chunk->list, &to_free);
}
@@ -1146,6 +1142,24 @@ static void pcpu_balance_workfn(struct w
pcpu_destroy_chunk(chunk);
}
+ do {
+ int new_alloc = 0;
+
+ spin_lock_irq(&pcpu_lock);
+
+ chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
+ struct pcpu_chunk, map_extend_list);
+ if (chunk) {
+ list_del_init(&chunk->map_extend_list);
+ new_alloc = pcpu_need_to_extend(chunk, false);
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ if (new_alloc)
+ pcpu_extend_area_map(chunk, new_alloc);
+ } while (chunk);
+
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -1644,7 +1658,7 @@ int __init pcpu_setup_first_chunk(const
*/
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
- INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&schunk->map_extend_list);
schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
@@ -1673,7 +1687,7 @@ int __init pcpu_setup_first_chunk(const
if (dyn_size) {
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
- INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&dchunk->map_extend_list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
@ 2016-05-24 19:04 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-24 19:04 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, ast, netdev, LKML,
Christoph Lameter, Linux-MM layout, marco.gra
Hello,
Alexei, can you please verify this patch? Map extension got rolled
into balance work so that there's no sync issues between the two async
operations.
Thanks.
Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -112,7 +112,7 @@ struct pcpu_chunk {
int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
- struct work_struct map_extend_work;/* async ->map[] extension */
+ struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
void *data; /* chunk data */
int first_free; /* no free below this */
@@ -162,10 +162,13 @@ static struct pcpu_chunk *pcpu_reserved_
static int pcpu_reserved_chunk_limit;
static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/* chunks which need their map areas extended, protected by pcpu_lock */
+static LIST_HEAD(pcpu_map_extend_chunks);
+
/*
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
@@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
{
int margin, new_alloc;
+ lockdep_assert_held(&pcpu_lock);
+
if (is_atomic) {
margin = 3;
if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
- pcpu_async_enabled)
- schedule_work(&chunk->map_extend_work);
+ chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
+ if (list_empty(&chunk->map_extend_list)) {
+ list_add_tail(&chunk->map_extend_list,
+ &pcpu_map_extend_chunks);
+ pcpu_schedule_balance_work();
+ }
+ }
} else {
margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
}
@@ -435,6 +444,8 @@ static int pcpu_extend_area_map(struct p
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
+ lockdep_assert_held(&pcpu_alloc_mutex);
+
new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -467,20 +478,6 @@ out_unlock:
return 0;
}
-static void pcpu_map_extend_workfn(struct work_struct *work)
-{
- struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
- map_extend_work);
- int new_alloc;
-
- spin_lock_irq(&pcpu_lock);
- new_alloc = pcpu_need_to_extend(chunk, false);
- spin_unlock_irq(&pcpu_lock);
-
- if (new_alloc)
- pcpu_extend_area_map(chunk, new_alloc);
-}
-
/**
* pcpu_fit_in_area - try to fit the requested allocation in a candidate area
* @chunk: chunk the candidate area belongs to
@@ -740,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
chunk->map_used = 1;
INIT_LIST_HEAD(&chunk->list);
- INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&chunk->map_extend_list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
@@ -895,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t
return NULL;
}
+ if (!is_atomic)
+ mutex_lock(&pcpu_alloc_mutex);
+
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
@@ -967,12 +967,9 @@ restart:
if (is_atomic)
goto fail;
- mutex_lock(&pcpu_alloc_mutex);
-
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
if (!chunk) {
- mutex_unlock(&pcpu_alloc_mutex);
err = "failed to allocate new chunk";
goto fail;
}
@@ -983,7 +980,6 @@ restart:
spin_lock_irqsave(&pcpu_lock, flags);
}
- mutex_unlock(&pcpu_alloc_mutex);
goto restart;
area_found:
@@ -993,8 +989,6 @@ area_found:
if (!is_atomic) {
int page_start, page_end, rs, re;
- mutex_lock(&pcpu_alloc_mutex);
-
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
@@ -1005,7 +999,6 @@ area_found:
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- mutex_unlock(&pcpu_alloc_mutex);
pcpu_free_area(chunk, off, &occ_pages);
err = "failed to populate";
goto fail_unlock;
@@ -1045,6 +1038,8 @@ fail:
/* see the flag handling in pcpu_blance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
}
return NULL;
}
@@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct w
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
+ list_del_init(&chunk->map_extend_list);
list_move(&chunk->list, &to_free);
}
@@ -1146,6 +1142,24 @@ static void pcpu_balance_workfn(struct w
pcpu_destroy_chunk(chunk);
}
+ do {
+ int new_alloc = 0;
+
+ spin_lock_irq(&pcpu_lock);
+
+ chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
+ struct pcpu_chunk, map_extend_list);
+ if (chunk) {
+ list_del_init(&chunk->map_extend_list);
+ new_alloc = pcpu_need_to_extend(chunk, false);
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ if (new_alloc)
+ pcpu_extend_area_map(chunk, new_alloc);
+ } while (chunk);
+
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -1644,7 +1658,7 @@ int __init pcpu_setup_first_chunk(const
*/
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
- INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&schunk->map_extend_list);
schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
@@ -1673,7 +1687,7 @@ int __init pcpu_setup_first_chunk(const
if (dyn_size) {
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
- INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&dchunk->map_extend_list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
2016-05-24 19:04 ` Tejun Heo
@ 2016-05-24 20:43 ` Alexei Starovoitov
-1 siblings, 0 replies; 31+ messages in thread
From: Alexei Starovoitov @ 2016-05-24 20:43 UTC (permalink / raw)
To: Tejun Heo
Cc: Vlastimil Babka, Sasha Levin, Alexei Starovoitov, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi
On Tue, May 24, 2016 at 12:04 PM, Tejun Heo <tj@kernel.org> wrote:
> Hello,
>
> Alexei, can you please verify this patch? Map extension got rolled
> into balance work so that there's no sync issues between the two async
> operations.
tests look good. No use-after-free, and the basic bpf tests exercising the per-cpu map are fine.
>
> Thanks.
>
> Index: work/mm/percpu.c
> ===================================================================
> --- work.orig/mm/percpu.c
> +++ work/mm/percpu.c
> @@ -112,7 +112,7 @@ struct pcpu_chunk {
> int map_used; /* # of map entries used before the sentry */
> int map_alloc; /* # of map entries allocated */
> int *map; /* allocation map */
> - struct work_struct map_extend_work;/* async ->map[] extension */
> + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
>
> void *data; /* chunk data */
> int first_free; /* no free below this */
> @@ -162,10 +162,13 @@ static struct pcpu_chunk *pcpu_reserved_
> static int pcpu_reserved_chunk_limit;
>
> static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
> -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
> +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> +/* chunks which need their map areas extended, protected by pcpu_lock */
> +static LIST_HEAD(pcpu_map_extend_chunks);
> +
> /*
> * The number of empty populated pages, protected by pcpu_lock. The
> * reserved chunk doesn't contribute to the count.
> @@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
> {
> int margin, new_alloc;
>
> + lockdep_assert_held(&pcpu_lock);
> +
> if (is_atomic) {
> margin = 3;
>
> if (chunk->map_alloc <
> - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
> - pcpu_async_enabled)
> - schedule_work(&chunk->map_extend_work);
> + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
> + if (list_empty(&chunk->map_extend_list)) {
> + list_add_tail(&chunk->map_extend_list,
> + &pcpu_map_extend_chunks);
> + pcpu_schedule_balance_work();
> + }
> + }
> } else {
> margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
> }
> @@ -435,6 +444,8 @@ static int pcpu_extend_area_map(struct p
> size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> unsigned long flags;
>
> + lockdep_assert_held(&pcpu_alloc_mutex);
> +
> new = pcpu_mem_zalloc(new_size);
> if (!new)
> return -ENOMEM;
> @@ -467,20 +478,6 @@ out_unlock:
> return 0;
> }
>
> -static void pcpu_map_extend_workfn(struct work_struct *work)
> -{
> - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
> - map_extend_work);
> - int new_alloc;
> -
> - spin_lock_irq(&pcpu_lock);
> - new_alloc = pcpu_need_to_extend(chunk, false);
> - spin_unlock_irq(&pcpu_lock);
> -
> - if (new_alloc)
> - pcpu_extend_area_map(chunk, new_alloc);
> -}
> -
> /**
> * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
> * @chunk: chunk the candidate area belongs to
> @@ -740,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
> chunk->map_used = 1;
>
> INIT_LIST_HEAD(&chunk->list);
> - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
> + INIT_LIST_HEAD(&chunk->map_extend_list);
> chunk->free_size = pcpu_unit_size;
> chunk->contig_hint = pcpu_unit_size;
>
> @@ -895,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t
> return NULL;
> }
>
> + if (!is_atomic)
> + mutex_lock(&pcpu_alloc_mutex);
> +
> spin_lock_irqsave(&pcpu_lock, flags);
>
> /* serve reserved allocations from the reserved chunk if available */
> @@ -967,12 +967,9 @@ restart:
> if (is_atomic)
> goto fail;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
> chunk = pcpu_create_chunk();
> if (!chunk) {
> - mutex_unlock(&pcpu_alloc_mutex);
> err = "failed to allocate new chunk";
> goto fail;
> }
> @@ -983,7 +980,6 @@ restart:
> spin_lock_irqsave(&pcpu_lock, flags);
> }
>
> - mutex_unlock(&pcpu_alloc_mutex);
> goto restart;
>
> area_found:
> @@ -993,8 +989,6 @@ area_found:
> if (!is_atomic) {
> int page_start, page_end, rs, re;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> page_start = PFN_DOWN(off);
> page_end = PFN_UP(off + size);
>
> @@ -1005,7 +999,6 @@ area_found:
>
> spin_lock_irqsave(&pcpu_lock, flags);
> if (ret) {
> - mutex_unlock(&pcpu_alloc_mutex);
> pcpu_free_area(chunk, off, &occ_pages);
> err = "failed to populate";
> goto fail_unlock;
> @@ -1045,6 +1038,8 @@ fail:
> /* see the flag handling in pcpu_blance_workfn() */
> pcpu_atomic_alloc_failed = true;
> pcpu_schedule_balance_work();
> + } else {
> + mutex_unlock(&pcpu_alloc_mutex);
> }
> return NULL;
> }
> @@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct w
> if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
> continue;
>
> + list_del_init(&chunk->map_extend_list);
> list_move(&chunk->list, &to_free);
> }
>
> @@ -1146,6 +1142,24 @@ static void pcpu_balance_workfn(struct w
> pcpu_destroy_chunk(chunk);
> }
>
> + do {
> + int new_alloc = 0;
> +
> + spin_lock_irq(&pcpu_lock);
> +
> + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
> + struct pcpu_chunk, map_extend_list);
> + if (chunk) {
> + list_del_init(&chunk->map_extend_list);
> + new_alloc = pcpu_need_to_extend(chunk, false);
> + }
> +
> + spin_unlock_irq(&pcpu_lock);
> +
> + if (new_alloc)
> + pcpu_extend_area_map(chunk, new_alloc);
> + } while (chunk);
> +
> /*
> * Ensure there are certain number of free populated pages for
> * atomic allocs. Fill up from the most packed so that atomic
> @@ -1644,7 +1658,7 @@ int __init pcpu_setup_first_chunk(const
> */
> schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
> INIT_LIST_HEAD(&schunk->list);
> - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
> + INIT_LIST_HEAD(&schunk->map_extend_list);
> schunk->base_addr = base_addr;
> schunk->map = smap;
> schunk->map_alloc = ARRAY_SIZE(smap);
> @@ -1673,7 +1687,7 @@ int __init pcpu_setup_first_chunk(const
> if (dyn_size) {
> dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
> INIT_LIST_HEAD(&dchunk->list);
> - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
> + INIT_LIST_HEAD(&dchunk->map_extend_list);
> dchunk->base_addr = base_addr;
> dchunk->map = dmap;
> dchunk->map_alloc = ARRAY_SIZE(dmap);
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: bpf: use-after-free in array_map_alloc
@ 2016-05-24 20:43 ` Alexei Starovoitov
0 siblings, 0 replies; 31+ messages in thread
From: Alexei Starovoitov @ 2016-05-24 20:43 UTC (permalink / raw)
To: Tejun Heo
Cc: Vlastimil Babka, Sasha Levin, Alexei Starovoitov, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi
On Tue, May 24, 2016 at 12:04 PM, Tejun Heo <tj@kernel.org> wrote:
> Hello,
>
> Alexei, can you please verify this patch? Map extension got rolled
> into balance work so that there's no sync issues between the two async
> operations.
tests look good. No use-after-free, and the basic bpf tests exercising the per-cpu map are fine.
>
> Thanks.
>
> Index: work/mm/percpu.c
> ===================================================================
> --- work.orig/mm/percpu.c
> +++ work/mm/percpu.c
> @@ -112,7 +112,7 @@ struct pcpu_chunk {
> int map_used; /* # of map entries used before the sentry */
> int map_alloc; /* # of map entries allocated */
> int *map; /* allocation map */
> - struct work_struct map_extend_work;/* async ->map[] extension */
> + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
>
> void *data; /* chunk data */
> int first_free; /* no free below this */
> @@ -162,10 +162,13 @@ static struct pcpu_chunk *pcpu_reserved_
> static int pcpu_reserved_chunk_limit;
>
> static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
> -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
> +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> +/* chunks which need their map areas extended, protected by pcpu_lock */
> +static LIST_HEAD(pcpu_map_extend_chunks);
> +
> /*
> * The number of empty populated pages, protected by pcpu_lock. The
> * reserved chunk doesn't contribute to the count.
> @@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
> {
> int margin, new_alloc;
>
> + lockdep_assert_held(&pcpu_lock);
> +
> if (is_atomic) {
> margin = 3;
>
> if (chunk->map_alloc <
> - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
> - pcpu_async_enabled)
> - schedule_work(&chunk->map_extend_work);
> + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
> + if (list_empty(&chunk->map_extend_list)) {
> + list_add_tail(&chunk->map_extend_list,
> + &pcpu_map_extend_chunks);
> + pcpu_schedule_balance_work();
> + }
> + }
> } else {
> margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
> }
> @@ -435,6 +444,8 @@ static int pcpu_extend_area_map(struct p
> size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
> unsigned long flags;
>
> + lockdep_assert_held(&pcpu_alloc_mutex);
> +
> new = pcpu_mem_zalloc(new_size);
> if (!new)
> return -ENOMEM;
> @@ -467,20 +478,6 @@ out_unlock:
> return 0;
> }
>
> -static void pcpu_map_extend_workfn(struct work_struct *work)
> -{
> - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
> - map_extend_work);
> - int new_alloc;
> -
> - spin_lock_irq(&pcpu_lock);
> - new_alloc = pcpu_need_to_extend(chunk, false);
> - spin_unlock_irq(&pcpu_lock);
> -
> - if (new_alloc)
> - pcpu_extend_area_map(chunk, new_alloc);
> -}
> -
> /**
> * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
> * @chunk: chunk the candidate area belongs to
> @@ -740,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
> chunk->map_used = 1;
>
> INIT_LIST_HEAD(&chunk->list);
> - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
> + INIT_LIST_HEAD(&chunk->map_extend_list);
> chunk->free_size = pcpu_unit_size;
> chunk->contig_hint = pcpu_unit_size;
>
> @@ -895,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t
> return NULL;
> }
>
> + if (!is_atomic)
> + mutex_lock(&pcpu_alloc_mutex);
> +
> spin_lock_irqsave(&pcpu_lock, flags);
>
> /* serve reserved allocations from the reserved chunk if available */
> @@ -967,12 +967,9 @@ restart:
> if (is_atomic)
> goto fail;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
> chunk = pcpu_create_chunk();
> if (!chunk) {
> - mutex_unlock(&pcpu_alloc_mutex);
> err = "failed to allocate new chunk";
> goto fail;
> }
> @@ -983,7 +980,6 @@ restart:
> spin_lock_irqsave(&pcpu_lock, flags);
> }
>
> - mutex_unlock(&pcpu_alloc_mutex);
> goto restart;
>
> area_found:
> @@ -993,8 +989,6 @@ area_found:
> if (!is_atomic) {
> int page_start, page_end, rs, re;
>
> - mutex_lock(&pcpu_alloc_mutex);
> -
> page_start = PFN_DOWN(off);
> page_end = PFN_UP(off + size);
>
> @@ -1005,7 +999,6 @@ area_found:
>
> spin_lock_irqsave(&pcpu_lock, flags);
> if (ret) {
> - mutex_unlock(&pcpu_alloc_mutex);
> pcpu_free_area(chunk, off, &occ_pages);
> err = "failed to populate";
> goto fail_unlock;
> @@ -1045,6 +1038,8 @@ fail:
> /* see the flag handling in pcpu_blance_workfn() */
> pcpu_atomic_alloc_failed = true;
> pcpu_schedule_balance_work();
> + } else {
> + mutex_unlock(&pcpu_alloc_mutex);
> }
> return NULL;
> }
> @@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct w
> if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
> continue;
>
> + list_del_init(&chunk->map_extend_list);
> list_move(&chunk->list, &to_free);
> }
>
> @@ -1146,6 +1142,24 @@ static void pcpu_balance_workfn(struct w
> pcpu_destroy_chunk(chunk);
> }
>
> + do {
> + int new_alloc = 0;
> +
> + spin_lock_irq(&pcpu_lock);
> +
> + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
> + struct pcpu_chunk, map_extend_list);
> + if (chunk) {
> + list_del_init(&chunk->map_extend_list);
> + new_alloc = pcpu_need_to_extend(chunk, false);
> + }
> +
> + spin_unlock_irq(&pcpu_lock);
> +
> + if (new_alloc)
> + pcpu_extend_area_map(chunk, new_alloc);
> + } while (chunk);
> +
> /*
> * Ensure there are certain number of free populated pages for
> * atomic allocs. Fill up from the most packed so that atomic
> @@ -1644,7 +1658,7 @@ int __init pcpu_setup_first_chunk(const
> */
> schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
> INIT_LIST_HEAD(&schunk->list);
> - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
> + INIT_LIST_HEAD(&schunk->map_extend_list);
> schunk->base_addr = base_addr;
> schunk->map = smap;
> schunk->map_alloc = ARRAY_SIZE(smap);
> @@ -1673,7 +1687,7 @@ int __init pcpu_setup_first_chunk(const
> if (dyn_size) {
> dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
> INIT_LIST_HEAD(&dchunk->list);
> - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
> + INIT_LIST_HEAD(&dchunk->map_extend_list);
> dchunk->base_addr = base_addr;
> dchunk->map = dmap;
> dchunk->map_alloc = ARRAY_SIZE(dmap);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
2016-05-24 20:43 ` Alexei Starovoitov
@ 2016-05-25 15:44 ` Tejun Heo
-1 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-25 15:44 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Vlastimil Babka, Sasha Levin, Alexei Starovoitov, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi
Atomic allocations can trigger async map extensions which are serviced
by chunk->map_extend_work. pcpu_balance_work which is responsible for
destroying idle chunks wasn't synchronizing properly against
chunk->map_extend_work and may end up freeing the chunk while the work
item is still in flight.
This patch fixes the bug by rolling async map extension operations
into pcpu_balance_work.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: stable@vger.kernel.org # v3.18+
Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space")
---
mm/percpu.c | 57 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 36 insertions(+), 21 deletions(-)
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -112,7 +112,7 @@ struct pcpu_chunk {
int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
- struct work_struct map_extend_work;/* async ->map[] extension */
+ struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
void *data; /* chunk data */
int first_free; /* no free below this */
@@ -166,6 +166,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/* chunks which need their map areas extended, protected by pcpu_lock */
+static LIST_HEAD(pcpu_map_extend_chunks);
+
/*
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
@@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
{
int margin, new_alloc;
+ lockdep_assert_held(&pcpu_lock);
+
if (is_atomic) {
margin = 3;
if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
- pcpu_async_enabled)
- schedule_work(&chunk->map_extend_work);
+ chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
+ if (list_empty(&chunk->map_extend_list)) {
+ list_add_tail(&chunk->map_extend_list,
+ &pcpu_map_extend_chunks);
+ pcpu_schedule_balance_work();
+ }
+ }
} else {
margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
}
@@ -467,20 +476,6 @@ out_unlock:
return 0;
}
-static void pcpu_map_extend_workfn(struct work_struct *work)
-{
- struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
- map_extend_work);
- int new_alloc;
-
- spin_lock_irq(&pcpu_lock);
- new_alloc = pcpu_need_to_extend(chunk, false);
- spin_unlock_irq(&pcpu_lock);
-
- if (new_alloc)
- pcpu_extend_area_map(chunk, new_alloc);
-}
-
/**
* pcpu_fit_in_area - try to fit the requested allocation in a candidate area
* @chunk: chunk the candidate area belongs to
@@ -740,7 +735,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
chunk->map_used = 1;
INIT_LIST_HEAD(&chunk->list);
- INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&chunk->map_extend_list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
@@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct w
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
+ list_del_init(&chunk->map_extend_list);
list_move(&chunk->list, &to_free);
}
@@ -1146,6 +1142,25 @@ static void pcpu_balance_workfn(struct w
pcpu_destroy_chunk(chunk);
}
+ /* service chunks which requested async area map extension */
+ do {
+ int new_alloc = 0;
+
+ spin_lock_irq(&pcpu_lock);
+
+ chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
+ struct pcpu_chunk, map_extend_list);
+ if (chunk) {
+ list_del_init(&chunk->map_extend_list);
+ new_alloc = pcpu_need_to_extend(chunk, false);
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ if (new_alloc)
+ pcpu_extend_area_map(chunk, new_alloc);
+ } while (chunk);
+
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -1644,7 +1659,7 @@ int __init pcpu_setup_first_chunk(const
*/
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
- INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&schunk->map_extend_list);
schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
@@ -1673,7 +1688,7 @@ int __init pcpu_setup_first_chunk(const
if (dyn_size) {
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
- INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&dchunk->map_extend_list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
@ 2016-05-25 15:44 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-25 15:44 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Vlastimil Babka, Sasha Levin, Alexei Starovoitov, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi
Atomic allocations can trigger async map extensions which are serviced
by chunk->map_extend_work. pcpu_balance_work which is responsible for
destroying idle chunks wasn't synchronizing properly against
chunk->map_extend_work and may end up freeing the chunk while the work
item is still in flight.
This patch fixes the bug by rolling async map extension operations
into pcpu_balance_work.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: stable@vger.kernel.org # v3.18+
Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space")
---
mm/percpu.c | 57 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 36 insertions(+), 21 deletions(-)
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -112,7 +112,7 @@ struct pcpu_chunk {
int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
- struct work_struct map_extend_work;/* async ->map[] extension */
+ struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
void *data; /* chunk data */
int first_free; /* no free below this */
@@ -166,6 +166,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/* chunks which need their map areas extended, protected by pcpu_lock */
+static LIST_HEAD(pcpu_map_extend_chunks);
+
/*
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
@@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
{
int margin, new_alloc;
+ lockdep_assert_held(&pcpu_lock);
+
if (is_atomic) {
margin = 3;
if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
- pcpu_async_enabled)
- schedule_work(&chunk->map_extend_work);
+ chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
+ if (list_empty(&chunk->map_extend_list)) {
+ list_add_tail(&chunk->map_extend_list,
+ &pcpu_map_extend_chunks);
+ pcpu_schedule_balance_work();
+ }
+ }
} else {
margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
}
@@ -467,20 +476,6 @@ out_unlock:
return 0;
}
-static void pcpu_map_extend_workfn(struct work_struct *work)
-{
- struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
- map_extend_work);
- int new_alloc;
-
- spin_lock_irq(&pcpu_lock);
- new_alloc = pcpu_need_to_extend(chunk, false);
- spin_unlock_irq(&pcpu_lock);
-
- if (new_alloc)
- pcpu_extend_area_map(chunk, new_alloc);
-}
-
/**
* pcpu_fit_in_area - try to fit the requested allocation in a candidate area
* @chunk: chunk the candidate area belongs to
@@ -740,7 +735,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
chunk->map_used = 1;
INIT_LIST_HEAD(&chunk->list);
- INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&chunk->map_extend_list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
@@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct w
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
+ list_del_init(&chunk->map_extend_list);
list_move(&chunk->list, &to_free);
}
@@ -1146,6 +1142,25 @@ static void pcpu_balance_workfn(struct w
pcpu_destroy_chunk(chunk);
}
+ /* service chunks which requested async area map extension */
+ do {
+ int new_alloc = 0;
+
+ spin_lock_irq(&pcpu_lock);
+
+ chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
+ struct pcpu_chunk, map_extend_list);
+ if (chunk) {
+ list_del_init(&chunk->map_extend_list);
+ new_alloc = pcpu_need_to_extend(chunk, false);
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ if (new_alloc)
+ pcpu_extend_area_map(chunk, new_alloc);
+ } while (chunk);
+
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -1644,7 +1659,7 @@ int __init pcpu_setup_first_chunk(const
*/
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
- INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&schunk->map_extend_list);
schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
@@ -1673,7 +1688,7 @@ int __init pcpu_setup_first_chunk(const
if (dyn_size) {
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
- INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
+ INIT_LIST_HEAD(&dchunk->map_extend_list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
2016-05-25 15:44 ` Tejun Heo
@ 2016-05-26 9:19 ` Vlastimil Babka
-1 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-26 9:19 UTC (permalink / raw)
To: Tejun Heo, Alexei Starovoitov
Cc: Sasha Levin, Alexei Starovoitov, netdev, LKML, Christoph Lameter,
Linux-MM layout, Marco Grassi
On 05/25/2016 05:44 PM, Tejun Heo wrote:
> Atomic allocations can trigger async map extensions which is serviced
> by chunk->map_extend_work. pcpu_balance_work which is responsible for
> destroying idle chunks wasn't synchronizing properly against
> chunk->map_extend_work and may end up freeing the chunk while the work
> item is still in flight.
>
> This patch fixes the bug by rolling async map extension operations
> into pcpu_balance_work.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Reported-by: Vlastimil Babka <vbabka@suse.cz>
> Reported-by: Sasha Levin <sasha.levin@oracle.com>
> Cc: stable@vger.kernel.org # v3.18+
> Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space")
I didn't spot issues, but I'm not that familiar with the code, so it doesn't
mean much. Just one question below:
> ---
> mm/percpu.c | 57 ++++++++++++++++++++++++++++++++++++---------------------
> 1 file changed, 36 insertions(+), 21 deletions(-)
>
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -112,7 +112,7 @@ struct pcpu_chunk {
> int map_used; /* # of map entries used before the sentry */
> int map_alloc; /* # of map entries allocated */
> int *map; /* allocation map */
> - struct work_struct map_extend_work;/* async ->map[] extension */
> + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
>
> void *data; /* chunk data */
> int first_free; /* no free below this */
> @@ -166,6 +166,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> +/* chunks which need their map areas extended, protected by pcpu_lock */
> +static LIST_HEAD(pcpu_map_extend_chunks);
> +
> /*
> * The number of empty populated pages, protected by pcpu_lock. The
> * reserved chunk doesn't contribute to the count.
> @@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
> {
> int margin, new_alloc;
>
> + lockdep_assert_held(&pcpu_lock);
> +
> if (is_atomic) {
> margin = 3;
>
> if (chunk->map_alloc <
> - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
> - pcpu_async_enabled)
> - schedule_work(&chunk->map_extend_work);
> + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
> + if (list_empty(&chunk->map_extend_list)) {
So why this list_empty condition? Doesn't it deserve a comment then? And isn't
using a list overkill in that case?
Thanks.
> + list_add_tail(&chunk->map_extend_list,
> + &pcpu_map_extend_chunks);
> + pcpu_schedule_balance_work();
> + }
> + }
> } else {
> margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
> }
[...]
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
@ 2016-05-26 9:19 ` Vlastimil Babka
0 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-26 9:19 UTC (permalink / raw)
To: Tejun Heo, Alexei Starovoitov
Cc: Sasha Levin, Alexei Starovoitov, netdev, LKML, Christoph Lameter,
Linux-MM layout, Marco Grassi
On 05/25/2016 05:44 PM, Tejun Heo wrote:
> Atomic allocations can trigger async map extensions which is serviced
> by chunk->map_extend_work. pcpu_balance_work which is responsible for
> destroying idle chunks wasn't synchronizing properly against
> chunk->map_extend_work and may end up freeing the chunk while the work
> item is still in flight.
>
> This patch fixes the bug by rolling async map extension operations
> into pcpu_balance_work.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Reported-by: Vlastimil Babka <vbabka@suse.cz>
> Reported-by: Sasha Levin <sasha.levin@oracle.com>
> Cc: stable@vger.kernel.org # v3.18+
> Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space")
I didn't spot issues, but I'm not that familiar with the code, so it doesn't
mean much. Just one question below:
> ---
> mm/percpu.c | 57 ++++++++++++++++++++++++++++++++++++---------------------
> 1 file changed, 36 insertions(+), 21 deletions(-)
>
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -112,7 +112,7 @@ struct pcpu_chunk {
> int map_used; /* # of map entries used before the sentry */
> int map_alloc; /* # of map entries allocated */
> int *map; /* allocation map */
> - struct work_struct map_extend_work;/* async ->map[] extension */
> + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
>
> void *data; /* chunk data */
> int first_free; /* no free below this */
> @@ -166,6 +166,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /
>
> static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
>
> +/* chunks which need their map areas extended, protected by pcpu_lock */
> +static LIST_HEAD(pcpu_map_extend_chunks);
> +
> /*
> * The number of empty populated pages, protected by pcpu_lock. The
> * reserved chunk doesn't contribute to the count.
> @@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pc
> {
> int margin, new_alloc;
>
> + lockdep_assert_held(&pcpu_lock);
> +
> if (is_atomic) {
> margin = 3;
>
> if (chunk->map_alloc <
> - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
> - pcpu_async_enabled)
> - schedule_work(&chunk->map_extend_work);
> + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
> + if (list_empty(&chunk->map_extend_list)) {
So why this list_empty condition? Doesn't it deserve a comment then? And isn't
using a list an overkill in that case?
Thanks.
> + list_add_tail(&chunk->map_extend_list,
> + &pcpu_map_extend_chunks);
> + pcpu_schedule_balance_work();
> + }
> + }
> } else {
> margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
> }
[...]
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
2016-05-26 9:19 ` Vlastimil Babka
@ 2016-05-26 19:21 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-26 19:21 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, Alexei Starovoitov, netdev,
LKML, Christoph Lameter, Linux-MM layout, Marco Grassi
Hello,
On Thu, May 26, 2016 at 11:19:06AM +0200, Vlastimil Babka wrote:
> > if (is_atomic) {
> > margin = 3;
> >
> > if (chunk->map_alloc <
> > - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
> > - pcpu_async_enabled)
> > - schedule_work(&chunk->map_extend_work);
> > + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
> > + if (list_empty(&chunk->map_extend_list)) {
> So why this list_empty condition? Doesn't it deserve a comment then? And
Because doing list_add() twice corrupts the list. I'm not sure that
deserves a comment. We can do list_move() instead but that isn't
necessarily better.
> isn't using a list an overkill in that case?
That would require rebalance work to scan all chunks whenever it's
scheduled and if a lot of atomic allocations are taking place, it has
some possibility to become expensive with a lot of chunks.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
@ 2016-05-26 19:21 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-26 19:21 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Alexei Starovoitov, Sasha Levin, Alexei Starovoitov, netdev,
LKML, Christoph Lameter, Linux-MM layout, Marco Grassi
Hello,
On Thu, May 26, 2016 at 11:19:06AM +0200, Vlastimil Babka wrote:
> > if (is_atomic) {
> > margin = 3;
> >
> > if (chunk->map_alloc <
> > - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
> > - pcpu_async_enabled)
> > - schedule_work(&chunk->map_extend_work);
> > + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
> > + if (list_empty(&chunk->map_extend_list)) {
> So why this list_empty condition? Doesn't it deserve a comment then? And
Because doing list_add() twice corrupts the list. I'm not sure that
deserves a comment. We can do list_move() instead but that isn't
necessarily better.
> isn't using a list an overkill in that case?
That would require rebalance work to scan all chunks whenever it's
scheduled and if a lot of atomic allocations are taking place, it has
some possibility to become expensive with a lot of chunks.
Thanks.
--
tejun
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
2016-05-26 19:21 ` Tejun Heo
@ 2016-05-26 20:48 ` Vlastimil Babka
0 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-26 20:48 UTC (permalink / raw)
To: Tejun Heo
Cc: Alexei Starovoitov, Sasha Levin, Alexei Starovoitov, netdev,
LKML, Christoph Lameter, Linux-MM layout, Marco Grassi
On 26.5.2016 21:21, Tejun Heo wrote:
> Hello,
>
> On Thu, May 26, 2016 at 11:19:06AM +0200, Vlastimil Babka wrote:
>>> if (is_atomic) {
>>> margin = 3;
>>>
>>> if (chunk->map_alloc <
>>> - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
>>> - pcpu_async_enabled)
>>> - schedule_work(&chunk->map_extend_work);
>>> + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
>>> + if (list_empty(&chunk->map_extend_list)) {
>
>> So why this list_empty condition? Doesn't it deserve a comment then? And
>
> Because doing list_add() twice corrupts the list. I'm not sure that
> deserves a comment. We can do list_move() instead but that isn't
> necessarily better.
Ugh, right, somehow I thought it was testing &pcpu_map_extend_chunks.
My second question was based on the assumption that the list can have only one
item. Sorry about the noise.
>> isn't using a list an overkill in that case?
>
> That would require rebalance work to scan all chunks whenever it's
> scheduled and if a lot of atomic allocations are taking place, it has
> some possibility to become expensive with a lot of chunks.
>
> Thanks.
>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 1/2] percpu: fix synchronization between chunk->map_extend_work and chunk destruction
@ 2016-05-26 20:48 ` Vlastimil Babka
0 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-26 20:48 UTC (permalink / raw)
To: Tejun Heo
Cc: Alexei Starovoitov, Sasha Levin, Alexei Starovoitov, netdev,
LKML, Christoph Lameter, Linux-MM layout, Marco Grassi
On 26.5.2016 21:21, Tejun Heo wrote:
> Hello,
>
> On Thu, May 26, 2016 at 11:19:06AM +0200, Vlastimil Babka wrote:
>>> if (is_atomic) {
>>> margin = 3;
>>>
>>> if (chunk->map_alloc <
>>> - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
>>> - pcpu_async_enabled)
>>> - schedule_work(&chunk->map_extend_work);
>>> + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
>>> + if (list_empty(&chunk->map_extend_list)) {
>
>> So why this list_empty condition? Doesn't it deserve a comment then? And
>
> Because doing list_add() twice corrupts the list. I'm not sure that
> deserves a comment. We can do list_move() instead but that isn't
> necessarily better.
Ugh, right, somehow I thought it was testing &pcpu_map_extend_chunks.
My second question was based on the assumption that the list can have only one
item. Sorry about the noise.
>> isn't using a list an overkill in that case?
>
> That would require rebalance work to scan all chunks whenever it's
> scheduled and if a lot of atomic allocations are taking place, it has
> some possibility to become expensive with a lot of chunks.
>
> Thanks.
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH percpu/for-4.7-fixes 2/2] percpu: fix synchronization between synchronous map extension and chunk destruction
2016-05-24 20:43 ` Alexei Starovoitov
@ 2016-05-25 15:45 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-25 15:45 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Vlastimil Babka, Sasha Levin, Alexei Starovoitov, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi, kernel-team
For non-atomic allocations, pcpu_alloc() can try to extend the area
map synchronously after dropping pcpu_lock; however, the extension
wasn't synchronized against chunk destruction and the chunk might get
freed while extension is in progress.
This patch fixes the bug by putting most of non-atomic allocations
under pcpu_alloc_mutex to synchronize against pcpu_balance_work which
is responsible for async chunk management including destruction.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: stable@vger.kernel.org # v3.18+
Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population")
---
Hello,
I'll send both patches mainline in a couple days through the percpu
tree.
Thanks.
mm/percpu.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_
static int pcpu_reserved_chunk_limit;
static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
@@ -444,6 +444,8 @@ static int pcpu_extend_area_map(struct p
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
+ lockdep_assert_held(&pcpu_alloc_mutex);
+
new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -890,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t
return NULL;
}
+ if (!is_atomic)
+ mutex_lock(&pcpu_alloc_mutex);
+
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
@@ -962,12 +967,9 @@ restart:
if (is_atomic)
goto fail;
- mutex_lock(&pcpu_alloc_mutex);
-
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
if (!chunk) {
- mutex_unlock(&pcpu_alloc_mutex);
err = "failed to allocate new chunk";
goto fail;
}
@@ -978,7 +980,6 @@ restart:
spin_lock_irqsave(&pcpu_lock, flags);
}
- mutex_unlock(&pcpu_alloc_mutex);
goto restart;
area_found:
@@ -988,8 +989,6 @@ area_found:
if (!is_atomic) {
int page_start, page_end, rs, re;
- mutex_lock(&pcpu_alloc_mutex);
-
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
@@ -1000,7 +999,6 @@ area_found:
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- mutex_unlock(&pcpu_alloc_mutex);
pcpu_free_area(chunk, off, &occ_pages);
err = "failed to populate";
goto fail_unlock;
@@ -1040,6 +1038,8 @@ fail:
/* see the flag handling in pcpu_blance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
}
return NULL;
}
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH percpu/for-4.7-fixes 2/2] percpu: fix synchronization between synchronous map extension and chunk destruction
@ 2016-05-25 15:45 ` Tejun Heo
0 siblings, 0 replies; 31+ messages in thread
From: Tejun Heo @ 2016-05-25 15:45 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Vlastimil Babka, Sasha Levin, Alexei Starovoitov, netdev, LKML,
Christoph Lameter, Linux-MM layout, Marco Grassi, kernel-team
For non-atomic allocations, pcpu_alloc() can try to extend the area
map synchronously after dropping pcpu_lock; however, the extension
wasn't synchronized against chunk destruction and the chunk might get
freed while extension is in progress.
This patch fixes the bug by putting most of non-atomic allocations
under pcpu_alloc_mutex to synchronize against pcpu_balance_work which
is responsible for async chunk management including destruction.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: stable@vger.kernel.org # v3.18+
Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population")
---
Hello,
I'll send both patches mainline in a couple days through the percpu
tree.
Thanks.
mm/percpu.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -162,7 +162,7 @@ static struct pcpu_chunk *pcpu_reserved_
static int pcpu_reserved_chunk_limit;
static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
@@ -444,6 +444,8 @@ static int pcpu_extend_area_map(struct p
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
+ lockdep_assert_held(&pcpu_alloc_mutex);
+
new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -890,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t
return NULL;
}
+ if (!is_atomic)
+ mutex_lock(&pcpu_alloc_mutex);
+
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
@@ -962,12 +967,9 @@ restart:
if (is_atomic)
goto fail;
- mutex_lock(&pcpu_alloc_mutex);
-
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
if (!chunk) {
- mutex_unlock(&pcpu_alloc_mutex);
err = "failed to allocate new chunk";
goto fail;
}
@@ -978,7 +980,6 @@ restart:
spin_lock_irqsave(&pcpu_lock, flags);
}
- mutex_unlock(&pcpu_alloc_mutex);
goto restart;
area_found:
@@ -988,8 +989,6 @@ area_found:
if (!is_atomic) {
int page_start, page_end, rs, re;
- mutex_lock(&pcpu_alloc_mutex);
-
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
@@ -1000,7 +999,6 @@ area_found:
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- mutex_unlock(&pcpu_alloc_mutex);
pcpu_free_area(chunk, off, &occ_pages);
err = "failed to populate";
goto fail_unlock;
@@ -1040,6 +1038,8 @@ fail:
/* see the flag handling in pcpu_blance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
}
return NULL;
}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 2/2] percpu: fix synchronization between synchronous map extension and chunk destruction
2016-05-25 15:45 ` Tejun Heo
@ 2016-05-26 9:48 ` Vlastimil Babka
0 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-26 9:48 UTC (permalink / raw)
To: Tejun Heo, Alexei Starovoitov
Cc: Sasha Levin, Alexei Starovoitov, netdev, LKML, Christoph Lameter,
Linux-MM layout, Marco Grassi, kernel-team
On 05/25/2016 05:45 PM, Tejun Heo wrote:
> For non-atomic allocations, pcpu_alloc() can try to extend the area
> map synchronously after dropping pcpu_lock; however, the extension
> wasn't synchronized against chunk destruction and the chunk might get
> freed while extension is in progress.
>
> This patch fixes the bug by putting most of non-atomic allocations
> under pcpu_alloc_mutex to synchronize against pcpu_balance_work which
> is responsible for async chunk management including destruction.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Reported-by: Vlastimil Babka <vbabka@suse.cz>
> Reported-by: Sasha Levin <sasha.levin@oracle.com>
> Cc: stable@vger.kernel.org # v3.18+
> Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population")
Didn't spot any problems this time.
Thanks
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH percpu/for-4.7-fixes 2/2] percpu: fix synchronization between synchronous map extension and chunk destruction
@ 2016-05-26 9:48 ` Vlastimil Babka
0 siblings, 0 replies; 31+ messages in thread
From: Vlastimil Babka @ 2016-05-26 9:48 UTC (permalink / raw)
To: Tejun Heo, Alexei Starovoitov
Cc: Sasha Levin, Alexei Starovoitov, netdev, LKML, Christoph Lameter,
Linux-MM layout, Marco Grassi, kernel-team
On 05/25/2016 05:45 PM, Tejun Heo wrote:
> For non-atomic allocations, pcpu_alloc() can try to extend the area
> map synchronously after dropping pcpu_lock; however, the extension
> wasn't synchronized against chunk destruction and the chunk might get
> freed while extension is in progress.
>
> This patch fixes the bug by putting most of non-atomic allocations
> under pcpu_alloc_mutex to synchronize against pcpu_balance_work which
> is responsible for async chunk management including destruction.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-and-tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Reported-by: Vlastimil Babka <vbabka@suse.cz>
> Reported-by: Sasha Levin <sasha.levin@oracle.com>
> Cc: stable@vger.kernel.org # v3.18+
> Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population")
Didn't spot any problems this time.
Thanks
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 31+ messages in thread