* [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-14 21:30 ` David Rientjes
0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:30 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory when the
oom victim's mm had mmu notifiers registered.
The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.
That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
to mmu notifier ops that can set a bit to indicate that these callbacks do
not block.
The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
Signed-off-by: David Rientjes <rientjes@google.com>
---
v2:
- specifically exclude mmu_notifiers without invalidate callbacks
- move flags to mmu_notifier_ops per Paolo
- reverse flag from blockable -> not blockable per Christian
drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
drivers/iommu/amd_iommu_v2.c | 1 +
drivers/iommu/intel-svm.c | 1 +
drivers/misc/sgi-gru/grutlbpurge.c | 1 +
include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 1 +
7 files changed, 57 insertions(+)
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
static void handle_remove(struct work_struct *work);
static const struct mmu_notifier_ops mn_opts = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = mmu_notifier_range_start,
};
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops iommu_mn = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = mn_release,
.clear_flush_young = mn_clear_flush_young,
.invalidate_range = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops intel_mmuops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = intel_mm_release,
.change_pte = intel_change_pte,
.invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops gru_mmuops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = gru_invalidate_range_start,
.invalidate_range_end = gru_invalidate_range_end,
.release = gru_release,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
struct mmu_notifier;
struct mmu_notifier_ops;
+/* mmu_notifier_ops flags */
+#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
+
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -26,6 +29,15 @@ struct mmu_notifier_mm {
};
struct mmu_notifier_ops {
+ /*
+ * Flags to specify behavior of callbacks for this MMU notifier.
+ * Used to determine which context an operation may be called.
+ *
+ * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
+ * block
+ */
+ int flags;
+
/*
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
@@ -137,6 +149,9 @@ struct mmu_notifier_ops {
* page. Pages will no longer be referenced by the linux
* address space but may still be referenced by sptes until
* the last refcount is dropped.
+ *
+ * If both of these callbacks cannot block, mmu_notifier_ops.flags
+ * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ return 0;
+}
+
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int id;
+ int ret = 0;
+
+ WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+ if (!mm_has_notifiers(mm))
+ return ret;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (!mn->ops->invalidate_range &&
+ !mn->ops->invalidate_range_start &&
+ !mn->ops->invalidate_range_end)
+ continue;
+
+ if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+ ret = 1;
+ break;
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+ return ret;
+}
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
2017-12-14 21:30 ` David Rientjes
@ 2017-12-14 21:31 ` David Rientjes
-1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:31 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
callbacks are used around unmap_page_range().
Signed-off-by: David Rientjes <rientjes@google.com>
---
mm/oom_kill.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * If the mm has notifiers then we would need to invalidate them around
- * unmap_page_range and that is risky because notifiers can sleep and
- * what they do is basically undeterministic. So let's have a short
+ * If the mm has invalidate_{start,end}() notifiers that could block,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time and add
- * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ * notifiers cannot block for unbounded amount of time
*/
- if (mm_has_notifiers(mm)) {
+ if (mm_has_blockable_invalidate_notifiers(mm)) {
up_read(&mm->mmap_sem);
schedule_timeout_idle(HZ);
goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
}
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
^ permalink raw reply [flat|nested] 53+ messages in thread
* [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-14 21:31 ` David Rientjes
0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:31 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
callbacks are used around unmap_page_range().
Signed-off-by: David Rientjes <rientjes@google.com>
---
mm/oom_kill.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * If the mm has notifiers then we would need to invalidate them around
- * unmap_page_range and that is risky because notifiers can sleep and
- * what they do is basically undeterministic. So let's have a short
+ * If the mm has invalidate_{start,end}() notifiers that could block,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time and add
- * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ * notifiers cannot block for unbounded amount of time
*/
- if (mm_has_notifiers(mm)) {
+ if (mm_has_blockable_invalidate_notifiers(mm)) {
up_read(&mm->mmap_sem);
schedule_timeout_idle(HZ);
goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
}
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
2017-12-14 21:31 ` David Rientjes
@ 2017-12-15 16:35 ` Michal Hocko
-1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:35 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Thu 14-12-17 13:31:00, David Rientjes wrote:
> This uses the new annotation to determine if an mm has mmu notifiers with
> blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
> callbacks are used around unmap_page_range().
Do you have any example where this helped? KVM guest oom killed I guess?
> Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> mm/oom_kill.c | 21 +++++++++++----------
> 1 file changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
> }
>
> /*
> - * If the mm has notifiers then we would need to invalidate them around
> - * unmap_page_range and that is risky because notifiers can sleep and
> - * what they do is basically undeterministic. So let's have a short
> + * If the mm has invalidate_{start,end}() notifiers that could block,
> * sleep to give the oom victim some more time.
> * TODO: we really want to get rid of this ugly hack and make sure that
> - * notifiers cannot block for unbounded amount of time and add
> - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
> + * notifiers cannot block for unbounded amount of time
> */
> - if (mm_has_notifiers(mm)) {
> + if (mm_has_blockable_invalidate_notifiers(mm)) {
> up_read(&mm->mmap_sem);
> schedule_timeout_idle(HZ);
> goto unlock_oom;
> @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
> * count elevated without a good reason.
> */
> if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
> - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
> - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
> - NULL);
> - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
> + const unsigned long start = vma->vm_start;
> + const unsigned long end = vma->vm_end;
> +
> + tlb_gather_mmu(&tlb, mm, start, end);
> + mmu_notifier_invalidate_range_start(mm, start, end);
> + unmap_page_range(&tlb, vma, start, end, NULL);
> + mmu_notifier_invalidate_range_end(mm, start, end);
> + tlb_finish_mmu(&tlb, start, end);
> }
> }
> pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-15 16:35 ` Michal Hocko
0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:35 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Thu 14-12-17 13:31:00, David Rientjes wrote:
> This uses the new annotation to determine if an mm has mmu notifiers with
> blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
> callbacks are used around unmap_page_range().
Do you have any example where this helped? KVM guest oom killed I guess?
> Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> mm/oom_kill.c | 21 +++++++++++----------
> 1 file changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
> }
>
> /*
> - * If the mm has notifiers then we would need to invalidate them around
> - * unmap_page_range and that is risky because notifiers can sleep and
> - * what they do is basically undeterministic. So let's have a short
> + * If the mm has invalidate_{start,end}() notifiers that could block,
> * sleep to give the oom victim some more time.
> * TODO: we really want to get rid of this ugly hack and make sure that
> - * notifiers cannot block for unbounded amount of time and add
> - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
> + * notifiers cannot block for unbounded amount of time
> */
> - if (mm_has_notifiers(mm)) {
> + if (mm_has_blockable_invalidate_notifiers(mm)) {
> up_read(&mm->mmap_sem);
> schedule_timeout_idle(HZ);
> goto unlock_oom;
> @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
> * count elevated without a good reason.
> */
> if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
> - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
> - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
> - NULL);
> - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
> + const unsigned long start = vma->vm_start;
> + const unsigned long end = vma->vm_end;
> +
> + tlb_gather_mmu(&tlb, mm, start, end);
> + mmu_notifier_invalidate_range_start(mm, start, end);
> + unmap_page_range(&tlb, vma, start, end, NULL);
> + mmu_notifier_invalidate_range_end(mm, start, end);
> + tlb_finish_mmu(&tlb, start, end);
> }
> }
> pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
2017-12-15 16:35 ` Michal Hocko
@ 2017-12-15 21:46 ` David Rientjes
-1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-15 21:46 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Fri, 15 Dec 2017, Michal Hocko wrote:
> > This uses the new annotation to determine if an mm has mmu notifiers with
> > blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
> > callbacks are used around unmap_page_range().
>
> Do you have any example where this helped? KVM guest oom killed I guess?
>
KVM is the most significant one that we are interested in, but haven't had
sufficient time to quantify how much of an impact this has other than to
say it will certainly be non-zero.
The motivation was more to exclude mmu notifiers that have a reason to be
excluded rather than a blanket exemption to make the oom reaper more
robust.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-15 21:46 ` David Rientjes
0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-15 21:46 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Fri, 15 Dec 2017, Michal Hocko wrote:
> > This uses the new annotation to determine if an mm has mmu notifiers with
> > blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
> > callbacks are used around unmap_page_range().
>
> Do you have any example where this helped? KVM guest oom killed I guess?
>
KVM is the most significant one that we are interested in, but haven't had
sufficient time to quantify how much of an impact this has other than to
say it will certainly be non-zero.
The motivation was more to exclude mmu notifiers that have a reason to be
excluded rather than a blanket exemption to make the oom reaper more
robust.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-14 21:30 ` David Rientjes
@ 2017-12-15 8:42 ` Paolo Bonzini
-1 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-15 8:42 UTC (permalink / raw)
To: David Rientjes, Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Radim Krčmář,
linux-kernel, linux-mm
On 14/12/2017 22:30, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 8:42 ` Paolo Bonzini
0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-15 8:42 UTC (permalink / raw)
To: David Rientjes, Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Radim Krčmář,
linux-kernel, linux-mm
On 14/12/2017 22:30, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-14 21:30 ` David Rientjes
@ 2017-12-15 12:19 ` Christian König
-1 siblings, 0 replies; 53+ messages in thread
From: Christian König @ 2017-12-15 12:19 UTC (permalink / raw)
To: David Rientjes, Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, David Airlie,
Joerg Roedel, Doug Ledford, Jani Nikula, Mike Marciniszyn,
Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
Am 14.12.2017 um 22:30 schrieb David Rientjes:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Christian König <christian.koenig@amd.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 12:19 ` Christian König
0 siblings, 0 replies; 53+ messages in thread
From: Christian König @ 2017-12-15 12:19 UTC (permalink / raw)
To: David Rientjes, Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, David Airlie,
Joerg Roedel, Doug Ledford, Jani Nikula, Mike Marciniszyn,
Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
Am 14.12.2017 um 22:30 schrieb David Rientjes:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Christian König <christian.koenig@amd.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-14 21:30 ` David Rientjes
@ 2017-12-15 13:36 ` Dimitri Sivanich
-1 siblings, 0 replies; 53+ messages in thread
From: Dimitri Sivanich @ 2017-12-15 13:36 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
Alex Deucher, Christian König, David Airlie, Joerg Roedel,
Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
Paolo Bonzini, Radim Krčmář,
linux-kernel, linux-mm
On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Dimitri Sivanich <sivanich@hpe.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 13:36 ` Dimitri Sivanich
0 siblings, 0 replies; 53+ messages in thread
From: Dimitri Sivanich @ 2017-12-15 13:36 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
Alex Deucher, Christian König, David Airlie, Joerg Roedel,
Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
Paolo Bonzini, Radim Krčmář,
linux-kernel, linux-mm
On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Dimitri Sivanich <sivanich@hpe.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-14 21:30 ` David Rientjes
@ 2017-12-15 16:25 ` Michal Hocko
-1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:25 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Thu 14-12-17 13:30:56, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
Yes, this make sense. I haven't checked all the existing mmu notifiers
but those that you have marked seem to be OK.
I just think that the semantic of the flag should be describe more. See
below
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
[...]
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
This should be more specific IMHO. What do you think about the following
wording?
invalidate_{start,end,range} doesn't block on any locks which depend
directly or indirectly (via lock chain or resources e.g. worker context)
on a memory allocation.
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 16:25 ` Michal Hocko
0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:25 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Thu 14-12-17 13:30:56, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
Yes, this make sense. I haven't checked all the existing mmu notifiers
but those that you have marked seem to be OK.
I just think that the semantic of the flag should be describe more. See
below
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
[...]
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
This should be more specific IMHO. What do you think about the following
wording?
invalidate_{start,end,range} doesn't block on any locks which depend
directly or indirectly (via lock chain or resources e.g. worker context)
on a memory allocation.
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-15 16:25 ` Michal Hocko
@ 2017-12-16 6:21 ` Tetsuo Handa
-1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 6:21 UTC (permalink / raw)
To: Michal Hocko, David Rientjes
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On 2017/12/16 1:25, Michal Hocko wrote:
>> struct mmu_notifier_ops {
>> + /*
>> + * Flags to specify behavior of callbacks for this MMU notifier.
>> + * Used to determine which context an operation may be called.
>> + *
>> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
>> + * block
>> + */
>> + int flags;
>
> This should be more specific IMHO. What do you think about the following
> wording?
>
> invalidate_{start,end,range} doesn't block on any locks which depend
> directly or indirectly (via lock chain or resources e.g. worker context)
> on a memory allocation.
I disagree. It needlessly complicates validating the correctness.
What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
schedule_timeout_idle() will not block on any locks which depend directly or
indirectly on a memory allocation, but we are already blocking other memory
allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().
This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
by all other allocating threads" lockup problem. The OOM reaper does not want to get
blocked for so long.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 6:21 ` Tetsuo Handa
0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 6:21 UTC (permalink / raw)
To: Michal Hocko, David Rientjes
Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On 2017/12/16 1:25, Michal Hocko wrote:
>> struct mmu_notifier_ops {
>> + /*
>> + * Flags to specify behavior of callbacks for this MMU notifier.
>> + * Used to determine which context an operation may be called.
>> + *
>> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
>> + * block
>> + */
>> + int flags;
>
> This should be more specific IMHO. What do you think about the following
> wording?
>
> invalidate_{start,end,range} doesn't block on any locks which depend
> directly or indirectly (via lock chain or resources e.g. worker context)
> on a memory allocation.
I disagree. It needlessly complicates validating the correctness.
What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
schedule_timeout_idle() will not block on any locks which depend directly or
indirectly on a memory allocation, but we are already blocking other memory
allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().
This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
by all other allocating threads" lockup problem. The OOM reaper does not want to get
blocked for so long.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-16 6:21 ` Tetsuo Handa
@ 2017-12-16 11:33 ` Michal Hocko
-1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:33 UTC (permalink / raw)
To: Tetsuo Handa
Cc: David Rientjes, Andrew Morton, Andrea Arcangeli,
Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
Alex Deucher, Christian König, David Airlie, Joerg Roedel,
Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
Paolo Bonzini, Radim Krčmář,
linux-kernel, linux-mm
On Sat 16-12-17 15:21:51, Tetsuo Handa wrote:
> On 2017/12/16 1:25, Michal Hocko wrote:
> >> struct mmu_notifier_ops {
> >> + /*
> >> + * Flags to specify behavior of callbacks for this MMU notifier.
> >> + * Used to determine which context an operation may be called.
> >> + *
> >> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> >> + * block
> >> + */
> >> + int flags;
> >
> > This should be more specific IMHO. What do you think about the following
> > wording?
> >
> > invalidate_{start,end,range} doesn't block on any locks which depend
> > directly or indirectly (via lock chain or resources e.g. worker context)
> > on a memory allocation.
>
> I disagree. It needlessly complicates validating the correctness.
But it makes it clear what is the actual semantic.
> What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
Let's talk seriously about a real code. Any mmu notifier doing this is
just crazy and should be fixed.
> schedule_timeout_idle() will not block on any locks which depend directly or
> indirectly on a memory allocation, but we are already blocking other memory
> allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().
Then the reaper will block and progress would be slower.
> This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
> SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
> by all other allocating threads" lockup problem. The OOM reaper does not want to get
> blocked for so long.
Yes, it absolutely doesn't want to do that. MMu notifiers should be
reasonable because they are called from performance sensitive call
paths.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 11:33 ` Michal Hocko
0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:33 UTC (permalink / raw)
To: Tetsuo Handa
Cc: David Rientjes, Andrew Morton, Andrea Arcangeli,
Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
Alex Deucher, Christian König, David Airlie, Joerg Roedel,
Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
Paolo Bonzini, Radim Krčmář,
linux-kernel, linux-mm
On Sat 16-12-17 15:21:51, Tetsuo Handa wrote:
> On 2017/12/16 1:25, Michal Hocko wrote:
> >> struct mmu_notifier_ops {
> >> + /*
> >> + * Flags to specify behavior of callbacks for this MMU notifier.
> >> + * Used to determine which context an operation may be called.
> >> + *
> >> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> >> + * block
> >> + */
> >> + int flags;
> >
> > This should be more specific IMHO. What do you think about the following
> > wording?
> >
> > invalidate_{start,end,range} doesn't block on any locks which depend
> > directly or indirectly (via lock chain or resources e.g. worker context)
> > on a memory allocation.
>
> I disagree. It needlessly complicates validating the correctness.
But it makes it clear what is the actual semantic.
> What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
Let's talk seriously about a real code. Any mmu notifier doing this is
just crazy and should be fixed.
> schedule_timeout_idle() will not block on any locks which depend directly or
> indirectly on a memory allocation, but we are already blocking other memory
> allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().
Then the reaper will block and progress would be slower.
> This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
> SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
> by all other allocating threads" lockup problem. The OOM reaper does not want to get
> blocked for so long.
Yes, it absolutely doesn't want to do that. MMu notifiers should be
reasonable because they are called from performance sensitive call
paths.
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-14 21:30 ` David Rientjes
@ 2017-12-15 23:04 ` Andrew Morton
-1 siblings, 0 replies; 53+ messages in thread
From: Andrew Morton @ 2017-12-15 23:04 UTC (permalink / raw)
To: David Rientjes
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
some tweakage, please review.
From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: David Rientjes <rientjes@google.com>
Cc: Dimitri Sivanich <sivanich@hpe.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Oded Gabbay <oded.gabbay@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mmu_notifier.h | 7 ++++---
mm/mmu_notifier.c | 8 ++++----
2 files changed, 8 insertions(+), 7 deletions(-)
diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/include/linux/mmu_notifier.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
+#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
@@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
-extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
{
}
-static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
{
- return 0;
+ return false;
}
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
--- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/mm/mmu_notifier.c
@@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
* Must be called while holding mm->mmap_sem for either read or write.
* The result is guaranteed to be valid until mm->mmap_sem is dropped.
*/
-int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
{
struct mmu_notifier *mn;
int id;
- int ret = 0;
+ bool ret = false;
- WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
if (!mm_has_notifiers(mm))
return ret;
@@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
continue;
if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = 1;
+ ret = true;
break;
}
}
_
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 23:04 ` Andrew Morton
0 siblings, 0 replies; 53+ messages in thread
From: Andrew Morton @ 2017-12-15 23:04 UTC (permalink / raw)
To: David Rientjes
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
some tweakage, please review.
From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: David Rientjes <rientjes@google.com>
Cc: Dimitri Sivanich <sivanich@hpe.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Oded Gabbay <oded.gabbay@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mmu_notifier.h | 7 ++++---
mm/mmu_notifier.c | 8 ++++----
2 files changed, 8 insertions(+), 7 deletions(-)
diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/include/linux/mmu_notifier.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
+#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
@@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
-extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
{
}
-static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
{
- return 0;
+ return false;
}
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
--- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/mm/mmu_notifier.c
@@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
* Must be called while holding mm->mmap_sem for either read or write.
* The result is guaranteed to be valid until mm->mmap_sem is dropped.
*/
-int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
{
struct mmu_notifier *mn;
int id;
- int ret = 0;
+ bool ret = false;
- WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
if (!mm_has_notifiers(mm))
return ret;
@@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
continue;
if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = 1;
+ ret = true;
break;
}
}
_
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-15 23:04 ` Andrew Morton
@ 2017-12-16 7:14 ` Tetsuo Handa
-1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 7:14 UTC (permalink / raw)
To: Andrew Morton, David Rientjes
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On 2017/12/16 8:04, Andrew Morton wrote:
>> The implementation is steered toward an expensive slowpath, such as after
>> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> some tweakage, please review.
>
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
>
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
>
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> * Must be called while holding mm->mmap_sem for either read or write.
> * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> struct mmu_notifier *mn;
> int id;
> - int ret = 0;
> + bool ret = false;
>
> - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>
> if (!mm_has_notifiers(mm))
> return ret;
rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
If rwsem_is_locked() returns true because somebody else has locked it, there is
no guarantee that current thread has locked it before calling this function.
down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
What if somebody else held it for read or write (the worst case is registration path),
down_write_trylock() will return false even if current thread has not locked it for
read or write.
I think this WARN_ON_ONCE() can not detect incorrect call to this function.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 7:14 ` Tetsuo Handa
0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 7:14 UTC (permalink / raw)
To: Andrew Morton, David Rientjes
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On 2017/12/16 8:04, Andrew Morton wrote:
>> The implementation is steered toward an expensive slowpath, such as after
>> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> some tweakage, please review.
>
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
>
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
>
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> * Must be called while holding mm->mmap_sem for either read or write.
> * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> struct mmu_notifier *mn;
> int id;
> - int ret = 0;
> + bool ret = false;
>
> - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>
> if (!mm_has_notifiers(mm))
> return ret;
rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
If rwsem_is_locked() returns true because somebody else has locked it, there is
no guarantee that current thread has locked it before calling this function.
down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
What if somebody else held it for read or write (the worst case is registration path),
down_write_trylock() will return false even if current thread has not locked it for
read or write.
I think this WARN_ON_ONCE() can not detect incorrect call to this function.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-16 7:14 ` Tetsuo Handa
@ 2017-12-16 11:36 ` Michal Hocko
-1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:36 UTC (permalink / raw)
To: Tetsuo Handa
Cc: Andrew Morton, David Rientjes, Andrea Arcangeli,
Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
Alex Deucher, Christian König, David Airlie, Joerg Roedel,
Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
Paolo Bonzini, Radim Krčmář,
linux-kernel, linux-mm
On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> On 2017/12/16 8:04, Andrew Morton wrote:
> >> The implementation is steered toward an expensive slowpath, such as after
> >> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> >
> > some tweakage, please review.
> >
> > From: Andrew Morton <akpm@linux-foundation.org>
> > Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> >
> > make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> >
>
> > @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> > * Must be called while holding mm->mmap_sem for either read or write.
> > * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> > */
> > -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > {
> > struct mmu_notifier *mn;
> > int id;
> > - int ret = 0;
> > + bool ret = false;
> >
> > - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> > + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
> >
> > if (!mm_has_notifiers(mm))
> > return ret;
>
> rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> If rwsem_is_locked() returns true because somebody else has locked it, there is
> no guarantee that current thread has locked it before calling this function.
>
> down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> What if somebody else held it for read or write (the worst case is registration path),
> down_write_trylock() will return false even if current thread has not locked it for
> read or write.
>
> I think this WARN_ON_ONCE() can not detect incorrect call to this function.
Yes it cannot catch _all_ cases. This is an inherent problem of
rwsem_is_locked because semaphores do not really have the owner concept.
The core idea behind this, I guess, is to catch obviously incorrect
usage and as such it gives us a reasonable coverage. I could live without
the annotation but rwsem_is_locked looks better than down_write_trylock
to me.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 11:36 ` Michal Hocko
0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:36 UTC (permalink / raw)
To: Tetsuo Handa
Cc: Andrew Morton, David Rientjes, Andrea Arcangeli,
Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
Alex Deucher, Christian König, David Airlie, Joerg Roedel,
Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
Paolo Bonzini, Radim Krčmář,
linux-kernel, linux-mm
On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> On 2017/12/16 8:04, Andrew Morton wrote:
> >> The implementation is steered toward an expensive slowpath, such as after
> >> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> >
> > some tweakage, please review.
> >
> > From: Andrew Morton <akpm@linux-foundation.org>
> > Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> >
> > make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> >
>
> > @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> > * Must be called while holding mm->mmap_sem for either read or write.
> > * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> > */
> > -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > {
> > struct mmu_notifier *mn;
> > int id;
> > - int ret = 0;
> > + bool ret = false;
> >
> > - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> > + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
> >
> > if (!mm_has_notifiers(mm))
> > return ret;
>
> rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> If rwsem_is_locked() returns true because somebody else has locked it, there is
> no guarantee that current thread has locked it before calling this function.
>
> down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> What if somebody else held it for read or write (the worst case is registration path),
> down_write_trylock() will return false even if current thread has not locked it for
> read or write.
>
> I think this WARN_ON_ONCE() can not detect incorrect call to this function.
Yes it cannot catch _all_ cases. This is an inherent problem of
rwsem_is_locked because semaphores do not really have the owner concept.
The core idea behind this, I guess, is to catch obviously incorrect
usage and as such it gives us a reasonable coverage. I could live without
the annotation but rwsem_is_locked looks better than down_write_trylock
to me.
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-16 11:36 ` Michal Hocko
@ 2017-12-16 14:45 ` Tetsuo Handa
-1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 14:45 UTC (permalink / raw)
To: mhocko
Cc: akpm, rientjes, aarcange, benh, paulus, oded.gabbay,
alexander.deucher, christian.koenig, airlied, joro, dledford,
jani.nikula, mike.marciniszyn, sean.hefty, sivanich,
boris.ostrovsky, jglisse, pbonzini, rkrcmar, linux-kernel,
linux-mm
Michal Hocko wrote:
> On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> > rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> > If rwsem_is_locked() returns true because somebody else has locked it, there is
> > no guarantee that current thread has locked it before calling this function.
> >
> > down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> > What if somebody else held it for read or write (the worst case is registration path),
> > down_write_trylock() will return false even if current thread has not locked it for
> > read or write.
> >
> > I think this WARN_ON_ONCE() can not detect incorrect call to this function.
>
> Yes it cannot catch _all_ cases. This is an inherent problem of
> rwsem_is_locked because semaphores do not really have the owner concept.
> The core idea behind this, I guess, is to catch obviously incorrect
> usage and as such it gives us a reasonabe coverage. I could live without
> the annotation but rwsem_is_locked looks better than down_write_trylock
> to me.
I agree that rwsem_is_locked() is better than down_write_trylock() because
the former does not have side effect when nobody was holding the rwsem.
Looking at how rwsem_is_locked() is used in mm/ directory for sanity checks,
only VM_BUG_ON(), VM_BUG_ON_MM(), or VM_BUG_ON_VMA() are used. Therefore,
this WARN_ON_ONCE() usage might be irregular.
Also, regarding the problem that semaphores do not really have the owner
concept, we can add "struct task_struct *owner_of_mmap_sem_for_write" to
"struct mm_struct" and replace direct down_write_killable() etc. with
corresponding wrapper functions like
int __must_check get_mmap_sem_write_killable(struct mm_struct *mm) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
mm->owner_of_mmap_sem_for_write = current;
return 0;
}
and make the rwsem_is_locked() test more robust by doing like
bool mmap_sem_is_held_for_write_by_current(struct mm_struct *mm) {
return mm->owner_of_mmap_sem_for_write == current;
}
. If there is a guarantee that no thread is allowed to hold multiple
mmap_sem, wrapper functions which manipulate per "struct task_struct"
flag will work.
But the fundamental problem is that we are heavily relying on runtime
testing (e.g. lockdep / syzkaller). Since there are a lot of factors which
prevent sanity checks from being called (e.g. conditional calls based on
threshold check), we can not exercise all paths, and everybody is making
changes without understanding all the dependencies. Consider that nobody
noticed that relying on __GFP_DIRECT_RECLAIM with oom_lock held may cause
lockups. We are too easily introducing unsafe dependency. I think that we
need to describe all the dependencies without relying on runtime testing.
Back to MMU_INVALIDATE_DOES_NOT_BLOCK flag, I worry that we will fail to
notice when somebody in future makes changes with mmu notifier which
currently does not rely on __GFP_DIRECT_RECLAIM to mistakenly rely on
__GFP_DIRECT_RECLAIM. Staying at "whether the callback might sleep"
granularity will help preventing such unnoticed dependency bugs from
being introduced.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 14:45 ` Tetsuo Handa
0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 14:45 UTC (permalink / raw)
To: mhocko
Cc: akpm, rientjes, aarcange, benh, paulus, oded.gabbay,
alexander.deucher, christian.koenig, airlied, joro, dledford,
jani.nikula, mike.marciniszyn, sean.hefty, sivanich,
boris.ostrovsky, jglisse, pbonzini, rkrcmar, linux-kernel,
linux-mm
Michal Hocko wrote:
> On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> > rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> > If rwsem_is_locked() returns true because somebody else has locked it, there is
> > no guarantee that current thread has locked it before calling this function.
> >
> > down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> > What if somebody else held it for read or write (the worst case is registration path),
> > down_write_trylock() will return false even if current thread has not locked it for
> > read or write.
> >
> > I think this WARN_ON_ONCE() can not detect incorrect call to this function.
>
> Yes it cannot catch _all_ cases. This is an inherent problem of
> rwsem_is_locked because semaphores do not really have the owner concept.
> The core idea behind this, I guess, is to catch obviously incorrect
> usage and as such it gives us a reasonabe coverage. I could live without
> the annotation but rwsem_is_locked looks better than down_write_trylock
> to me.
I agree that rwsem_is_locked() is better than down_write_trylock() because
the former does not have side effect when nobody was holding the rwsem.
Looking at how rwsem_is_locked() is used in mm/ directory for sanity checks,
only VM_BUG_ON(), VM_BUG_ON_MM(), or VM_BUG_ON_VMA() are used. Therefore,
this WARN_ON_ONCE() usage might be irregular.
Also, regarding the problem that semaphores do not really have the owner
concept, we can add "struct task_struct *owner_of_mmap_sem_for_write" to
"struct mm_struct" and replace direct down_write_killable() etc. with
corresponding wrapper functions like
int __must_check get_mmap_sem_write_killable(struct mm_struct *mm) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
mm->owner_of_mmap_sem_for_write = current;
return 0;
}
and make the rwsem_is_locked() test more robust by doing like
bool mmap_sem_is_held_for_write_by_current(struct mm_struct *mm) {
return mm->owner_of_mmap_sem_for_write == current;
}
. If there is a guarantee that no thread is allowed to hold multiple
mmap_sem, wrapper functions which manipulate per "struct task_struct"
flag will work.
But the fundamental problem is that we are heavily relying on runtime
testing (e.g. lockdep / syzkaller). Since there are a lot of factors which
prevent sanity checks from being called (e.g. conditional calls based on
threshold check), we can not exercise all paths, and everybody is making
changes without understanding all the dependencies. Consider that nobody
noticed that relying on __GFP_DIRECT_RECLAIM with oom_lock held may cause
lockups. We are too easily introducing unsafe dependency. I think that we
need to describe all the dependencies without relying on runtime testing.
Back to MMU_INVALIDATE_DOES_NOT_BLOCK flag, I worry that we will fail to
notice when somebody in future makes changes with mmu notifier which
currently does not rely on __GFP_DIRECT_RECLAIM to mistakenly rely on
__GFP_DIRECT_RECLAIM. Staying at "whether the callback might sleep"
granularity will help preventing such unnoticed dependency bugs from
being introduced.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
2017-12-15 23:04 ` Andrew Morton
@ 2017-12-16 9:10 ` Michal Hocko
-1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 9:10 UTC (permalink / raw)
To: Andrew Morton
Cc: David Rientjes, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Fri 15-12-17 15:04:29, Andrew Morton wrote:
> On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
>
> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> >
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> >
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> > to mmu notifier ops that can set a bit to indicate that these callbacks do
> > not block.
> >
> > The implementation is steered toward an expensive slowpath, such as after
> > the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> some tweakage, please review.
>
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
>
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
Yes, that makes sense to me.
>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Dimitri Sivanich <sivanich@hpe.com>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: Jani Nikula <jani.nikula@linux.intel.com>
> Cc: Jérôme Glisse <jglisse@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
> Cc: Oded Gabbay <oded.gabbay@gmail.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Sean Hefty <sean.hefty@intel.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
> include/linux/mmu_notifier.h | 7 ++++---
> mm/mmu_notifier.c | 8 ++++----
> 2 files changed, 8 insertions(+), 7 deletions(-)
>
> diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/include/linux/mmu_notifier.h
> @@ -2,6 +2,7 @@
> #ifndef _LINUX_MMU_NOTIFIER_H
> #define _LINUX_MMU_NOTIFIER_H
>
> +#include <linux/types.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> #include <linux/mm_types.h>
> @@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> -extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
> +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
> {
> }
>
> -static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> - return 0;
> + return false;
> }
>
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/mm/mmu_notifier.c
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> * Must be called while holding mm->mmap_sem for either read or write.
> * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> struct mmu_notifier *mn;
> int id;
> - int ret = 0;
> + bool ret = false;
>
> - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>
> if (!mm_has_notifiers(mm))
> return ret;
> @@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
> continue;
>
> if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> - ret = 1;
> + ret = true;
> break;
> }
> }
> _
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 9:10 ` Michal Hocko
0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 9:10 UTC (permalink / raw)
To: Andrew Morton
Cc: David Rientjes, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
On Fri 15-12-17 15:04:29, Andrew Morton wrote:
> On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
>
> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> >
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> >
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> > to mmu notifier ops that can set a bit to indicate that these callbacks do
> > not block.
> >
> > The implementation is steered toward an expensive slowpath, such as after
> > the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> some tweakage, please review.
>
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
>
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
Yes, that makes sense to me.
>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Dimitri Sivanich <sivanich@hpe.com>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: Jani Nikula <jani.nikula@linux.intel.com>
> Cc: Jérôme Glisse <jglisse@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
> Cc: Oded Gabbay <oded.gabbay@gmail.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Sean Hefty <sean.hefty@intel.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
> include/linux/mmu_notifier.h | 7 ++++---
> mm/mmu_notifier.c | 8 ++++----
> 2 files changed, 8 insertions(+), 7 deletions(-)
>
> diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/include/linux/mmu_notifier.h
> @@ -2,6 +2,7 @@
> #ifndef _LINUX_MMU_NOTIFIER_H
> #define _LINUX_MMU_NOTIFIER_H
>
> +#include <linux/types.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> #include <linux/mm_types.h>
> @@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> -extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
> +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
> {
> }
>
> -static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> - return 0;
> + return false;
> }
>
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/mm/mmu_notifier.c
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> * Must be called while holding mm->mmap_sem for either read or write.
> * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> struct mmu_notifier *mn;
> int id;
> - int ret = 0;
> + bool ret = false;
>
> - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>
> if (!mm_has_notifiers(mm))
> return ret;
> @@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
> continue;
>
> if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> - ret = 1;
> + ret = true;
> break;
> }
> }
> _
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 53+ messages in thread
* [patch -mm] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks fix fix
2017-12-15 23:04 ` Andrew Morton
@ 2018-01-09 21:40 ` David Rientjes
0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2018-01-09 21:40 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
Fix mmu_notifier.h comments in "mm, mmu_notifier: annotate mmu notifiers
with blockable invalidate callbacks".
mmu_notifier_invalidate_range_end() can also call the invalidate_range()
callback, so it must not block for MMU_INVALIDATE_DOES_NOT_BLOCK to be
set.
Also remove a bogus comment about invalidate_range() always being called
under the ptl spinlock.
Signed-off-by: David Rientjes <rientjes@google.com>
---
include/linux/mmu_notifier.h | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -34,8 +34,8 @@ struct mmu_notifier_ops {
* Flags to specify behavior of callbacks for this MMU notifier.
* Used to determine which context an operation may be called.
*
- * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
- * block
+ * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+ * block
*/
int flags;
@@ -151,8 +151,9 @@ struct mmu_notifier_ops {
* address space but may still be referenced by sptes until
* the last refcount is dropped.
*
- * If both of these callbacks cannot block, mmu_notifier_ops.flags
- * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
+ * If both of these callbacks cannot block, and invalidate_range
+ * cannot block, mmu_notifier_ops.flags should have
+ * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -175,12 +176,13 @@ struct mmu_notifier_ops {
* external TLB range needs to be flushed. For more in depth
* discussion on this see Documentation/vm/mmu_notifier.txt
*
- * The invalidate_range() function is called under the ptl
- * spin-lock and not allowed to sleep.
- *
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
+ *
+ * If this callback cannot block, and invalidate_range_{start,end}
+ * cannot block, mmu_notifier_ops.flags should have
+ * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
^ permalink raw reply [flat|nested] 53+ messages in thread
* [patch -mm] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks fix fix
@ 2018-01-09 21:40 ` David Rientjes
0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2018-01-09 21:40 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
Jérôme Glisse, Paolo Bonzini,
Radim Krčmář,
linux-kernel, linux-mm
Fix mmu_notifier.h comments in "mm, mmu_notifier: annotate mmu notifiers
with blockable invalidate callbacks".
mmu_notifier_invalidate_range_end() can also call the invalidate_range()
callback, so it must not block for MMU_INVALIDATE_DOES_NOT_BLOCK to be
set.
Also remove a bogus comment about invalidate_range() always being called
under the ptl spinlock.
Signed-off-by: David Rientjes <rientjes@google.com>
---
include/linux/mmu_notifier.h | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -34,8 +34,8 @@ struct mmu_notifier_ops {
* Flags to specify behavior of callbacks for this MMU notifier.
* Used to determine which context an operation may be called.
*
- * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
- * block
+ * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+ * block
*/
int flags;
@@ -151,8 +151,9 @@ struct mmu_notifier_ops {
* address space but may still be referenced by sptes until
* the last refcount is dropped.
*
- * If both of these callbacks cannot block, mmu_notifier_ops.flags
- * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
+ * If both of these callbacks cannot block, and invalidate_range
+ * cannot block, mmu_notifier_ops.flags should have
+ * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -175,12 +176,13 @@ struct mmu_notifier_ops {
* external TLB range needs to be flushed. For more in depth
* discussion on this see Documentation/vm/mmu_notifier.txt
*
- * The invalidate_range() function is called under the ptl
- * spin-lock and not allowed to sleep.
- *
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
+ *
+ * If this callback cannot block, and invalidate_range_{start,end}
+ * cannot block, mmu_notifier_ops.flags should have
+ * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 53+ messages in thread