All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-11 22:11 ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-11 22:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory with the
oom reaper when the oom victim mm had mmu notifiers registered.

The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.

That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
for mmu notifiers that can set a bit to indicate that these callbacks do
block.

The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 arch/powerpc/platforms/powernv/npu-dma.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c   |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  1 +
 drivers/gpu/drm/i915/i915_gem_userptr.c  |  1 +
 drivers/gpu/drm/radeon/radeon_mn.c       |  5 +++--
 drivers/infiniband/core/umem_odp.c       |  1 +
 drivers/infiniband/hw/hfi1/mmu_rb.c      |  1 +
 drivers/iommu/amd_iommu_v2.c             |  1 +
 drivers/iommu/intel-svm.c                |  1 +
 drivers/misc/mic/scif/scif_dma.c         |  1 +
 drivers/misc/sgi-gru/grutlbpurge.c       |  1 +
 drivers/xen/gntdev.c                     |  1 +
 include/linux/mmu_notifier.h             | 13 +++++++++++++
 mm/hmm.c                                 |  1 +
 mm/mmu_notifier.c                        | 25 +++++++++++++++++++++++++
 virt/kvm/kvm_main.c                      |  1 +
 16 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -710,6 +710,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 
 		mm->context.npu_context = npu_context;
 		npu_context->mm = mm;
+		npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 		npu_context->mn.ops = &nv_nmmu_notifier_ops;
 		__mmu_notifier_register(&npu_context->mn, mm);
 		kref_init(&npu_context->kref);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -276,6 +276,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
 
 	rmn->adev = adev;
 	rmn->mm = mm;
+	rmn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 	rmn->mn.ops = &amdgpu_mn_ops;
 	init_rwsem(&rmn->lock);
 	rmn->objects = RB_ROOT_CACHED;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -282,6 +282,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	process->mm = thread->mm;
 
 	/* register notifier */
+	process->mmu_notifier.flags = 0;
 	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
 	err = __mmu_notifier_register(&process->mmu_notifier, process->mm);
 	if (err)
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -170,6 +170,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
 		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&mn->lock);
+	mn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 	mn->mn.ops = &i915_gem_userptr_notifier;
 	mn->objects = RB_ROOT_CACHED;
 	mn->wq = alloc_workqueue("i915-userptr-release",
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -164,7 +164,7 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 			radeon_bo_unreserve(bo);
 		}
 	}
-	
+
 	mutex_unlock(&rmn->lock);
 }
 
@@ -203,10 +203,11 @@ static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
 
 	rmn->rdev = rdev;
 	rmn->mm = mm;
+	rmn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 	rmn->mn.ops = &radeon_mn_ops;
 	mutex_init(&rmn->lock);
 	rmn->objects = RB_ROOT_CACHED;
-	
+
 	r = __mmu_notifier_register(&rmn->mn, mm);
 	if (r)
 		goto free_rmn;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -411,6 +411,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
 		 */
 		atomic_set(&context->notifier_count, 0);
 		INIT_HLIST_NODE(&context->mn.hlist);
+		context->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 		context->mn.ops = &ib_umem_notifiers;
 		/*
 		 * Lock-dep detects a false positive for mmap_sem vs.
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -110,6 +110,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
 	handlr->ops_arg = ops_arg;
 	INIT_HLIST_NODE(&handlr->mn.hlist);
 	spin_lock_init(&handlr->lock);
+	handlr->mn.flags = 0;
 	handlr->mn.ops = &mn_opts;
 	handlr->mm = mm;
 	INIT_WORK(&handlr->del_work, handle_remove);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -671,6 +671,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
 	pasid_state->pasid        = pasid;
 	pasid_state->invalid      = true; /* Mark as valid only if we are
 					     done with setting up the pasid */
+	pasid_state->mn.flags     = 0;
 	pasid_state->mn.ops       = &iommu_mn;
 
 	if (pasid_state->mm == NULL)
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -382,6 +382,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 			goto out;
 		}
 		svm->pasid = ret;
+		svm->notifier.flags = 0;
 		svm->notifier.ops = &intel_mmuops;
 		svm->mm = mm;
 		svm->flags = flags;
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -249,6 +249,7 @@ static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn,
 {
 	mmn->ep = ep;
 	mmn->mm = mm;
+	mmn->ep_mmu_notifier.flags = MMU_INVALIDATE_MAY_BLOCK;
 	mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
 	INIT_LIST_HEAD(&mmn->list);
 	INIT_LIST_HEAD(&mmn->tc_reg_list);
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
 			return ERR_PTR(-ENOMEM);
 		STAT(gms_alloc);
 		spin_lock_init(&gms->ms_asid_lock);
+		gms->ms_notifier.flags = 0;
 		gms->ms_notifier.ops = &gru_mmuops;
 		atomic_set(&gms->ms_refcnt, 1);
 		init_waitqueue_head(&gms->ms_wait_queue);
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -539,6 +539,7 @@ static int gntdev_open(struct inode *inode, struct file *flip)
 			kfree(priv);
 			return -ENOMEM;
 		}
+		priv->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 		priv->mn.ops = &gntdev_mmu_ops;
 		ret = mmu_notifier_register(&priv->mn, priv->mm);
 		mmput(priv->mm);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/* This mmu notifier's invalidate_{start,end}() callbacks may block */
+#define MMU_INVALIDATE_MAY_BLOCK	(0x01)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -137,6 +140,9 @@ struct mmu_notifier_ops {
 	 * page. Pages will no longer be referenced by the linux
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
+	 *
+	 * If either of these callbacks can block, the mmu_notifier.flags
+	 * must have MMU_INVALIDATE_MAY_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -182,6 +188,7 @@ struct mmu_notifier_ops {
  * 3. No other concurrent thread can access the list (release)
  */
 struct mmu_notifier {
+	int flags;
 	struct hlist_node hlist;
 	const struct mmu_notifier_ops *ops;
 };
@@ -218,6 +225,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -457,6 +465,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
diff --git a/mm/hmm.c b/mm/hmm.c
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -104,6 +104,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
 	 * We should only get here if hold the mmap_sem in write mode ie on
 	 * registration of first mirror through hmm_mirror_register()
 	 */
+	hmm->mmu_notifier.flags = MMU_INVALIDATE_MAY_BLOCK;
 	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
 	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
 		kfree(hmm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,31 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	int id;
+	int ret = 0;
+
+	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+	if (!mm_has_notifiers(mm))
+		return ret;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+		if (mn->flags & MMU_INVALIDATE_MAY_BLOCK) {
+			ret = 1;
+			break;
+		}
+	srcu_read_unlock(&srcu, id);
+	return ret;
+}
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -487,6 +487,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
+	kvm->mmu_notifier.flags = 0;
 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-11 22:11 ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-11 22:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory with the
oom reaper when the oom victim mm had mmu notifiers registered.

The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.

That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
for mmu notifiers that can set a bit to indicate that these callbacks do
block.

The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 arch/powerpc/platforms/powernv/npu-dma.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c   |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  1 +
 drivers/gpu/drm/i915/i915_gem_userptr.c  |  1 +
 drivers/gpu/drm/radeon/radeon_mn.c       |  5 +++--
 drivers/infiniband/core/umem_odp.c       |  1 +
 drivers/infiniband/hw/hfi1/mmu_rb.c      |  1 +
 drivers/iommu/amd_iommu_v2.c             |  1 +
 drivers/iommu/intel-svm.c                |  1 +
 drivers/misc/mic/scif/scif_dma.c         |  1 +
 drivers/misc/sgi-gru/grutlbpurge.c       |  1 +
 drivers/xen/gntdev.c                     |  1 +
 include/linux/mmu_notifier.h             | 13 +++++++++++++
 mm/hmm.c                                 |  1 +
 mm/mmu_notifier.c                        | 25 +++++++++++++++++++++++++
 virt/kvm/kvm_main.c                      |  1 +
 16 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -710,6 +710,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 
 		mm->context.npu_context = npu_context;
 		npu_context->mm = mm;
+		npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 		npu_context->mn.ops = &nv_nmmu_notifier_ops;
 		__mmu_notifier_register(&npu_context->mn, mm);
 		kref_init(&npu_context->kref);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -276,6 +276,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
 
 	rmn->adev = adev;
 	rmn->mm = mm;
+	rmn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 	rmn->mn.ops = &amdgpu_mn_ops;
 	init_rwsem(&rmn->lock);
 	rmn->objects = RB_ROOT_CACHED;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -282,6 +282,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	process->mm = thread->mm;
 
 	/* register notifier */
+	process->mmu_notifier.flags = 0;
 	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
 	err = __mmu_notifier_register(&process->mmu_notifier, process->mm);
 	if (err)
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -170,6 +170,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
 		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&mn->lock);
+	mn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 	mn->mn.ops = &i915_gem_userptr_notifier;
 	mn->objects = RB_ROOT_CACHED;
 	mn->wq = alloc_workqueue("i915-userptr-release",
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -164,7 +164,7 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 			radeon_bo_unreserve(bo);
 		}
 	}
-	
+
 	mutex_unlock(&rmn->lock);
 }
 
@@ -203,10 +203,11 @@ static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
 
 	rmn->rdev = rdev;
 	rmn->mm = mm;
+	rmn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 	rmn->mn.ops = &radeon_mn_ops;
 	mutex_init(&rmn->lock);
 	rmn->objects = RB_ROOT_CACHED;
-	
+
 	r = __mmu_notifier_register(&rmn->mn, mm);
 	if (r)
 		goto free_rmn;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -411,6 +411,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
 		 */
 		atomic_set(&context->notifier_count, 0);
 		INIT_HLIST_NODE(&context->mn.hlist);
+		context->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 		context->mn.ops = &ib_umem_notifiers;
 		/*
 		 * Lock-dep detects a false positive for mmap_sem vs.
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -110,6 +110,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
 	handlr->ops_arg = ops_arg;
 	INIT_HLIST_NODE(&handlr->mn.hlist);
 	spin_lock_init(&handlr->lock);
+	handlr->mn.flags = 0;
 	handlr->mn.ops = &mn_opts;
 	handlr->mm = mm;
 	INIT_WORK(&handlr->del_work, handle_remove);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -671,6 +671,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
 	pasid_state->pasid        = pasid;
 	pasid_state->invalid      = true; /* Mark as valid only if we are
 					     done with setting up the pasid */
+	pasid_state->mn.flags     = 0;
 	pasid_state->mn.ops       = &iommu_mn;
 
 	if (pasid_state->mm == NULL)
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -382,6 +382,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 			goto out;
 		}
 		svm->pasid = ret;
+		svm->notifier.flags = 0;
 		svm->notifier.ops = &intel_mmuops;
 		svm->mm = mm;
 		svm->flags = flags;
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -249,6 +249,7 @@ static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn,
 {
 	mmn->ep = ep;
 	mmn->mm = mm;
+	mmn->ep_mmu_notifier.flags = MMU_INVALIDATE_MAY_BLOCK;
 	mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
 	INIT_LIST_HEAD(&mmn->list);
 	INIT_LIST_HEAD(&mmn->tc_reg_list);
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
 			return ERR_PTR(-ENOMEM);
 		STAT(gms_alloc);
 		spin_lock_init(&gms->ms_asid_lock);
+		gms->ms_notifier.flags = 0;
 		gms->ms_notifier.ops = &gru_mmuops;
 		atomic_set(&gms->ms_refcnt, 1);
 		init_waitqueue_head(&gms->ms_wait_queue);
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -539,6 +539,7 @@ static int gntdev_open(struct inode *inode, struct file *flip)
 			kfree(priv);
 			return -ENOMEM;
 		}
+		priv->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
 		priv->mn.ops = &gntdev_mmu_ops;
 		ret = mmu_notifier_register(&priv->mn, priv->mm);
 		mmput(priv->mm);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/* This mmu notifier's invalidate_{start,end}() callbacks may block */
+#define MMU_INVALIDATE_MAY_BLOCK	(0x01)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -137,6 +140,9 @@ struct mmu_notifier_ops {
 	 * page. Pages will no longer be referenced by the linux
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
+	 *
+	 * If either of these callbacks can block, the mmu_notifier.flags
+	 * must have MMU_INVALIDATE_MAY_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -182,6 +188,7 @@ struct mmu_notifier_ops {
  * 3. No other concurrent thread can access the list (release)
  */
 struct mmu_notifier {
+	int flags;
 	struct hlist_node hlist;
 	const struct mmu_notifier_ops *ops;
 };
@@ -218,6 +225,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -457,6 +465,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
diff --git a/mm/hmm.c b/mm/hmm.c
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -104,6 +104,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
 	 * We should only get here if hold the mmap_sem in write mode ie on
 	 * registration of first mirror through hmm_mirror_register()
 	 */
+	hmm->mmu_notifier.flags = MMU_INVALIDATE_MAY_BLOCK;
 	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
 	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
 		kfree(hmm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,31 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	int id;
+	int ret = 0;
+
+	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+	if (!mm_has_notifiers(mm))
+		return ret;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+		if (mn->flags & MMU_INVALIDATE_MAY_BLOCK) {
+			ret = 1;
+			break;
+		}
+	srcu_read_unlock(&srcu, id);
+	return ret;
+}
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -487,6 +487,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
+	kvm->mmu_notifier.flags = 0;
 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
  2017-12-11 22:11 ` David Rientjes
@ 2017-12-11 22:11   ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-11 22:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
callbacks are used around unmap_page_range().

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	}
 
 	/*
-	 * If the mm has notifiers then we would need to invalidate them around
-	 * unmap_page_range and that is risky because notifiers can sleep and
-	 * what they do is basically undeterministic.  So let's have a short
+	 * If the mm has invalidate_{start,end}() notifiers that could block,
 	 * sleep to give the oom victim some more time.
 	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time and add
-	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 * notifiers cannot block for unbounded amount of time
 	 */
-	if (mm_has_notifiers(mm)) {
+	if (mm_has_blockable_invalidate_notifiers(mm)) {
 		up_read(&mm->mmap_sem);
 		schedule_timeout_idle(HZ);
 		goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
-			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-					 NULL);
-			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-11 22:11   ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-11 22:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
callbacks are used around unmap_page_range().

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	}
 
 	/*
-	 * If the mm has notifiers then we would need to invalidate them around
-	 * unmap_page_range and that is risky because notifiers can sleep and
-	 * what they do is basically undeterministic.  So let's have a short
+	 * If the mm has invalidate_{start,end}() notifiers that could block,
 	 * sleep to give the oom victim some more time.
 	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time and add
-	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 * notifiers cannot block for unbounded amount of time
 	 */
-	if (mm_has_notifiers(mm)) {
+	if (mm_has_blockable_invalidate_notifiers(mm)) {
 		up_read(&mm->mmap_sem);
 		schedule_timeout_idle(HZ);
 		goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
-			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-					 NULL);
-			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-11 22:11 ` David Rientjes
@ 2017-12-11 22:23   ` Paolo Bonzini
  -1 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-11 22:23 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Radim Krčmář,
	linux-kernel, linux-mm

On 11/12/2017 23:11, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> for mmu notifiers that can set a bit to indicate that these callbacks do
> block.

Why not put the flag in the ops, since the same ops should always be
either blockable or unblockable?

Paolo

> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-11 22:23   ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-11 22:23 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Radim Krčmář,
	linux-kernel, linux-mm

On 11/12/2017 23:11, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> for mmu notifiers that can set a bit to indicate that these callbacks do
> block.

Why not put the flag in the ops, since the same ops should always be
either blockable or unblockable?

Paolo

> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-11 22:23   ` Paolo Bonzini
@ 2017-12-11 23:09     ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-11 23:09 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Radim Krčmář,
	linux-kernel, linux-mm

On Mon, 11 Dec 2017, Paolo Bonzini wrote:

> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> > 
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> > 
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> > for mmu notifiers that can set a bit to indicate that these callbacks do
> > block.
> 
> Why not put the flag in the ops, since the same ops should always be
> either blockable or unblockable?
> 

Hi Paolo,

It certainly can be in mmu_notifier_ops, the only rationale for putting 
the flags member in mmu_notifier was that it may become generally useful 
later for things other than callbacks.  I'm indifferent to where it is 
placed and will happily move it if that's desired, absent any other 
feedback on other parts of the patchset.

Thanks.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-11 23:09     ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-11 23:09 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Radim Krčmář,
	linux-kernel, linux-mm

On Mon, 11 Dec 2017, Paolo Bonzini wrote:

> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> > 
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> > 
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> > for mmu notifiers that can set a bit to indicate that these callbacks do
> > block.
> 
> Why not put the flag in the ops, since the same ops should always be
> either blockable or unblockable?
> 

Hi Paolo,

It certainly can be in mmu_notifier_ops, the only rationale for putting 
the flags member in mmu_notifier was that it may become generally useful 
later for things other than callbacks.  I'm indifferent to where it is 
placed and will happily move it if that's desired, absent any other 
feedback on other parts of the patchset.

Thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-11 22:11 ` David Rientjes
@ 2017-12-12 20:05   ` Dimitri Sivanich
  -1 siblings, 0 replies; 53+ messages in thread
From: Dimitri Sivanich @ 2017-12-12 20:05 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Mon, Dec 11, 2017 at 02:11:55PM -0800, David Rientjes wrote:
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>  			return ERR_PTR(-ENOMEM);
>  		STAT(gms_alloc);
>  		spin_lock_init(&gms->ms_asid_lock);
> +		gms->ms_notifier.flags = 0;
>  		gms->ms_notifier.ops = &gru_mmuops;
>  		atomic_set(&gms->ms_refcnt, 1);
>  		init_waitqueue_head(&gms->ms_wait_queue);
> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c

There is a kzalloc() just above this:
	gms = kzalloc(sizeof(*gms), GFP_KERNEL);

Is that not sufficient to clear the 'flags' field?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-12 20:05   ` Dimitri Sivanich
  0 siblings, 0 replies; 53+ messages in thread
From: Dimitri Sivanich @ 2017-12-12 20:05 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Mon, Dec 11, 2017 at 02:11:55PM -0800, David Rientjes wrote:
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>  			return ERR_PTR(-ENOMEM);
>  		STAT(gms_alloc);
>  		spin_lock_init(&gms->ms_asid_lock);
> +		gms->ms_notifier.flags = 0;
>  		gms->ms_notifier.ops = &gru_mmuops;
>  		atomic_set(&gms->ms_refcnt, 1);
>  		init_waitqueue_head(&gms->ms_wait_queue);
> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c

There is a kzalloc() just above this:
	gms = kzalloc(sizeof(*gms), GFP_KERNEL);

Is that not sufficient to clear the 'flags' field?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-12 20:05   ` Dimitri Sivanich
@ 2017-12-12 21:28     ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-12 21:28 UTC (permalink / raw)
  To: Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Tue, 12 Dec 2017, Dimitri Sivanich wrote:

> > --- a/drivers/misc/sgi-gru/grutlbpurge.c
> > +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> > @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
> >  			return ERR_PTR(-ENOMEM);
> >  		STAT(gms_alloc);
> >  		spin_lock_init(&gms->ms_asid_lock);
> > +		gms->ms_notifier.flags = 0;
> >  		gms->ms_notifier.ops = &gru_mmuops;
> >  		atomic_set(&gms->ms_refcnt, 1);
> >  		init_waitqueue_head(&gms->ms_wait_queue);
> > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
> 
> There is a kzalloc() just above this:
> 	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
> 
> Is that not sufficient to clear the 'flags' field?
> 

Absolutely, but whether it is better to explicitly document that the mmu 
notifier has cleared flags, i.e. there are no blockable callbacks, is 
another story.  I can change it if preferred.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-12 21:28     ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-12 21:28 UTC (permalink / raw)
  To: Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Tue, 12 Dec 2017, Dimitri Sivanich wrote:

> > --- a/drivers/misc/sgi-gru/grutlbpurge.c
> > +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> > @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
> >  			return ERR_PTR(-ENOMEM);
> >  		STAT(gms_alloc);
> >  		spin_lock_init(&gms->ms_asid_lock);
> > +		gms->ms_notifier.flags = 0;
> >  		gms->ms_notifier.ops = &gru_mmuops;
> >  		atomic_set(&gms->ms_refcnt, 1);
> >  		init_waitqueue_head(&gms->ms_wait_queue);
> > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
> 
> There is a kzalloc() just above this:
> 	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
> 
> Is that not sufficient to clear the 'flags' field?
> 

Absolutely, but whether it is better to explicitly document that the mmu 
notifier has cleared flags, i.e. there are no blockable callbacks, is 
another story.  I can change it if preferred.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-12 21:28     ` David Rientjes
@ 2017-12-13  9:34       ` Christian König
  -1 siblings, 0 replies; 53+ messages in thread
From: Christian König @ 2017-12-13  9:34 UTC (permalink / raw)
  To: David Rientjes, Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Am 12.12.2017 um 22:28 schrieb David Rientjes:
> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>
>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>   			return ERR_PTR(-ENOMEM);
>>>   		STAT(gms_alloc);
>>>   		spin_lock_init(&gms->ms_asid_lock);
>>> +		gms->ms_notifier.flags = 0;
>>>   		gms->ms_notifier.ops = &gru_mmuops;
>>>   		atomic_set(&gms->ms_refcnt, 1);
>>>   		init_waitqueue_head(&gms->ms_wait_queue);
>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>> There is a kzalloc() just above this:
>> 	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>
>> Is that not sufficient to clear the 'flags' field?
>>
> Absolutely, but whether it is better to explicitly document that the mmu
> notifier has cleared flags, i.e. there are no blockable callbacks, is
> another story.  I can change it if preferred.

Actually I would invert the new flag, in other words specify that an MMU 
notifier will never sleep.

The first reason is that we have 8 blocking notifiers and 5 not blocking 
if I counted right. So it is actually more common to sleep than not to.

The second reason is to be conservative and assume the worst, e.g. that 
the flag is forgotten when a new notifier is added.

Regards,
Christian.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-13  9:34       ` Christian König
  0 siblings, 0 replies; 53+ messages in thread
From: Christian König @ 2017-12-13  9:34 UTC (permalink / raw)
  To: David Rientjes, Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Am 12.12.2017 um 22:28 schrieb David Rientjes:
> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>
>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>   			return ERR_PTR(-ENOMEM);
>>>   		STAT(gms_alloc);
>>>   		spin_lock_init(&gms->ms_asid_lock);
>>> +		gms->ms_notifier.flags = 0;
>>>   		gms->ms_notifier.ops = &gru_mmuops;
>>>   		atomic_set(&gms->ms_refcnt, 1);
>>>   		init_waitqueue_head(&gms->ms_wait_queue);
>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>> There is a kzalloc() just above this:
>> 	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>
>> Is that not sufficient to clear the 'flags' field?
>>
> Absolutely, but whether it is better to explicitly document that the mmu
> notifier has cleared flags, i.e. there are no blockable callbacks, is
> another story.  I can change it if preferred.

Actually I would invert the new flag, in other words specify that an MMU 
notifier will never sleep.

The first reason is that we have 8 blocking notifiers and 5 not blocking 
if I counted right. So it is actually more common to sleep than not to.

The second reason is to be conservative and assume the worst, e.g. that 
the flag is forgotten when a new notifier is added.

Regards,
Christian.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-13  9:34       ` Christian König
@ 2017-12-13 10:26         ` Tetsuo Handa
  -1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-13 10:26 UTC (permalink / raw)
  To: Christian König, David Rientjes, Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	LKML, linux-mm

On 2017/12/13 18:34, Christian König wrote:
> Am 12.12.2017 um 22:28 schrieb David Rientjes:
>> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>>
>>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>>               return ERR_PTR(-ENOMEM);
>>>>           STAT(gms_alloc);
>>>>           spin_lock_init(&gms->ms_asid_lock);
>>>> +        gms->ms_notifier.flags = 0;
>>>>           gms->ms_notifier.ops = &gru_mmuops;
>>>>           atomic_set(&gms->ms_refcnt, 1);
>>>>           init_waitqueue_head(&gms->ms_wait_queue);
>>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>>> There is a kzalloc() just above this:
>>>     gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>>
>>> Is that not sufficient to clear the 'flags' field?
>>>
>> Absolutely, but whether it is better to explicitly document that the mmu
>> notifier has cleared flags, i.e. there are no blockable callbacks, is
>> another story.  I can change it if preferred.
> 
> Actually I would invert the new flag, in other words specify that an MMU notifier will never sleep.
> 
> The first reason is that we have 8 blocking notifiers and 5 not blocking if I counted right. So it is actually more common to sleep than not to.
> 
> The second reason is to be conservative and assume the worst, e.g. that the flag is forgotten when a new notifier is added.

I agree. Some out of tree module might forget to set the flags.

Although you don't need to fix out of tree modules, as a troubleshooting
staff at a support center, I want to be able to identify the careless module.

I guess specifying the flags at register function would be the best, for
an attempt to call register function without knowing this change will
simply results in a build failure.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-13 10:26         ` Tetsuo Handa
  0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-13 10:26 UTC (permalink / raw)
  To: Christian König, David Rientjes, Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	LKML, linux-mm

On 2017/12/13 18:34, Christian KA?nig wrote:
> Am 12.12.2017 um 22:28 schrieb David Rientjes:
>> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>>
>>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>>               return ERR_PTR(-ENOMEM);
>>>>           STAT(gms_alloc);
>>>>           spin_lock_init(&gms->ms_asid_lock);
>>>> +        gms->ms_notifier.flags = 0;
>>>>           gms->ms_notifier.ops = &gru_mmuops;
>>>>           atomic_set(&gms->ms_refcnt, 1);
>>>>           init_waitqueue_head(&gms->ms_wait_queue);
>>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>>> There is a kzalloc() just above this:
>>>     gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>>
>>> Is that not sufficient to clear the 'flags' field?
>>>
>> Absolutely, but whether it is better to explicitly document that the mmu
>> notifier has cleared flags, i.e. there are no blockable callbacks, is
>> another story.  I can change it if preferred.
> 
> Actually I would invert the new flag, in other words specify that an MMU notifier will never sleep.
> 
> The first reason is that we have 8 blocking notifiers and 5 not blocking if I counted right. So it is actually more common to sleep than not to.
> 
> The second reason is to be conservative and assume the worst, e.g. that the flag is forgotten when a new notifier is added.

I agree. Some out of tree module might forget to set the flags.

Although you don't need to fix out of tree modules, as a troubleshooting
staff at a support center, I want to be able to identify the careless module.

I guess specifying the flags at register function would be the best, for
an attempt to call register function without knowing this change will
simply results in a build failure.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-13 10:26         ` Tetsuo Handa
@ 2017-12-13 10:37           ` Paolo Bonzini
  -1 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-13 10:37 UTC (permalink / raw)
  To: Tetsuo Handa, Christian König, David Rientjes, Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse,
	Radim Krčmář,
	LKML, linux-mm

On 13/12/2017 11:26, Tetsuo Handa wrote:
> On 2017/12/13 18:34, Christian König wrote:
>> Am 12.12.2017 um 22:28 schrieb David Rientjes:
>>> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>>>
>>>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>>>               return ERR_PTR(-ENOMEM);
>>>>>           STAT(gms_alloc);
>>>>>           spin_lock_init(&gms->ms_asid_lock);
>>>>> +        gms->ms_notifier.flags = 0;
>>>>>           gms->ms_notifier.ops = &gru_mmuops;
>>>>>           atomic_set(&gms->ms_refcnt, 1);
>>>>>           init_waitqueue_head(&gms->ms_wait_queue);
>>>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>>>> There is a kzalloc() just above this:
>>>>     gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>>>
>>>> Is that not sufficient to clear the 'flags' field?
>>>>
>>> Absolutely, but whether it is better to explicitly document that the mmu
>>> notifier has cleared flags, i.e. there are no blockable callbacks, is
>>> another story.  I can change it if preferred.
>>
>> Actually I would invert the new flag, in other words specify that an MMU notifier will never sleep.
>>
>> The first reason is that we have 8 blocking notifiers and 5 not blocking if I counted right. So it is actually more common to sleep than not to.
>>
>> The second reason is to be conservative and assume the worst, e.g. that the flag is forgotten when a new notifier is added.
> 
> I agree. Some out of tree module might forget to set the flags.
> 
> Although you don't need to fix out of tree modules, as a troubleshooting
> staff at a support center, I want to be able to identify the careless module.
> 
> I guess specifying the flags at register function would be the best, for
> an attempt to call register function without knowing this change will
> simply results in a build failure.

Specifying them in the ops would have the same effect and it would be
even better, as you don't have to split the information across two places.

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-13 10:37           ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-13 10:37 UTC (permalink / raw)
  To: Tetsuo Handa, Christian König, David Rientjes, Dimitri Sivanich
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse,
	Radim Krčmář,
	LKML, linux-mm

On 13/12/2017 11:26, Tetsuo Handa wrote:
> On 2017/12/13 18:34, Christian KA?nig wrote:
>> Am 12.12.2017 um 22:28 schrieb David Rientjes:
>>> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>>>
>>>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>>>               return ERR_PTR(-ENOMEM);
>>>>>           STAT(gms_alloc);
>>>>>           spin_lock_init(&gms->ms_asid_lock);
>>>>> +        gms->ms_notifier.flags = 0;
>>>>>           gms->ms_notifier.ops = &gru_mmuops;
>>>>>           atomic_set(&gms->ms_refcnt, 1);
>>>>>           init_waitqueue_head(&gms->ms_wait_queue);
>>>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>>>> There is a kzalloc() just above this:
>>>>     gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>>>
>>>> Is that not sufficient to clear the 'flags' field?
>>>>
>>> Absolutely, but whether it is better to explicitly document that the mmu
>>> notifier has cleared flags, i.e. there are no blockable callbacks, is
>>> another story.  I can change it if preferred.
>>
>> Actually I would invert the new flag, in other words specify that an MMU notifier will never sleep.
>>
>> The first reason is that we have 8 blocking notifiers and 5 not blocking if I counted right. So it is actually more common to sleep than not to.
>>
>> The second reason is to be conservative and assume the worst, e.g. that the flag is forgotten when a new notifier is added.
> 
> I agree. Some out of tree module might forget to set the flags.
> 
> Although you don't need to fix out of tree modules, as a troubleshooting
> staff at a support center, I want to be able to identify the careless module.
> 
> I guess specifying the flags at register function would be the best, for
> an attempt to call register function without knowing this change will
> simply results in a build failure.

Specifying them in the ops would have the same effect and it would be
even better, as you don't have to split the information across two places.

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-13  9:34       ` Christian König
@ 2017-12-14  9:19         ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14  9:19 UTC (permalink / raw)
  To: Christian König
  Cc: Dimitri Sivanich, Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1178 bytes --]

On Wed, 13 Dec 2017, Christian König wrote:

> > > > --- a/drivers/misc/sgi-gru/grutlbpurge.c
> > > > +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> > > > @@ -298,6 +298,7 @@ struct gru_mm_struct
> > > > *gru_register_mmu_notifier(void)
> > > >   			return ERR_PTR(-ENOMEM);
> > > >   		STAT(gms_alloc);
> > > >   		spin_lock_init(&gms->ms_asid_lock);
> > > > +		gms->ms_notifier.flags = 0;
> > > >   		gms->ms_notifier.ops = &gru_mmuops;
> > > >   		atomic_set(&gms->ms_refcnt, 1);
> > > >   		init_waitqueue_head(&gms->ms_wait_queue);
> > > > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
> > > There is a kzalloc() just above this:
> > > 	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
> > > 
> > > Is that not sufficient to clear the 'flags' field?
> > > 
> > Absolutely, but whether it is better to explicitly document that the mmu
> > notifier has cleared flags, i.e. there are no blockable callbacks, is
> > another story.  I can change it if preferred.
> 
> Actually I would invert the new flag, in other words specify that an MMU
> notifier will never sleep.
> 

Very good idea, I'll do that.  I'll also move the flags member to ops as 
Paolo suggested.

Thanks both!

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-14  9:19         ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14  9:19 UTC (permalink / raw)
  To: Christian König
  Cc: Dimitri Sivanich, Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, David Airlie, Joerg Roedel, Doug Ledford,
	Jani Nikula, Mike Marciniszyn, Sean Hefty, Dimitri Sivanich,
	Boris Ostrovsky, Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1178 bytes --]

On Wed, 13 Dec 2017, Christian KA?nig wrote:

> > > > --- a/drivers/misc/sgi-gru/grutlbpurge.c
> > > > +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> > > > @@ -298,6 +298,7 @@ struct gru_mm_struct
> > > > *gru_register_mmu_notifier(void)
> > > >   			return ERR_PTR(-ENOMEM);
> > > >   		STAT(gms_alloc);
> > > >   		spin_lock_init(&gms->ms_asid_lock);
> > > > +		gms->ms_notifier.flags = 0;
> > > >   		gms->ms_notifier.ops = &gru_mmuops;
> > > >   		atomic_set(&gms->ms_refcnt, 1);
> > > >   		init_waitqueue_head(&gms->ms_wait_queue);
> > > > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
> > > There is a kzalloc() just above this:
> > > 	gms = kzalloc(sizeof(*gms), GFP_KERNEL);
> > > 
> > > Is that not sufficient to clear the 'flags' field?
> > > 
> > Absolutely, but whether it is better to explicitly document that the mmu
> > notifier has cleared flags, i.e. there are no blockable callbacks, is
> > another story.  I can change it if preferred.
> 
> Actually I would invert the new flag, in other words specify that an MMU
> notifier will never sleep.
> 

Very good idea, I'll do that.  I'll also move the flags member to ops as 
Paolo suggested.

Thanks both!

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-11 22:11 ` David Rientjes
                   ` (3 preceding siblings ...)
  (?)
@ 2017-12-14 12:46 ` kbuild test robot
  -1 siblings, 0 replies; 53+ messages in thread
From: kbuild test robot @ 2017-12-14 12:46 UTC (permalink / raw)
  To: David Rientjes
  Cc: kbuild-all, Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 5272 bytes --]

Hi David,

I love your patch! Yet something to improve:

[auto build test ERROR on drm/drm-next]
[also build test ERROR on v4.15-rc3 next-20171214]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/David-Rientjes/mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks/20171214-173044
base:   git://people.freedesktop.org/~airlied/linux.git drm-next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/powernv/npu-dma.c: In function 'pnv_npu2_init_context':
>> arch/powerpc/platforms/powernv/npu-dma.c:713:3: error: 'npu_content' undeclared (first use in this function); did you mean 'npu_context'?
      npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
      ^~~~~~~~~~~
      npu_context
   arch/powerpc/platforms/powernv/npu-dma.c:713:3: note: each undeclared identifier is reported only once for each function it appears in

vim +713 arch/powerpc/platforms/powernv/npu-dma.c

   639	
   640	/*
   641	 * Call into OPAL to setup the nmmu context for the current task in
   642	 * the NPU. This must be called to setup the context tables before the
   643	 * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
   644	 *
   645	 * A release callback should be registered to allow a device driver to
   646	 * be notified that it should not launch any new translation requests
   647	 * as the final TLB invalidate is about to occur.
   648	 *
   649	 * Returns an error if there no contexts are currently available or a
   650	 * npu_context which should be passed to pnv_npu2_handle_fault().
   651	 *
   652	 * mmap_sem must be held in write mode.
   653	 */
   654	struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
   655				unsigned long flags,
   656				struct npu_context *(*cb)(struct npu_context *, void *),
   657				void *priv)
   658	{
   659		int rc;
   660		u32 nvlink_index;
   661		struct device_node *nvlink_dn;
   662		struct mm_struct *mm = current->mm;
   663		struct pnv_phb *nphb;
   664		struct npu *npu;
   665		struct npu_context *npu_context;
   666	
   667		/*
   668		 * At present we don't support GPUs connected to multiple NPUs and I'm
   669		 * not sure the hardware does either.
   670		 */
   671		struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
   672	
   673		if (!firmware_has_feature(FW_FEATURE_OPAL))
   674			return ERR_PTR(-ENODEV);
   675	
   676		if (!npdev)
   677			/* No nvlink associated with this GPU device */
   678			return ERR_PTR(-ENODEV);
   679	
   680		if (!mm || mm->context.id == 0) {
   681			/*
   682			 * Kernel thread contexts are not supported and context id 0 is
   683			 * reserved on the GPU.
   684			 */
   685			return ERR_PTR(-EINVAL);
   686		}
   687	
   688		nphb = pci_bus_to_host(npdev->bus)->private_data;
   689		npu = &nphb->npu;
   690	
   691		/*
   692		 * Setup the NPU context table for a particular GPU. These need to be
   693		 * per-GPU as we need the tables to filter ATSDs when there are no
   694		 * active contexts on a particular GPU.
   695		 */
   696		rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
   697					PCI_DEVID(gpdev->bus->number, gpdev->devfn));
   698		if (rc < 0)
   699			return ERR_PTR(-ENOSPC);
   700	
   701		/*
   702		 * We store the npu pci device so we can more easily get at the
   703		 * associated npus.
   704		 */
   705		npu_context = mm->context.npu_context;
   706		if (!npu_context) {
   707			npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
   708			if (!npu_context)
   709				return ERR_PTR(-ENOMEM);
   710	
   711			mm->context.npu_context = npu_context;
   712			npu_context->mm = mm;
 > 713			npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
   714			npu_context->mn.ops = &nv_nmmu_notifier_ops;
   715			__mmu_notifier_register(&npu_context->mn, mm);
   716			kref_init(&npu_context->kref);
   717		} else {
   718			kref_get(&npu_context->kref);
   719		}
   720	
   721		npu_context->release_cb = cb;
   722		npu_context->priv = priv;
   723		nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
   724		if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
   725								&nvlink_index)))
   726			return ERR_PTR(-ENODEV);
   727		npu_context->npdev[npu->index][nvlink_index] = npdev;
   728	
   729		if (!nphb->npu.nmmu_flush) {
   730			/*
   731			 * If we're not explicitly flushing ourselves we need to mark
   732			 * the thread for global flushes
   733			 */
   734			npu_context->nmmu_flush = false;
   735			mm_context_add_copro(mm);
   736		} else
   737			npu_context->nmmu_flush = true;
   738	
   739		return npu_context;
   740	}
   741	EXPORT_SYMBOL(pnv_npu2_init_context);
   742	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 24072 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-11 22:11 ` David Rientjes
@ 2017-12-14 21:30   ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory with the
oom reaper when the oom victim mm had mmu notifiers registered.

The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.

That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
to mmu notifier ops that can set a bit to indicate that these callbacks do
not block.

The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v2:
   - specifically exclude mmu_notifiers without invalidate callbacks
   - move flags to mmu_notifier_ops per Paolo
   - reverse flag from blockable -> not blockable per Christian

 drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
 drivers/iommu/amd_iommu_v2.c        |  1 +
 drivers/iommu/intel-svm.c           |  1 +
 drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
 include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
 mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c                 |  1 +
 7 files changed, 57 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
 static void handle_remove(struct work_struct *work);
 
 static const struct mmu_notifier_ops mn_opts = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start = mmu_notifier_range_start,
 };
 
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops iommu_mn = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.release		= mn_release,
 	.clear_flush_young      = mn_clear_flush_young,
 	.invalidate_range       = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops intel_mmuops = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.release = intel_mm_release,
 	.change_pte = intel_change_pte,
 	.invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 
 static const struct mmu_notifier_ops gru_mmuops = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start	= gru_invalidate_range_start,
 	.invalidate_range_end	= gru_invalidate_range_end,
 	.release		= gru_release,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/* mmu_notifier_ops flags */
+#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -26,6 +29,15 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+	/*
+	 * Flags to specify behavior of callbacks for this MMU notifier.
+	 * Used to determine which context an operation may be called.
+	 *
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
+	 *				  block
+	 */
+	int flags;
+
 	/*
 	 * Called either by mmu_notifier_unregister or when the mm is
 	 * being destroyed by exit_mmap, always before all pages are
@@ -137,6 +149,9 @@ struct mmu_notifier_ops {
 	 * page. Pages will no longer be referenced by the linux
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
+	 *
+	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
+	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	int id;
+	int ret = 0;
+
+	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+	if (!mm_has_notifiers(mm))
+		return ret;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (!mn->ops->invalidate_range &&
+		    !mn->ops->invalidate_range_start &&
+		    !mn->ops->invalidate_range_end)
+				continue;
+
+		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+			ret = 1;
+			break;
+		}
+	}
+	srcu_read_unlock(&srcu, id);
+	return ret;
+}
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-14 21:30   ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory with the
oom reaper when the oom victim mm had mmu notifiers registered.

The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.

That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
to mmu notifier ops that can set a bit to indicate that these callbacks do
not block.

The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v2:
   - specifically exclude mmu_notifiers without invalidate callbacks
   - move flags to mmu_notifier_ops per Paolo
   - reverse flag from blockable -> not blockable per Christian

 drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
 drivers/iommu/amd_iommu_v2.c        |  1 +
 drivers/iommu/intel-svm.c           |  1 +
 drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
 include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
 mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c                 |  1 +
 7 files changed, 57 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
 static void handle_remove(struct work_struct *work);
 
 static const struct mmu_notifier_ops mn_opts = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start = mmu_notifier_range_start,
 };
 
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops iommu_mn = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.release		= mn_release,
 	.clear_flush_young      = mn_clear_flush_young,
 	.invalidate_range       = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops intel_mmuops = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.release = intel_mm_release,
 	.change_pte = intel_change_pte,
 	.invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 
 static const struct mmu_notifier_ops gru_mmuops = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start	= gru_invalidate_range_start,
 	.invalidate_range_end	= gru_invalidate_range_end,
 	.release		= gru_release,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/* mmu_notifier_ops flags */
+#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -26,6 +29,15 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+	/*
+	 * Flags to specify behavior of callbacks for this MMU notifier.
+	 * Used to determine which context an operation may be called.
+	 *
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
+	 *				  block
+	 */
+	int flags;
+
 	/*
 	 * Called either by mmu_notifier_unregister or when the mm is
 	 * being destroyed by exit_mmap, always before all pages are
@@ -137,6 +149,9 @@ struct mmu_notifier_ops {
 	 * page. Pages will no longer be referenced by the linux
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
+	 *
+	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
+	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	int id;
+	int ret = 0;
+
+	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+	if (!mm_has_notifiers(mm))
+		return ret;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (!mn->ops->invalidate_range &&
+		    !mn->ops->invalidate_range_start &&
+		    !mn->ops->invalidate_range_end)
+				continue;
+
+		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+			ret = 1;
+			break;
+		}
+	}
+	srcu_read_unlock(&srcu, id);
+	return ret;
+}
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
  2017-12-14 21:30   ` David Rientjes
@ 2017-12-14 21:31     ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
callbacks are used around unmap_page_range().

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	}
 
 	/*
-	 * If the mm has notifiers then we would need to invalidate them around
-	 * unmap_page_range and that is risky because notifiers can sleep and
-	 * what they do is basically undeterministic.  So let's have a short
+	 * If the mm has invalidate_{start,end}() notifiers that could block,
 	 * sleep to give the oom victim some more time.
 	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time and add
-	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 * notifiers cannot block for unbounded amount of time
 	 */
-	if (mm_has_notifiers(mm)) {
+	if (mm_has_blockable_invalidate_notifiers(mm)) {
 		up_read(&mm->mmap_sem);
 		schedule_timeout_idle(HZ);
 		goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
-			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-					 NULL);
-			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-14 21:31     ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-14 21:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
callbacks are used around unmap_page_range().

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	}
 
 	/*
-	 * If the mm has notifiers then we would need to invalidate them around
-	 * unmap_page_range and that is risky because notifiers can sleep and
-	 * what they do is basically undeterministic.  So let's have a short
+	 * If the mm has invalidate_{start,end}() notifiers that could block,
 	 * sleep to give the oom victim some more time.
 	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time and add
-	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 * notifiers cannot block for unbounded amount of time
 	 */
-	if (mm_has_notifiers(mm)) {
+	if (mm_has_blockable_invalidate_notifiers(mm)) {
 		up_read(&mm->mmap_sem);
 		schedule_timeout_idle(HZ);
 		goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
-			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-					 NULL);
-			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-14 21:30   ` David Rientjes
@ 2017-12-15  8:42     ` Paolo Bonzini
  -1 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-15  8:42 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Radim Krčmář,
	linux-kernel, linux-mm

On 14/12/2017 22:30, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>  static void handle_remove(struct work_struct *work);
>  
>  static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = mmu_notifier_range_start,
>  };
>  
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops iommu_mn = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release		= mn_release,
>  	.clear_flush_young      = mn_clear_flush_young,
>  	.invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release = intel_mm_release,
>  	.change_pte = intel_change_pte,
>  	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  
>  
>  static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= gru_invalidate_range_start,
>  	.invalidate_range_end	= gru_invalidate_range_end,
>  	.release		= gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>  	 * page. Pages will no longer be referenced by the linux
>  	 * address space but may still be referenced by sptes until
>  	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>  	 */
>  	void (*invalidate_range_start)(struct mmu_notifier *mn,
>  				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  {
>  }
>  
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>  {
>  }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>  
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +				continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>  static int do_mmu_notifier_register(struct mmu_notifier *mn,
>  				    struct mm_struct *mm,
>  				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
> 

Acked-by: Paolo Bonzini <pbonzini@redhat.com>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15  8:42     ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2017-12-15  8:42 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Radim Krčmář,
	linux-kernel, linux-mm

On 14/12/2017 22:30, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>  static void handle_remove(struct work_struct *work);
>  
>  static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = mmu_notifier_range_start,
>  };
>  
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops iommu_mn = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release		= mn_release,
>  	.clear_flush_young      = mn_clear_flush_young,
>  	.invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release = intel_mm_release,
>  	.change_pte = intel_change_pte,
>  	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  
>  
>  static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= gru_invalidate_range_start,
>  	.invalidate_range_end	= gru_invalidate_range_end,
>  	.release		= gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>  	 * page. Pages will no longer be referenced by the linux
>  	 * address space but may still be referenced by sptes until
>  	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>  	 */
>  	void (*invalidate_range_start)(struct mmu_notifier *mn,
>  				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  {
>  }
>  
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>  {
>  }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>  
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +				continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>  static int do_mmu_notifier_register(struct mmu_notifier *mn,
>  				    struct mm_struct *mm,
>  				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
> 

Acked-by: Paolo Bonzini <pbonzini@redhat.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-14 21:30   ` David Rientjes
@ 2017-12-15 12:19     ` Christian König
  -1 siblings, 0 replies; 53+ messages in thread
From: Christian König @ 2017-12-15 12:19 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, David Airlie,
	Joerg Roedel, Doug Ledford, Jani Nikula, Mike Marciniszyn,
	Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Am 14.12.2017 um 22:30 schrieb David Rientjes:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Christian König <christian.koenig@amd.com>

> ---
>   v2:
>     - specifically exclude mmu_notifiers without invalidate callbacks
>     - move flags to mmu_notifier_ops per Paolo
>     - reverse flag from blockable -> not blockable per Christian
>
>   drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>   drivers/iommu/amd_iommu_v2.c        |  1 +
>   drivers/iommu/intel-svm.c           |  1 +
>   drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>   include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>   mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>   virt/kvm/kvm_main.c                 |  1 +
>   7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>   static void handle_remove(struct work_struct *work);
>   
>   static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.invalidate_range_start = mmu_notifier_range_start,
>   };
>   
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>   }
>   
>   static const struct mmu_notifier_ops iommu_mn = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.release		= mn_release,
>   	.clear_flush_young      = mn_clear_flush_young,
>   	.invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>   }
>   
>   static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.release = intel_mm_release,
>   	.change_pte = intel_change_pte,
>   	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>   
>   
>   static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.invalidate_range_start	= gru_invalidate_range_start,
>   	.invalidate_range_end	= gru_invalidate_range_end,
>   	.release		= gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>   struct mmu_notifier;
>   struct mmu_notifier_ops;
>   
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>   #ifdef CONFIG_MMU_NOTIFIER
>   
>   /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>   };
>   
>   struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;
> +
>   	/*
>   	 * Called either by mmu_notifier_unregister or when the mm is
>   	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>   	 * page. Pages will no longer be referenced by the linux
>   	 * address space but may still be referenced by sptes until
>   	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>   	 */
>   	void (*invalidate_range_start)(struct mmu_notifier *mn,
>   				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>   				  bool only_end);
>   extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>   				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>   
>   static inline void mmu_notifier_release(struct mm_struct *mm)
>   {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>   {
>   }
>   
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>   static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>   {
>   }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>   }
>   EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>   
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +				continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>   static int do_mmu_notifier_register(struct mmu_notifier *mn,
>   				    struct mm_struct *mm,
>   				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>   }
>   
>   static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>   	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>   	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 12:19     ` Christian König
  0 siblings, 0 replies; 53+ messages in thread
From: Christian König @ 2017-12-15 12:19 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, David Airlie,
	Joerg Roedel, Doug Ledford, Jani Nikula, Mike Marciniszyn,
	Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Am 14.12.2017 um 22:30 schrieb David Rientjes:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Christian KA?nig <christian.koenig@amd.com>

> ---
>   v2:
>     - specifically exclude mmu_notifiers without invalidate callbacks
>     - move flags to mmu_notifier_ops per Paolo
>     - reverse flag from blockable -> not blockable per Christian
>
>   drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>   drivers/iommu/amd_iommu_v2.c        |  1 +
>   drivers/iommu/intel-svm.c           |  1 +
>   drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>   include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>   mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>   virt/kvm/kvm_main.c                 |  1 +
>   7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>   static void handle_remove(struct work_struct *work);
>   
>   static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.invalidate_range_start = mmu_notifier_range_start,
>   };
>   
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>   }
>   
>   static const struct mmu_notifier_ops iommu_mn = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.release		= mn_release,
>   	.clear_flush_young      = mn_clear_flush_young,
>   	.invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>   }
>   
>   static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.release = intel_mm_release,
>   	.change_pte = intel_change_pte,
>   	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>   
>   
>   static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.invalidate_range_start	= gru_invalidate_range_start,
>   	.invalidate_range_end	= gru_invalidate_range_end,
>   	.release		= gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>   struct mmu_notifier;
>   struct mmu_notifier_ops;
>   
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>   #ifdef CONFIG_MMU_NOTIFIER
>   
>   /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>   };
>   
>   struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;
> +
>   	/*
>   	 * Called either by mmu_notifier_unregister or when the mm is
>   	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>   	 * page. Pages will no longer be referenced by the linux
>   	 * address space but may still be referenced by sptes until
>   	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>   	 */
>   	void (*invalidate_range_start)(struct mmu_notifier *mn,
>   				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>   				  bool only_end);
>   extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>   				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>   
>   static inline void mmu_notifier_release(struct mm_struct *mm)
>   {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>   {
>   }
>   
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>   static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>   {
>   }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>   }
>   EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>   
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +				continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>   static int do_mmu_notifier_register(struct mmu_notifier *mn,
>   				    struct mm_struct *mm,
>   				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>   }
>   
>   static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>   	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>   	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>   	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-14 21:30   ` David Rientjes
@ 2017-12-15 13:36     ` Dimitri Sivanich
  -1 siblings, 0 replies; 53+ messages in thread
From: Dimitri Sivanich @ 2017-12-15 13:36 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Dimitri Sivanich <sivanich@hpe.com>

> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>  static void handle_remove(struct work_struct *work);
>  
>  static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = mmu_notifier_range_start,
>  };
>  
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops iommu_mn = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release		= mn_release,
>  	.clear_flush_young      = mn_clear_flush_young,
>  	.invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release = intel_mm_release,
>  	.change_pte = intel_change_pte,
>  	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  
>  
>  static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= gru_invalidate_range_start,
>  	.invalidate_range_end	= gru_invalidate_range_end,
>  	.release		= gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>  	 * page. Pages will no longer be referenced by the linux
>  	 * address space but may still be referenced by sptes until
>  	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>  	 */
>  	void (*invalidate_range_start)(struct mmu_notifier *mn,
>  				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  {
>  }
>  
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>  {
>  }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>  
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +				continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>  static int do_mmu_notifier_register(struct mmu_notifier *mn,
>  				    struct mm_struct *mm,
>  				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 13:36     ` Dimitri Sivanich
  0 siblings, 0 replies; 53+ messages in thread
From: Dimitri Sivanich @ 2017-12-15 13:36 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Michal Hocko, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Dimitri Sivanich <sivanich@hpe.com>

> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>  static void handle_remove(struct work_struct *work);
>  
>  static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = mmu_notifier_range_start,
>  };
>  
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops iommu_mn = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release		= mn_release,
>  	.clear_flush_young      = mn_clear_flush_young,
>  	.invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release = intel_mm_release,
>  	.change_pte = intel_change_pte,
>  	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  
>  
>  static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= gru_invalidate_range_start,
>  	.invalidate_range_end	= gru_invalidate_range_end,
>  	.release		= gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>  	 * page. Pages will no longer be referenced by the linux
>  	 * address space but may still be referenced by sptes until
>  	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>  	 */
>  	void (*invalidate_range_start)(struct mmu_notifier *mn,
>  				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  {
>  }
>  
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>  {
>  }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>  
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +				continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>  static int do_mmu_notifier_register(struct mmu_notifier *mn,
>  				    struct mm_struct *mm,
>  				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-14 21:30   ` David Rientjes
@ 2017-12-15 16:25     ` Michal Hocko
  -1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:25 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Thu 14-12-17 13:30:56, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Yes, this make sense. I haven't checked all the existing mmu notifiers
but those that you have marked seem to be OK.

I just think that the semantic of the flag should be describe more. See
below

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
[...]
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;

This should be more specific IMHO. What do you think about the following
wording?

invalidate_{start,end,range} doesn't block on any locks which depend
directly or indirectly (via lock chain or resources e.g. worker context)
on a memory allocation.

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 16:25     ` Michal Hocko
  0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:25 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Thu 14-12-17 13:30:56, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Yes, this make sense. I haven't checked all the existing mmu notifiers
but those that you have marked seem to be OK.

I just think that the semantic of the flag should be describe more. See
below

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
[...]
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *				  block
> +	 */
> +	int flags;

This should be more specific IMHO. What do you think about the following
wording?

invalidate_{start,end,range} doesn't block on any locks which depend
directly or indirectly (via lock chain or resources e.g. worker context)
on a memory allocation.

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
  2017-12-14 21:31     ` David Rientjes
@ 2017-12-15 16:35       ` Michal Hocko
  -1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Thu 14-12-17 13:31:00, David Rientjes wrote:
> This uses the new annotation to determine if an mm has mmu notifiers with
> blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
> callbacks are used around unmap_page_range().

Do you have any example where this helped? KVM guest oom killed I guess?

> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  mm/oom_kill.c | 21 +++++++++++----------
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  	}
>  
>  	/*
> -	 * If the mm has notifiers then we would need to invalidate them around
> -	 * unmap_page_range and that is risky because notifiers can sleep and
> -	 * what they do is basically undeterministic.  So let's have a short
> +	 * If the mm has invalidate_{start,end}() notifiers that could block,
>  	 * sleep to give the oom victim some more time.
>  	 * TODO: we really want to get rid of this ugly hack and make sure that
> -	 * notifiers cannot block for unbounded amount of time and add
> -	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
> +	 * notifiers cannot block for unbounded amount of time
>  	 */
> -	if (mm_has_notifiers(mm)) {
> +	if (mm_has_blockable_invalidate_notifiers(mm)) {
>  		up_read(&mm->mmap_sem);
>  		schedule_timeout_idle(HZ);
>  		goto unlock_oom;
> @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  		 * count elevated without a good reason.
>  		 */
>  		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
> -			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
> -			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
> -					 NULL);
> -			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
> +			const unsigned long start = vma->vm_start;
> +			const unsigned long end = vma->vm_end;
> +
> +			tlb_gather_mmu(&tlb, mm, start, end);
> +			mmu_notifier_invalidate_range_start(mm, start, end);
> +			unmap_page_range(&tlb, vma, start, end, NULL);
> +			mmu_notifier_invalidate_range_end(mm, start, end);
> +			tlb_finish_mmu(&tlb, start, end);
>  		}
>  	}
>  	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-15 16:35       ` Michal Hocko
  0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-15 16:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Thu 14-12-17 13:31:00, David Rientjes wrote:
> This uses the new annotation to determine if an mm has mmu notifiers with
> blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
> callbacks are used around unmap_page_range().

Do you have any example where this helped? KVM guest oom killed I guess?

> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  mm/oom_kill.c | 21 +++++++++++----------
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  	}
>  
>  	/*
> -	 * If the mm has notifiers then we would need to invalidate them around
> -	 * unmap_page_range and that is risky because notifiers can sleep and
> -	 * what they do is basically undeterministic.  So let's have a short
> +	 * If the mm has invalidate_{start,end}() notifiers that could block,
>  	 * sleep to give the oom victim some more time.
>  	 * TODO: we really want to get rid of this ugly hack and make sure that
> -	 * notifiers cannot block for unbounded amount of time and add
> -	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
> +	 * notifiers cannot block for unbounded amount of time
>  	 */
> -	if (mm_has_notifiers(mm)) {
> +	if (mm_has_blockable_invalidate_notifiers(mm)) {
>  		up_read(&mm->mmap_sem);
>  		schedule_timeout_idle(HZ);
>  		goto unlock_oom;
> @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  		 * count elevated without a good reason.
>  		 */
>  		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
> -			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
> -			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
> -					 NULL);
> -			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
> +			const unsigned long start = vma->vm_start;
> +			const unsigned long end = vma->vm_end;
> +
> +			tlb_gather_mmu(&tlb, mm, start, end);
> +			mmu_notifier_invalidate_range_start(mm, start, end);
> +			unmap_page_range(&tlb, vma, start, end, NULL);
> +			mmu_notifier_invalidate_range_end(mm, start, end);
> +			tlb_finish_mmu(&tlb, start, end);
>  		}
>  	}
>  	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
  2017-12-15 16:35       ` Michal Hocko
@ 2017-12-15 21:46         ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-15 21:46 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Fri, 15 Dec 2017, Michal Hocko wrote:

> > This uses the new annotation to determine if an mm has mmu notifiers with
> > blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
> > callbacks are used around unmap_page_range().
> 
> Do you have any example where this helped? KVM guest oom killed I guess?
> 

KVM is the most significant one that we are interested in, but haven't had 
sufficient time to quantify how much of an impact this has other than to 
say it will certainly be non-zero.

The motivation was more to exclude mmu notifiers that have a reason to be 
excluded rather than a blanket exemption to make the oom reaper more 
robust.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 2/2] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks
@ 2017-12-15 21:46         ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2017-12-15 21:46 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Fri, 15 Dec 2017, Michal Hocko wrote:

> > This uses the new annotation to determine if an mm has mmu notifiers with
> > blockable invalidate range callbacks to avoid oom reaping.  Otherwise, the
> > callbacks are used around unmap_page_range().
> 
> Do you have any example where this helped? KVM guest oom killed I guess?
> 

KVM is the most significant one that we are interested in, but haven't had 
sufficient time to quantify how much of an impact this has other than to 
say it will certainly be non-zero.

The motivation was more to exclude mmu notifiers that have a reason to be 
excluded rather than a blanket exemption to make the oom reaper more 
robust.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-14 21:30   ` David Rientjes
@ 2017-12-15 23:04     ` Andrew Morton
  -1 siblings, 0 replies; 53+ messages in thread
From: Andrew Morton @ 2017-12-15 23:04 UTC (permalink / raw)
  To: David Rientjes
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:

> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.

some tweakage, please review.

From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix

make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: David Rientjes <rientjes@google.com>
Cc: Dimitri Sivanich <sivanich@hpe.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Oded Gabbay <oded.gabbay@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/mmu_notifier.h |    7 ++++---
 mm/mmu_notifier.c            |    8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/include/linux/mmu_notifier.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMU_NOTIFIER_H
 #define _LINUX_MMU_NOTIFIER_H
 
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
@@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
-extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
 {
 }
 
-static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
 {
-	return 0;
+	return false;
 }
 
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
--- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/mm/mmu_notifier.c
@@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
  * Must be called while holding mm->mmap_sem for either read or write.
  * The result is guaranteed to be valid until mm->mmap_sem is dropped.
  */
-int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
 	int id;
-	int ret = 0;
+	bool ret = false;
 
-	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
 
 	if (!mm_has_notifiers(mm))
 		return ret;
@@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
 				continue;
 
 		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
-			ret = 1;
+			ret = true;
 			break;
 		}
 	}
_

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-15 23:04     ` Andrew Morton
  0 siblings, 0 replies; 53+ messages in thread
From: Andrew Morton @ 2017-12-15 23:04 UTC (permalink / raw)
  To: David Rientjes
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:

> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.

some tweakage, please review.

From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix

make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian KA?nig <christian.koenig@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: David Rientjes <rientjes@google.com>
Cc: Dimitri Sivanich <sivanich@hpe.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: JA(C)rA'me Glisse <jglisse@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Oded Gabbay <oded.gabbay@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Radim KrA?mA!A? <rkrcmar@redhat.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/mmu_notifier.h |    7 ++++---
 mm/mmu_notifier.c            |    8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/include/linux/mmu_notifier.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMU_NOTIFIER_H
 #define _LINUX_MMU_NOTIFIER_H
 
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
@@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
-extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
 {
 }
 
-static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
 {
-	return 0;
+	return false;
 }
 
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
--- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/mm/mmu_notifier.c
@@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
  * Must be called while holding mm->mmap_sem for either read or write.
  * The result is guaranteed to be valid until mm->mmap_sem is dropped.
  */
-int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
 	int id;
-	int ret = 0;
+	bool ret = false;
 
-	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
 
 	if (!mm_has_notifiers(mm))
 		return ret;
@@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
 				continue;
 
 		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
-			ret = 1;
+			ret = true;
 			break;
 		}
 	}
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-15 16:25     ` Michal Hocko
@ 2017-12-16  6:21       ` Tetsuo Handa
  -1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16  6:21 UTC (permalink / raw)
  To: Michal Hocko, David Rientjes
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On 2017/12/16 1:25, Michal Hocko wrote:
>>  struct mmu_notifier_ops {
>> +	/*
>> +	 * Flags to specify behavior of callbacks for this MMU notifier.
>> +	 * Used to determine which context an operation may be called.
>> +	 *
>> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
>> +	 *				  block
>> +	 */
>> +	int flags;
> 
> This should be more specific IMHO. What do you think about the following
> wording?
> 
> invalidate_{start,end,range} doesn't block on any locks which depend
> directly or indirectly (via lock chain or resources e.g. worker context)
> on a memory allocation.

I disagree. It needlessly complicates validating the correctness.

What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
schedule_timeout_idle() will not block on any locks which depend directly or
indirectly on a memory allocation, but we are already blocking other memory
allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().

This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
by all other allocating threads" lockup problem. The OOM reaper does not want to get
blocked for so long.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16  6:21       ` Tetsuo Handa
  0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16  6:21 UTC (permalink / raw)
  To: Michal Hocko, David Rientjes
  Cc: Andrew Morton, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On 2017/12/16 1:25, Michal Hocko wrote:
>>  struct mmu_notifier_ops {
>> +	/*
>> +	 * Flags to specify behavior of callbacks for this MMU notifier.
>> +	 * Used to determine which context an operation may be called.
>> +	 *
>> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
>> +	 *				  block
>> +	 */
>> +	int flags;
> 
> This should be more specific IMHO. What do you think about the following
> wording?
> 
> invalidate_{start,end,range} doesn't block on any locks which depend
> directly or indirectly (via lock chain or resources e.g. worker context)
> on a memory allocation.

I disagree. It needlessly complicates validating the correctness.

What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
schedule_timeout_idle() will not block on any locks which depend directly or
indirectly on a memory allocation, but we are already blocking other memory
allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().

This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
by all other allocating threads" lockup problem. The OOM reaper does not want to get
blocked for so long.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-15 23:04     ` Andrew Morton
@ 2017-12-16  7:14       ` Tetsuo Handa
  -1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16  7:14 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On 2017/12/16 8:04, Andrew Morton wrote:
>> The implementation is steered toward an expensive slowpath, such as after
>> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> some tweakage, please review.
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> 
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> 

> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
>   * Must be called while holding mm->mmap_sem for either read or write.
>   * The result is guaranteed to be valid until mm->mmap_sem is dropped.
>   */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
>  {
>  	struct mmu_notifier *mn;
>  	int id;
> -	int ret = 0;
> +	bool ret = false;
>  
> -	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>  
>  	if (!mm_has_notifiers(mm))
>  		return ret;

rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
If rwsem_is_locked() returns true because somebody else has locked it, there is
no guarantee that current thread has locked it before calling this function.

down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
What if somebody else held it for read or write (the worst case is registration path),
down_write_trylock() will return false even if current thread has not locked it for
read or write.

I think this WARN_ON_ONCE() can not detect incorrect call to this function.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16  7:14       ` Tetsuo Handa
  0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16  7:14 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On 2017/12/16 8:04, Andrew Morton wrote:
>> The implementation is steered toward an expensive slowpath, such as after
>> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> some tweakage, please review.
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> 
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> 

> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
>   * Must be called while holding mm->mmap_sem for either read or write.
>   * The result is guaranteed to be valid until mm->mmap_sem is dropped.
>   */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
>  {
>  	struct mmu_notifier *mn;
>  	int id;
> -	int ret = 0;
> +	bool ret = false;
>  
> -	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>  
>  	if (!mm_has_notifiers(mm))
>  		return ret;

rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
If rwsem_is_locked() returns true because somebody else has locked it, there is
no guarantee that current thread has locked it before calling this function.

down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
What if somebody else held it for read or write (the worst case is registration path),
down_write_trylock() will return false even if current thread has not locked it for
read or write.

I think this WARN_ON_ONCE() can not detect incorrect call to this function.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-15 23:04     ` Andrew Morton
@ 2017-12-16  9:10       ` Michal Hocko
  -1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16  9:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Fri 15-12-17 15:04:29, Andrew Morton wrote:
> On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> 
> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> > 
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> > 
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> > to mmu notifier ops that can set a bit to indicate that these callbacks do
> > not block.
> > 
> > The implementation is steered toward an expensive slowpath, such as after
> > the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> some tweakage, please review.
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> 
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()

Yes, that makes sense to me.

> 
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Dimitri Sivanich <sivanich@hpe.com>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: Jani Nikula <jani.nikula@linux.intel.com>
> Cc: Jérôme Glisse <jglisse@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
> Cc: Oded Gabbay <oded.gabbay@gmail.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Sean Hefty <sean.hefty@intel.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
> 
>  include/linux/mmu_notifier.h |    7 ++++---
>  mm/mmu_notifier.c            |    8 ++++----
>  2 files changed, 8 insertions(+), 7 deletions(-)
> 
> diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/include/linux/mmu_notifier.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_MMU_NOTIFIER_H
>  #define _LINUX_MMU_NOTIFIER_H
>  
> +#include <linux/types.h>
>  #include <linux/list.h>
>  #include <linux/spinlock.h>
>  #include <linux/mm_types.h>
> @@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> -extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
> +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
>  {
>  }
>  
> -static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
>  {
> -	return 0;
> +	return false;
>  }
>  
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/mm/mmu_notifier.c
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
>   * Must be called while holding mm->mmap_sem for either read or write.
>   * The result is guaranteed to be valid until mm->mmap_sem is dropped.
>   */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
>  {
>  	struct mmu_notifier *mn;
>  	int id;
> -	int ret = 0;
> +	bool ret = false;
>  
> -	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>  
>  	if (!mm_has_notifiers(mm))
>  		return ret;
> @@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
>  				continue;
>  
>  		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> -			ret = 1;
> +			ret = true;
>  			break;
>  		}
>  	}
> _
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16  9:10       ` Michal Hocko
  0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16  9:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

On Fri 15-12-17 15:04:29, Andrew Morton wrote:
> On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> 
> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> > 
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> > 
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> > to mmu notifier ops that can set a bit to indicate that these callbacks do
> > not block.
> > 
> > The implementation is steered toward an expensive slowpath, such as after
> > the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> some tweakage, please review.
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> 
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()

Yes, that makes sense to me.

> 
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: Christian KA?nig <christian.koenig@amd.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Dimitri Sivanich <sivanich@hpe.com>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: Jani Nikula <jani.nikula@linux.intel.com>
> Cc: JA(C)rA'me Glisse <jglisse@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
> Cc: Oded Gabbay <oded.gabbay@gmail.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Radim KrA?mA!A? <rkrcmar@redhat.com>
> Cc: Sean Hefty <sean.hefty@intel.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
> 
>  include/linux/mmu_notifier.h |    7 ++++---
>  mm/mmu_notifier.c            |    8 ++++----
>  2 files changed, 8 insertions(+), 7 deletions(-)
> 
> diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/include/linux/mmu_notifier.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_MMU_NOTIFIER_H
>  #define _LINUX_MMU_NOTIFIER_H
>  
> +#include <linux/types.h>
>  #include <linux/list.h>
>  #include <linux/spinlock.h>
>  #include <linux/mm_types.h>
> @@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> -extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
> +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
>  {
>  }
>  
> -static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
>  {
> -	return 0;
> +	return false;
>  }
>  
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> +++ a/mm/mmu_notifier.c
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
>   * Must be called while holding mm->mmap_sem for either read or write.
>   * The result is guaranteed to be valid until mm->mmap_sem is dropped.
>   */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
>  {
>  	struct mmu_notifier *mn;
>  	int id;
> -	int ret = 0;
> +	bool ret = false;
>  
> -	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>  
>  	if (!mm_has_notifiers(mm))
>  		return ret;
> @@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
>  				continue;
>  
>  		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> -			ret = 1;
> +			ret = true;
>  			break;
>  		}
>  	}
> _
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-16  6:21       ` Tetsuo Handa
@ 2017-12-16 11:33         ` Michal Hocko
  -1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:33 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: David Rientjes, Andrew Morton, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Sat 16-12-17 15:21:51, Tetsuo Handa wrote:
> On 2017/12/16 1:25, Michal Hocko wrote:
> >>  struct mmu_notifier_ops {
> >> +	/*
> >> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> >> +	 * Used to determine which context an operation may be called.
> >> +	 *
> >> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> >> +	 *				  block
> >> +	 */
> >> +	int flags;
> > 
> > This should be more specific IMHO. What do you think about the following
> > wording?
> > 
> > invalidate_{start,end,range} doesn't block on any locks which depend
> > directly or indirectly (via lock chain or resources e.g. worker context)
> > on a memory allocation.
> 
> I disagree. It needlessly complicates validating the correctness.

But it makes it clear what is the actual semantic.

> What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?

Let's talk seriously about a real code. Any mmu notifier doing this is
just crazy and should be fixed.

> schedule_timeout_idle() will not block on any locks which depend directly or
> indirectly on a memory allocation, but we are already blocking other memory
> allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().

Then the reaper will block and progress would be slower.

> This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
> SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
> by all other allocating threads" lockup problem. The OOM reaper does not want to get
> blocked for so long.

Yes, it absolutely doesn't want to do that. MMu notifiers should be
reasonable because they are called from performance sensitive call
paths.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 11:33         ` Michal Hocko
  0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:33 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: David Rientjes, Andrew Morton, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Sat 16-12-17 15:21:51, Tetsuo Handa wrote:
> On 2017/12/16 1:25, Michal Hocko wrote:
> >>  struct mmu_notifier_ops {
> >> +	/*
> >> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> >> +	 * Used to determine which context an operation may be called.
> >> +	 *
> >> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> >> +	 *				  block
> >> +	 */
> >> +	int flags;
> > 
> > This should be more specific IMHO. What do you think about the following
> > wording?
> > 
> > invalidate_{start,end,range} doesn't block on any locks which depend
> > directly or indirectly (via lock chain or resources e.g. worker context)
> > on a memory allocation.
> 
> I disagree. It needlessly complicates validating the correctness.

But it makes it clear what is the actual semantic.

> What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?

Let's talk seriously about a real code. Any mmu notifier doing this is
just crazy and should be fixed.

> schedule_timeout_idle() will not block on any locks which depend directly or
> indirectly on a memory allocation, but we are already blocking other memory
> allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().

Then the reaper will block and progress would be slower.

> This is essentially same with "sleeping forever due to schedule_timeout_killable(1) by
> SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
> by all other allocating threads" lockup problem. The OOM reaper does not want to get
> blocked for so long.

Yes, it absolutely doesn't want to do that. MMu notifiers should be
reasonable because they are called from performance sensitive call
paths.

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-16  7:14       ` Tetsuo Handa
@ 2017-12-16 11:36         ` Michal Hocko
  -1 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:36 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, David Rientjes, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> On 2017/12/16 8:04, Andrew Morton wrote:
> >> The implementation is steered toward an expensive slowpath, such as after
> >> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> > 
> > some tweakage, please review.
> > 
> > From: Andrew Morton <akpm@linux-foundation.org>
> > Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> > 
> > make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> > 
> 
> > @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> >   * Must be called while holding mm->mmap_sem for either read or write.
> >   * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> >   */
> > -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> >  {
> >  	struct mmu_notifier *mn;
> >  	int id;
> > -	int ret = 0;
> > +	bool ret = false;
> >  
> > -	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> > +	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
> >  
> >  	if (!mm_has_notifiers(mm))
> >  		return ret;
> 
> rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> If rwsem_is_locked() returns true because somebody else has locked it, there is
> no guarantee that current thread has locked it before calling this function.
> 
> down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> What if somebody else held it for read or write (the worst case is registration path),
> down_write_trylock() will return false even if current thread has not locked it for
> read or write.
> 
> I think this WARN_ON_ONCE() can not detect incorrect call to this function.

Yes it cannot catch _all_ cases. This is an inherent problem of
rwsem_is_locked because semaphores do not really have the owner concept.
The core idea behind this, I guess, is to catch obviously incorrect
usage and as such it gives us a reasonabe coverage. I could live without
the annotation but rwsem_is_locked looks better than down_write_trylock
to me.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 11:36         ` Michal Hocko
  0 siblings, 0 replies; 53+ messages in thread
From: Michal Hocko @ 2017-12-16 11:36 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, David Rientjes, Andrea Arcangeli,
	Benjamin Herrenschmidt, Paul Mackerras, Oded Gabbay,
	Alex Deucher, Christian König, David Airlie, Joerg Roedel,
	Doug Ledford, Jani Nikula, Mike Marciniszyn, Sean Hefty,
	Dimitri Sivanich, Boris Ostrovsky, Jérôme Glisse,
	Paolo Bonzini, Radim Krčmář,
	linux-kernel, linux-mm

On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> On 2017/12/16 8:04, Andrew Morton wrote:
> >> The implementation is steered toward an expensive slowpath, such as after
> >> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> > 
> > some tweakage, please review.
> > 
> > From: Andrew Morton <akpm@linux-foundation.org>
> > Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> > 
> > make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> > 
> 
> > @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> >   * Must be called while holding mm->mmap_sem for either read or write.
> >   * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> >   */
> > -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> >  {
> >  	struct mmu_notifier *mn;
> >  	int id;
> > -	int ret = 0;
> > +	bool ret = false;
> >  
> > -	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> > +	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
> >  
> >  	if (!mm_has_notifiers(mm))
> >  		return ret;
> 
> rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> If rwsem_is_locked() returns true because somebody else has locked it, there is
> no guarantee that current thread has locked it before calling this function.
> 
> down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> What if somebody else held it for read or write (the worst case is registration path),
> down_write_trylock() will return false even if current thread has not locked it for
> read or write.
> 
> I think this WARN_ON_ONCE() can not detect incorrect call to this function.

Yes it cannot catch _all_ cases. This is an inherent problem of
rwsem_is_locked because semaphores do not really have the owner concept.
The core idea behind this, I guess, is to catch obviously incorrect
usage and as such it gives us a reasonabe coverage. I could live without
the annotation but rwsem_is_locked looks better than down_write_trylock
to me.

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
  2017-12-16 11:36         ` Michal Hocko
@ 2017-12-16 14:45           ` Tetsuo Handa
  -1 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 14:45 UTC (permalink / raw)
  To: mhocko
  Cc: akpm, rientjes, aarcange, benh, paulus, oded.gabbay,
	alexander.deucher, christian.koenig, airlied, joro, dledford,
	jani.nikula, mike.marciniszyn, sean.hefty, sivanich,
	boris.ostrovsky, jglisse, pbonzini, rkrcmar, linux-kernel,
	linux-mm

Michal Hocko wrote:
> On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> > rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> > If rwsem_is_locked() returns true because somebody else has locked it, there is
> > no guarantee that current thread has locked it before calling this function.
> > 
> > down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> > What if somebody else held it for read or write (the worst case is registration path),
> > down_write_trylock() will return false even if current thread has not locked it for
> > read or write.
> > 
> > I think this WARN_ON_ONCE() can not detect incorrect call to this function.
> 
> Yes it cannot catch _all_ cases. This is an inherent problem of
> rwsem_is_locked because semaphores do not really have the owner concept.
> The core idea behind this, I guess, is to catch obviously incorrect
> usage and as such it gives us a reasonabe coverage. I could live without
> the annotation but rwsem_is_locked looks better than down_write_trylock
> to me.

I agree that rwsem_is_locked() is better than down_write_trylock() because
the former does not have side effect when nobody was holding the rwsem.

Looking at how rwsem_is_locked() is used in mm/ directory for sanity checks,
only VM_BUG_ON(), VM_BUG_ON_MM(), or VM_BUG_ON_VMA() are used. Therefore,
this WARN_ON_ONCE() usage might be irregular.

Also, regarding the problem that semaphores do not really have the owner
concept, we can add "struct task_struct *owner_of_mmap_sem_for_write" to
"struct mm_struct" and replace direct down_write_killable() etc. with
corresponding wrapper functions like

  int __must_check get_mmap_sem_write_killable(struct mm_struct *mm) {
      if (down_write_killable(&mm->mmap_sem))
          return -EINTR;
      mm->owner_of_mmap_sem_for_write = current;
      return 0;
  }

and make the rwsem_is_locked() test more robust by doing like

  bool mmap_sem_is_held_for_write_by_current(struct mm_struct *mm) {
      return mm->owner_of_mmap_sem_for_write == current;
  }

. If there is a guarantee that no thread is allowed to hold multiple
mmap_sem, wrapper functions which manipulate per "struct task_struct"
flag will work.

But the fundamental problem is that we are heavily relying on runtime
testing (e.g. lockdep / syzkaller). Since there are a lot of factors which
prevent sanity checks from being called (e.g. conditional calls based on
threshold check), we can not exercise all paths, and everybody is making
changes without understanding all the dependencies. Consider that nobody
noticed that relying on __GFP_DIRECT_RECLAIM with oom_lock held may cause
lockups. We are too easily introducing unsafe dependency. I think that we
need to describe all the dependencies without relying on runtime testing.

Back to MMU_INVALIDATE_DOES_NOT_BLOCK flag, I worry that we will fail to
notice when somebody in future makes changes with mmu notifier which
currently does not rely on __GFP_DIRECT_RECLAIM to by error rely on
__GFP_DIRECT_RECLAIM. Staying at "whether the callback might sleep"
granularity will help preventing such unnoticed dependency bugs from
being introduced.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
@ 2017-12-16 14:45           ` Tetsuo Handa
  0 siblings, 0 replies; 53+ messages in thread
From: Tetsuo Handa @ 2017-12-16 14:45 UTC (permalink / raw)
  To: mhocko
  Cc: akpm, rientjes, aarcange, benh, paulus, oded.gabbay,
	alexander.deucher, christian.koenig, airlied, joro, dledford,
	jani.nikula, mike.marciniszyn, sean.hefty, sivanich,
	boris.ostrovsky, jglisse, pbonzini, rkrcmar, linux-kernel,
	linux-mm

Michal Hocko wrote:
> On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> > rwsem_is_locked() test isn't equivalent with __mutex_owner() == current test, is it?
> > If rwsem_is_locked() returns true because somebody else has locked it, there is
> > no guarantee that current thread has locked it before calling this function.
> > 
> > down_write_trylock() test isn't equivalent with __mutex_owner() == current test, is it?
> > What if somebody else held it for read or write (the worst case is registration path),
> > down_write_trylock() will return false even if current thread has not locked it for
> > read or write.
> > 
> > I think this WARN_ON_ONCE() can not detect incorrect call to this function.
> 
> Yes it cannot catch _all_ cases. This is an inherent problem of
> rwsem_is_locked because semaphores do not really have the owner concept.
> The core idea behind this, I guess, is to catch obviously incorrect
> usage and as such it gives us a reasonabe coverage. I could live without
> the annotation but rwsem_is_locked looks better than down_write_trylock
> to me.

I agree that rwsem_is_locked() is better than down_write_trylock() because
the former does not have side effect when nobody was holding the rwsem.

Looking at how rwsem_is_locked() is used in mm/ directory for sanity checks,
only VM_BUG_ON(), VM_BUG_ON_MM(), or VM_BUG_ON_VMA() are used. Therefore,
this WARN_ON_ONCE() usage might be irregular.

Also, regarding the problem that semaphores do not really have the owner
concept, we can add "struct task_struct *owner_of_mmap_sem_for_write" to
"struct mm_struct" and replace direct down_write_killable() etc. with
corresponding wrapper functions like

  int __must_check get_mmap_sem_write_killable(struct mm_struct *mm) {
      if (down_write_killable(&mm->mmap_sem))
          return -EINTR;
      mm->owner_of_mmap_sem_for_write = current;
      return 0;
  }

and make the rwsem_is_locked() test more robust by doing like

  bool mmap_sem_is_held_for_write_by_current(struct mm_struct *mm) {
      return mm->owner_of_mmap_sem_for_write == current;
  }

. If there is a guarantee that no thread is allowed to hold multiple
mmap_sem, wrapper functions which manipulate per "struct task_struct"
flag will work.

But the fundamental problem is that we are heavily relying on runtime
testing (e.g. lockdep / syzkaller). Since there are a lot of factors which
prevent sanity checks from being called (e.g. conditional calls based on
threshold check), we can not exercise all paths, and everybody is making
changes without understanding all the dependencies. Consider that nobody
noticed that relying on __GFP_DIRECT_RECLAIM with oom_lock held may cause
lockups. We are too easily introducing unsafe dependency. I think that we
need to describe all the dependencies without relying on runtime testing.

Back to MMU_INVALIDATE_DOES_NOT_BLOCK flag, I worry that we will fail to
notice when somebody in future makes changes with mmu notifier which
currently does not rely on __GFP_DIRECT_RECLAIM to by error rely on
__GFP_DIRECT_RECLAIM. Staying at "whether the callback might sleep"
granularity will help preventing such unnoticed dependency bugs from
being introduced.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch -mm] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks fix fix
  2017-12-15 23:04     ` Andrew Morton
@ 2018-01-09 21:40       ` David Rientjes
  -1 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2018-01-09 21:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Fix mmu_notifier.h comments in "mm, mmu_notifier: annotate mmu notifiers
with blockable invalidate callbacks".

mmu_notifier_invalidate_range_end() can also call the invalidate_range()
callback, so it must not block for MMU_INVALIDATE_DOES_NOT_BLOCK to be
set.

Also remove a bogus comment about invalidate_range() always being called
under the ptl spinlock.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mmu_notifier.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -34,8 +34,8 @@ struct mmu_notifier_ops {
 	 * Flags to specify behavior of callbacks for this MMU notifier.
 	 * Used to determine which context an operation may be called.
 	 *
-	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
-	 *				  block
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+	 *	block
 	 */
 	int flags;
 
@@ -151,8 +151,9 @@ struct mmu_notifier_ops {
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
 	 *
-	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
-	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
+	 * If both of these callbacks cannot block, and invalidate_range
+	 * cannot block, mmu_notifier_ops.flags should have
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -175,12 +176,13 @@ struct mmu_notifier_ops {
 	 * external TLB range needs to be flushed. For more in depth
 	 * discussion on this see Documentation/vm/mmu_notifier.txt
 	 *
-	 * The invalidate_range() function is called under the ptl
-	 * spin-lock and not allowed to sleep.
-	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
 	 * called between those functions.
+	 *
+	 * If this callback cannot block, and invalidate_range_{start,end}
+	 * cannot block, mmu_notifier_ops.flags should have
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
 				 unsigned long start, unsigned long end);

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [patch -mm] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks fix fix
@ 2018-01-09 21:40       ` David Rientjes
  0 siblings, 0 replies; 53+ messages in thread
From: David Rientjes @ 2018-01-09 21:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Andrea Arcangeli, Benjamin Herrenschmidt,
	Paul Mackerras, Oded Gabbay, Alex Deucher, Christian König,
	David Airlie, Joerg Roedel, Doug Ledford, Jani Nikula,
	Mike Marciniszyn, Sean Hefty, Dimitri Sivanich, Boris Ostrovsky,
	Jérôme Glisse, Paolo Bonzini,
	Radim Krčmář,
	linux-kernel, linux-mm

Fix mmu_notifier.h comments in "mm, mmu_notifier: annotate mmu notifiers
with blockable invalidate callbacks".

mmu_notifier_invalidate_range_end() can also call the invalidate_range()
callback, so it must not block for MMU_INVALIDATE_DOES_NOT_BLOCK to be
set.

Also remove a bogus comment about invalidate_range() always being called
under the ptl spinlock.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mmu_notifier.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -34,8 +34,8 @@ struct mmu_notifier_ops {
 	 * Flags to specify behavior of callbacks for this MMU notifier.
 	 * Used to determine which context an operation may be called.
 	 *
-	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
-	 *				  block
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+	 *	block
 	 */
 	int flags;
 
@@ -151,8 +151,9 @@ struct mmu_notifier_ops {
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
 	 *
-	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
-	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
+	 * If both of these callbacks cannot block, and invalidate_range
+	 * cannot block, mmu_notifier_ops.flags should have
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -175,12 +176,13 @@ struct mmu_notifier_ops {
 	 * external TLB range needs to be flushed. For more in depth
 	 * discussion on this see Documentation/vm/mmu_notifier.txt
 	 *
-	 * The invalidate_range() function is called under the ptl
-	 * spin-lock and not allowed to sleep.
-	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
 	 * called between those functions.
+	 *
+	 * If this callback cannot block, and invalidate_range_{start,end}
+	 * cannot block, mmu_notifier_ops.flags should have
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
 				 unsigned long start, unsigned long end);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2018-01-09 21:40 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-11 22:11 [patch 1/2] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks David Rientjes
2017-12-11 22:11 ` David Rientjes
2017-12-11 22:11 ` [patch 2/2] mm, oom: avoid reaping only for mm's " David Rientjes
2017-12-11 22:11   ` David Rientjes
2017-12-11 22:23 ` [patch 1/2] mm, mmu_notifier: annotate mmu notifiers " Paolo Bonzini
2017-12-11 22:23   ` Paolo Bonzini
2017-12-11 23:09   ` David Rientjes
2017-12-11 23:09     ` David Rientjes
2017-12-12 20:05 ` Dimitri Sivanich
2017-12-12 20:05   ` Dimitri Sivanich
2017-12-12 21:28   ` David Rientjes
2017-12-12 21:28     ` David Rientjes
2017-12-13  9:34     ` Christian König
2017-12-13  9:34       ` Christian König
2017-12-13 10:26       ` Tetsuo Handa
2017-12-13 10:26         ` Tetsuo Handa
2017-12-13 10:37         ` Paolo Bonzini
2017-12-13 10:37           ` Paolo Bonzini
2017-12-14  9:19       ` David Rientjes
2017-12-14  9:19         ` David Rientjes
2017-12-14 12:46 ` kbuild test robot
2017-12-14 21:30 ` [patch v2 " David Rientjes
2017-12-14 21:30   ` David Rientjes
2017-12-14 21:31   ` [patch v2 2/2] mm, oom: avoid reaping only for mm's " David Rientjes
2017-12-14 21:31     ` David Rientjes
2017-12-15 16:35     ` Michal Hocko
2017-12-15 16:35       ` Michal Hocko
2017-12-15 21:46       ` David Rientjes
2017-12-15 21:46         ` David Rientjes
2017-12-15  8:42   ` [patch v2 1/2] mm, mmu_notifier: annotate mmu notifiers " Paolo Bonzini
2017-12-15  8:42     ` Paolo Bonzini
2017-12-15 12:19   ` Christian König
2017-12-15 12:19     ` Christian König
2017-12-15 13:36   ` Dimitri Sivanich
2017-12-15 13:36     ` Dimitri Sivanich
2017-12-15 16:25   ` Michal Hocko
2017-12-15 16:25     ` Michal Hocko
2017-12-16  6:21     ` Tetsuo Handa
2017-12-16  6:21       ` Tetsuo Handa
2017-12-16 11:33       ` Michal Hocko
2017-12-16 11:33         ` Michal Hocko
2017-12-15 23:04   ` Andrew Morton
2017-12-15 23:04     ` Andrew Morton
2017-12-16  7:14     ` Tetsuo Handa
2017-12-16  7:14       ` Tetsuo Handa
2017-12-16 11:36       ` Michal Hocko
2017-12-16 11:36         ` Michal Hocko
2017-12-16 14:45         ` Tetsuo Handa
2017-12-16 14:45           ` Tetsuo Handa
2017-12-16  9:10     ` Michal Hocko
2017-12-16  9:10       ` Michal Hocko
2018-01-09 21:40     ` [patch -mm] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks fix fix David Rientjes
2018-01-09 21:40       ` David Rientjes

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.