From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: Daniel Jordan <daniel.m.jordan@oracle.com>, jgg@ziepe.ca
Cc: dave@stgolabs.net, jack@suse.cz, kvm@vger.kernel.org,
	atull@kernel.org, linux-fpga@vger.kernel.org,
	linux-kernel@vger.kernel.org, kvm-ppc@vger.kernel.org,
	linux-mm@kvack.org, alex.williamson@redhat.com, mdf@kernel.org,
	akpm@linux-foundation.org, linuxppc-dev@lists.ozlabs.org,
	cl@linux.com, hao.wu@intel.com
Subject: Re: [PATCH 2/5] vfio/spapr_tce: use pinned_vm instead of locked_vm to account pinned pages
Date: Tue, 12 Feb 2019 17:56:18 +1100
Message-ID: <ee4d14db-05c3-6208-503c-16e287fa78eb@ozlabs.ru>
In-Reply-To: <20190211224437.25267-3-daniel.m.jordan@oracle.com>
On 12/02/2019 09:44, Daniel Jordan wrote:
> Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
> pages"), locked and pinned pages are accounted separately.  The SPAPR
> TCE VFIO IOMMU driver accounts pinned pages to locked_vm; use pinned_vm
> instead.
> 
> pinned_vm recently became atomic and so no longer relies on mmap_sem
> held as writer: delete.
> 
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> ---
>  Documentation/vfio.txt              |  6 +--
>  drivers/vfio/vfio_iommu_spapr_tce.c | 64 ++++++++++++++---------------
>  2 files changed, 33 insertions(+), 37 deletions(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index f1a4d3c3ba0b..fa37d65363f9 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -308,7 +308,7 @@ This implementation has some specifics:
>     currently there is no way to reduce the number of calls. In order to make
>     things faster, the map/unmap handling has been implemented in real mode
>     which provides an excellent performance which has limitations such as
> -   inability to do locked pages accounting in real time.
> +   inability to do pinned pages accounting in real time.
>  
>  4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
>     subtree that can be treated as a unit for the purposes of partitioning and
> @@ -324,7 +324,7 @@ This implementation has some specifics:
>  		returns the size and the start of the DMA window on the PCI bus.
>  
>  	VFIO_IOMMU_ENABLE
> -		enables the container. The locked pages accounting
> +		enables the container. The pinned pages accounting
>  		is done at this point. This lets user first to know what
>  		the DMA window is and adjust rlimit before doing any real job.
>  
> @@ -454,7 +454,7 @@ This implementation has some specifics:
>  
>     PPC64 paravirtualized guests generate a lot of map/unmap requests,
>     and the handling of those includes pinning/unpinning pages and updating
> -   mm::locked_vm counter to make sure we do not exceed the rlimit.
> +   mm::pinned_vm counter to make sure we do not exceed the rlimit.
>     The v2 IOMMU splits accounting and pinning into separate operations:
>  
>     - VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index c424913324e3..f47e020dc5e4 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -34,9 +34,11 @@
>  static void tce_iommu_detach_group(void *iommu_data,
>  		struct iommu_group *iommu_group);
>  
> -static long try_increment_locked_vm(struct mm_struct *mm, long npages)
> +static long try_increment_pinned_vm(struct mm_struct *mm, long npages)
>  {
> -	long ret = 0, locked, lock_limit;
> +	long ret = 0;
> +	s64 pinned;
> +	unsigned long lock_limit;
>  
>  	if (WARN_ON_ONCE(!mm))
>  		return -EPERM;
> @@ -44,39 +46,33 @@ static long try_increment_locked_vm(struct mm_struct *mm, long npages)
>  	if (!npages)
>  		return 0;
>  
> -	down_write(&mm->mmap_sem);
> -	locked = mm->locked_vm + npages;
> +	pinned = atomic64_add_return(npages, &mm->pinned_vm);
>  	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
> +	if (pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
>  		ret = -ENOMEM;
> -	else
> -		mm->locked_vm += npages;
> +		atomic64_sub(npages, &mm->pinned_vm);
> +	}
>  
> -	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
> +	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%lu%s\n", current->pid,
>  			npages << PAGE_SHIFT,
> -			mm->locked_vm << PAGE_SHIFT,
> -			rlimit(RLIMIT_MEMLOCK),
> -			ret ? " - exceeded" : "");
> -
> -	up_write(&mm->mmap_sem);
> +			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
> +			rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
>  
>  	return ret;
>  }
>  
> -static void decrement_locked_vm(struct mm_struct *mm, long npages)
> +static void decrement_pinned_vm(struct mm_struct *mm, long npages)
>  {
>  	if (!mm || !npages)
>  		return;
>  
> -	down_write(&mm->mmap_sem);
> -	if (WARN_ON_ONCE(npages > mm->locked_vm))
> -		npages = mm->locked_vm;
> -	mm->locked_vm -= npages;
> -	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
> +	if (WARN_ON_ONCE(npages > atomic64_read(&mm->pinned_vm)))
> +		npages = atomic64_read(&mm->pinned_vm);
> +	atomic64_sub(npages, &mm->pinned_vm);
> +	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%lu\n", current->pid,
>  			npages << PAGE_SHIFT,
> -			mm->locked_vm << PAGE_SHIFT,
> +			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
>  			rlimit(RLIMIT_MEMLOCK));
> -	up_write(&mm->mmap_sem);


So it used to be down_write+up_write with everything in between.

Now there are three independent accesses (actually four, but the last
one is only diagnostic) with no locking around them. Why exactly do we
not need a lock anymore? Thanks,
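
For reference, the sequence in question reduces to the pattern below.
This is a minimal userspace sketch using C11 atomics with hypothetical
names, not the kernel code itself: the speculative add and the
compensating subtract are separate atomic operations, so pinned_vm can
transiently overshoot the rlimit while a failing caller sits between
them. Each access is individually atomic, but the check-then-act
sequence as a whole is not.

    /* Userspace analogue of try_increment_pinned_vm(); all names
     * here are illustrative, not kernel API. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_long pinned_vm;     /* stands in for mm->pinned_vm */
    static const long lock_limit = 8; /* stands in for rlimit() >> PAGE_SHIFT */

    static bool try_increment_pinned(long npages)
    {
            /* Access 1: speculatively charge the pages. */
            long pinned = atomic_fetch_add(&pinned_vm, npages) + npages;

            /* Access 2 is the limit read; the comparison can race
             * with other pinners. */
            if (pinned > lock_limit) {
                    /* Access 3: back the charge out.  Between access 1
                     * and here, pinned_vm overshoots the limit. */
                    atomic_fetch_sub(&pinned_vm, npages);
                    return false;
            }
            return true;
    }

    int main(void)
    {
            printf("pin 6: %s\n", try_increment_pinned(6) ? "ok" : "ENOMEM");
            printf("pin 4: %s\n", try_increment_pinned(4) ? "ok" : "ENOMEM");
            printf("pinned_vm now: %ld\n", atomic_load(&pinned_vm));
            return 0;
    }

Whether that transient overshoot (and the racy limit check it implies)
is acceptable is exactly the question above.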
>  }
>  
>  /*
> @@ -110,7 +106,7 @@ struct tce_container {
>  	bool enabled;
>  	bool v2;
>  	bool def_window_pending;
> -	unsigned long locked_pages;
> +	unsigned long pinned_pages;
>  	struct mm_struct *mm;
>  	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>  	struct list_head group_list;
> @@ -283,7 +279,7 @@ static int tce_iommu_find_free_table(struct tce_container *container)
>  static int tce_iommu_enable(struct tce_container *container)
>  {
>  	int ret = 0;
> -	unsigned long locked;
> +	unsigned long pinned;
>  	struct iommu_table_group *table_group;
>  	struct tce_iommu_group *tcegrp;
>  
> @@ -292,15 +288,15 @@ static int tce_iommu_enable(struct tce_container *container)
>  
>  	/*
>  	 * When userspace pages are mapped into the IOMMU, they are effectively
> -	 * locked memory, so, theoretically, we need to update the accounting
> -	 * of locked pages on each map and unmap.  For powerpc, the map unmap
> +	 * pinned memory, so, theoretically, we need to update the accounting
> +	 * of pinned pages on each map and unmap.  For powerpc, the map unmap
>  	 * paths can be very hot, though, and the accounting would kill
>  	 * performance, especially since it would be difficult to impossible
>  	 * to handle the accounting in real mode only.
>  	 *
>  	 * To address that, rather than precisely accounting every page, we
> -	 * instead account for a worst case on locked memory when the iommu is
> -	 * enabled and disabled.  The worst case upper bound on locked memory
> +	 * instead account for a worst case on pinned memory when the iommu is
> +	 * enabled and disabled.  The worst case upper bound on pinned memory
>  	 * is the size of the whole iommu window, which is usually relatively
>  	 * small (compared to total memory sizes) on POWER hardware.
>  	 *
> @@ -317,7 +313,7 @@ static int tce_iommu_enable(struct tce_container *container)
>  	 *
>  	 * So we do not allow enabling a container without a group attached
>  	 * as there is no way to know how much we should increment
> -	 * the locked_vm counter.
> +	 * the pinned_vm counter.
>  	 */
>  	if (!tce_groups_attached(container))
>  		return -ENODEV;
> @@ -335,12 +331,12 @@ static int tce_iommu_enable(struct tce_container *container)
>  	if (ret)
>  		return ret;
>  
> -	locked = table_group->tce32_size >> PAGE_SHIFT;
> -	ret = try_increment_locked_vm(container->mm, locked);
> +	pinned = table_group->tce32_size >> PAGE_SHIFT;
> +	ret = try_increment_pinned_vm(container->mm, pinned);
>  	if (ret)
>  		return ret;
>  
> -	container->locked_pages = locked;
> +	container->pinned_pages = pinned;
>  
>  	container->enabled = true;
>  
> @@ -355,7 +351,7 @@ static void tce_iommu_disable(struct tce_container *container)
>  	container->enabled = false;
>  
>  	BUG_ON(!container->mm);
> -	decrement_locked_vm(container->mm, container->locked_pages);
> +	decrement_pinned_vm(container->mm, container->pinned_pages);
>  }
>  
>  static void *tce_iommu_open(unsigned long arg)
> @@ -658,7 +654,7 @@ static long tce_iommu_create_table(struct tce_container *container,
>  	if (!table_size)
>  		return -EINVAL;
>  
> -	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
> +	ret = try_increment_pinned_vm(container->mm, table_size >> PAGE_SHIFT);
>  	if (ret)
>  		return ret;
>  
> @@ -677,7 +673,7 @@ static void tce_iommu_free_table(struct tce_container *container,
>  	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
>  
>  	iommu_tce_table_put(tbl);
> -	decrement_locked_vm(container->mm, pages);
> +	decrement_pinned_vm(container->mm, pages);
>  }
>  
>  static long tce_iommu_create_window(struct tce_container *container,
> 
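As a concrete illustration of the worst-case accounting described in
the comment above: for a hypothetical 2 GiB tce32 window on a system
with 64 KiB pages, tce_iommu_enable() charges 2^31 >> 16 = 32768 pages
to pinned_vm once, up front, no matter how many pages the guest later
maps or unmaps.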

-- 
Alexey
