linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Khalid Aziz <khalid.aziz@oracle.com>
To: Barry Song <21cnbao@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Matthew Wilcox <willy@infradead.org>,
	Aneesh Kumar <aneesh.kumar@linux.ibm.com>,
	Arnd Bergmann <arnd@arndb.de>, Jonathan Corbet <corbet@lwn.net>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	David Hildenbrand <david@redhat.com>,
	ebiederm@xmission.com, hagen@jauu.net, jack@suse.cz,
	Kees Cook <keescook@chromium.org>,
	kirill@shutemov.name, kucharsk@gmail.com, linkinjeon@kernel.org,
	linux-fsdevel@vger.kernel.org,
	LKML <linux-kernel@vger.kernel.org>,
	Linux-MM <linux-mm@kvack.org>,
	longpeng2@huawei.com, Andy Lutomirski <luto@kernel.org>,
	markhemm@googlemail.com, Peter Collingbourne <pcc@google.com>,
	Mike Rapoport <rppt@kernel.org>,
	sieberf@amazon.com, sjpark@amazon.de,
	Suren Baghdasaryan <surenb@google.com>,
	tst@schoebel-theuer.de, Iurii Zaikin <yzaikin@google.com>
Subject: Re: [PATCH v1 08/14] mm/mshare: Add basic page table sharing using mshare
Date: Tue, 28 Jun 2022 14:16:46 -0600	[thread overview]
Message-ID: <d5795029-9d77-b4e1-b92e-0b6bc920a0d0@oracle.com> (raw)
In-Reply-To: <CAGsJ_4ySieW+9a2yZQPVa1VA6dtv=9m2qEZ8w4ZoT6PqRFpLDg@mail.gmail.com>

On 5/30/22 21:46, Barry Song wrote:
> On Tue, Apr 12, 2022 at 4:07 AM Khalid Aziz <khalid.aziz@oracle.com> wrote:
>>
>> This patch adds basic page table sharing across tasks by making
>> mshare syscall. It does this by creating a new mm_struct which
>> hosts the shared vmas and page tables. This mm_struct is
>> maintained as long as there is at least one task using the mshare'd
>> range. It is cleaned up by the last mshare_unlink syscall.
>>
>> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>> ---
>>   mm/internal.h |   2 +
>>   mm/memory.c   |  35 ++++++++++
>>   mm/mshare.c   | 190 ++++++++++++++++++++++++++++++++++++++++++++++----
>>   3 files changed, 214 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/internal.h b/mm/internal.h
>> index cf50a471384e..68f82f0f8b66 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
>>   int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
>>                        unsigned long addr, int page_nid, int *flags);
>>
>> +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma,
>> +                       unsigned long *addrp);
>>   static inline bool vma_is_shared(const struct vm_area_struct *vma)
>>   {
>>          return vma->vm_flags & VM_SHARED_PT;
>> diff --git a/mm/memory.c b/mm/memory.c
>> index c125c4969913..c77c0d643ea8 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>>                             unsigned int flags, struct pt_regs *regs)
>>   {
>>          vm_fault_t ret;
>> +       bool shared = false;
>>
>>          __set_current_state(TASK_RUNNING);
>>
>> @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>>          /* do counter updates before entering really critical section. */
>>          check_sync_rss_stat(current);
>>
>> +       if (unlikely(vma_is_shared(vma))) {
>> +               ret = find_shared_vma(&vma, &address);
>> +               if (ret)
>> +                       return ret;
>> +               if (!vma)
>> +                       return VM_FAULT_SIGSEGV;
>> +               shared = true;
>> +       }
>> +
>>          if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>>                                              flags & FAULT_FLAG_INSTRUCTION,
>>                                              flags & FAULT_FLAG_REMOTE))
>> @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>>          else
>>                  ret = __handle_mm_fault(vma, address, flags);
>>
>> +       /*
>> +        * Release the read lock on shared VMA's parent mm unless
>> +        * __handle_mm_fault released the lock already.
>> +        * __handle_mm_fault sets VM_FAULT_RETRY in return value if
>> +        * it released mmap lock. If lock was released, that implies
>> +        * the lock would have been released on task's original mm if
>> +        * this were not a shared PTE vma. To keep lock state consistent,
>> +        * make sure to release the lock on task's original mm
>> +        */
>> +       if (shared) {
>> +               int release_mmlock = 1;
>> +
>> +               if (!(ret & VM_FAULT_RETRY)) {
>> +                       mmap_read_unlock(vma->vm_mm);
>> +                       release_mmlock = 0;
>> +               } else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
>> +                       (flags & FAULT_FLAG_RETRY_NOWAIT)) {
>> +                       mmap_read_unlock(vma->vm_mm);
>> +                       release_mmlock = 0;
>> +               }
>> +
>> +               if (release_mmlock)
>> +                       mmap_read_unlock(current->mm);
>> +       }
>> +
>>          if (flags & FAULT_FLAG_USER) {
>>                  mem_cgroup_exit_user_fault();
>>                  /*
>> diff --git a/mm/mshare.c b/mm/mshare.c
>> index cd2f7ad24d9d..d1896adcb00f 100644
>> --- a/mm/mshare.c
>> +++ b/mm/mshare.c
>> @@ -17,18 +17,49 @@
>>   #include <linux/pseudo_fs.h>
>>   #include <linux/fileattr.h>
>>   #include <linux/refcount.h>
>> +#include <linux/mman.h>
>>   #include <linux/sched/mm.h>
>>   #include <uapi/linux/magic.h>
>>   #include <uapi/linux/limits.h>
>>
>>   struct mshare_data {
>> -       struct mm_struct *mm;
>> +       struct mm_struct *mm, *host_mm;
>>          mode_t mode;
>>          refcount_t refcnt;
>>   };
>>
>>   static struct super_block *msharefs_sb;
>>
>> +/* Returns holding the host mm's lock for read.  Caller must release. */
>> +vm_fault_t
>> +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
>> +{
>> +       struct vm_area_struct *vma, *guest = *vmap;
>> +       struct mshare_data *info = guest->vm_private_data;
>> +       struct mm_struct *host_mm = info->mm;
>> +       unsigned long host_addr;
>> +       pgd_t *pgd, *guest_pgd;
>> +
>> +       host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
>> +       pgd = pgd_offset(host_mm, host_addr);
>> +       guest_pgd = pgd_offset(current->mm, *addrp);
>> +       if (!pgd_same(*guest_pgd, *pgd)) {
>> +               set_pgd(guest_pgd, *pgd);
>> +               return VM_FAULT_NOPAGE;
>> +       }
>> +
>> +       *addrp = host_addr;
>> +       mmap_read_lock(host_mm);
>> +       vma = find_vma(host_mm, host_addr);
>> +
>> +       /* XXX: expand stack? */
>> +       if (vma && vma->vm_start > host_addr)
>> +               vma = NULL;
>> +
>> +       *vmap = vma;
>> +       return 0;
>> +}
>> +
>>   static void
>>   msharefs_evict_inode(struct inode *inode)
>>   {
>> @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
>>                  unsigned long, len, int, oflag, mode_t, mode)
>>   {
>>          struct mshare_data *info;
>> -       struct mm_struct *mm;
>>          struct filename *fname = getname(name);
>>          struct dentry *dentry;
>>          struct inode *inode;
>>          struct qstr namestr;
>> +       struct vm_area_struct *vma, *next, *new_vma;
>> +       struct mm_struct *new_mm;
>> +       unsigned long end;
>>          int err = PTR_ERR(fname);
>>
>>          /*
>> @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
>>          if (IS_ERR(fname))
>>                  goto err_out;
>>
>> +       end = addr + len;
>> +
>>          /*
>>           * Does this mshare entry exist already? If it does, calling
>>           * mshare with O_EXCL|O_CREAT is an error
>> @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
>>          inode_lock(d_inode(msharefs_sb->s_root));
>>          dentry = d_lookup(msharefs_sb->s_root, &namestr);
>>          if (dentry && (oflag & (O_EXCL|O_CREAT))) {
>> +               inode = d_inode(dentry);
>>                  err = -EEXIST;
>>                  dput(dentry);
>>                  goto err_unlock_inode;
>>          }
>>
>>          if (dentry) {
>> +               unsigned long mapaddr, prot = PROT_NONE;
>> +
>>                  inode = d_inode(dentry);
>>                  if (inode == NULL) {
>> +                       mmap_write_unlock(current->mm);
>>                          err = -EINVAL;
>>                          goto err_out;
>>                  }
>>                  info = inode->i_private;
>> -               refcount_inc(&info->refcnt);
>>                  dput(dentry);
>> +
>> +               /*
>> +                * Map in the address range as anonymous mappings
>> +                */
>> +               oflag &= (O_RDONLY | O_WRONLY | O_RDWR);
>> +               if (oflag & O_RDONLY)
>> +                       prot |= PROT_READ;
>> +               else if (oflag & O_WRONLY)
>> +                       prot |= PROT_WRITE;
>> +               else if (oflag & O_RDWR)
>> +                       prot |= (PROT_READ | PROT_WRITE);
>> +               mapaddr = vm_mmap(NULL, addr, len, prot,
>> +                               MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0);
>> +               if (IS_ERR((void *)mapaddr)) {
>> +                       err = -EINVAL;
>> +                       goto err_out;
>> +               }
>> +
>> +               refcount_inc(&info->refcnt);
>> +
>> +               /*
>> +                * Now that we have mmap'd the mshare'd range, update vma
>> +                * flags and vm_mm pointer for this mshare'd range.
>> +                */
>> +               mmap_write_lock(current->mm);
>> +               vma = find_vma(current->mm, addr);
>> +               if (vma && vma->vm_start < addr) {
> 
> and I don't understand why "vma->vm_start < addr" can happen.
> does it mean we have given mshare() an address which is not
> aligned? then we should have return -EINVAL earlier?
> 
Yes, this could potentially be caught earlier. I have rewritten the code to new API based upon mmap() entirely without 
adding new system calls. Much of the above code is no longer needed since mmap takes care of lots of these checks 
already. I am running some final tests on v2 patch series and will send it out shortly.

Thanks,
Khalid



  reply	other threads:[~2022-06-28 20:17 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-11 16:05 [PATCH v1 00/14] Add support for shared PTEs across processes Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 01/14] mm: Add new system calls mshare, mshare_unlink Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 02/14] mm/mshare: Add msharefs filesystem Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 03/14] mm/mshare: Add read for msharefs Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 04/14] mm/mshare: implement mshare_unlink syscall Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 05/14] mm/mshare: Add locking to msharefs syscalls Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 06/14] mm/mshare: Check for mounted filesystem Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 07/14] mm/mshare: Add vm flag for shared PTE Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 08/14] mm/mshare: Add basic page table sharing using mshare Khalid Aziz
2022-04-11 18:48   ` Dave Hansen
2022-04-11 20:39     ` Khalid Aziz
2022-05-30 11:11   ` Barry Song
2022-06-28 20:11     ` Khalid Aziz
2022-05-31  3:46   ` Barry Song
2022-06-28 20:16     ` Khalid Aziz [this message]
2022-04-11 16:05 ` [PATCH v1 09/14] mm/mshare: Do not free PTEs for mshare'd PTEs Khalid Aziz
2022-05-31  4:24   ` Barry Song
2022-06-29 17:38     ` Khalid Aziz
2022-07-03 20:54       ` Andy Lutomirski
2022-07-06 20:33         ` Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 10/14] mm/mshare: Check for mapped vma when mshare'ing existing mshare'd range Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 11/14] mm/mshare: unmap vmas in mshare_unlink Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 12/14] mm/mshare: Add a proc file with mshare alignment/size information Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 13/14] mm/mshare: Enforce mshare'd region permissions Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 14/14] mm/mshare: Copy PTEs to host mm Khalid Aziz
2022-04-11 17:37 ` [PATCH v1 00/14] Add support for shared PTEs across processes Matthew Wilcox
2022-04-11 18:51   ` Dave Hansen
2022-04-11 19:08     ` Matthew Wilcox
2022-04-11 19:52   ` Khalid Aziz
2022-04-11 18:47 ` Dave Hansen
2022-04-11 20:10 ` Eric W. Biederman
2022-04-11 22:21   ` Khalid Aziz
2022-05-30 10:48 ` Barry Song
2022-05-30 11:18   ` David Hildenbrand
2022-05-30 11:49     ` Barry Song
2022-06-29 17:48     ` Khalid Aziz
2022-06-29 17:40   ` Khalid Aziz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d5795029-9d77-b4e1-b92e-0b6bc920a0d0@oracle.com \
    --to=khalid.aziz@oracle.com \
    --cc=21cnbao@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=arnd@arndb.de \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=hagen@jauu.net \
    --cc=jack@suse.cz \
    --cc=keescook@chromium.org \
    --cc=kirill@shutemov.name \
    --cc=kucharsk@gmail.com \
    --cc=linkinjeon@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longpeng2@huawei.com \
    --cc=luto@kernel.org \
    --cc=markhemm@googlemail.com \
    --cc=pcc@google.com \
    --cc=rppt@kernel.org \
    --cc=sieberf@amazon.com \
    --cc=sjpark@amazon.de \
    --cc=surenb@google.com \
    --cc=tst@schoebel-theuer.de \
    --cc=willy@infradead.org \
    --cc=yzaikin@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).