From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758780AbYJQXOo (ORCPT ); Fri, 17 Oct 2008 19:14:44 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1757601AbYJQXL4 (ORCPT ); Fri, 17 Oct 2008 19:11:56 -0400 Received: from mailhub.sw.ru ([195.214.232.25]:9774 "EHLO relay.sw.ru" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757522AbYJQXLz (ORCPT ); Fri, 17 Oct 2008 19:11:55 -0400 From: Andrey Mirkin To: containers@lists.linux-foundation.org, linux-kernel@vger.kernel.org Cc: Pavel Emelyanov , Andrey Mirkin Subject: [PATCH 09/10] Introduce functions to restore mm Date: Sat, 18 Oct 2008 03:11:37 +0400 Message-Id: <1224285098-573-10-git-send-email-major@openvz.org> X-Mailer: git-send-email 1.5.6 In-Reply-To: <1224285098-573-9-git-send-email-major@openvz.org> References: <1224285098-573-1-git-send-email-major@openvz.org> <1224285098-573-2-git-send-email-major@openvz.org> <1224285098-573-3-git-send-email-major@openvz.org> <1224285098-573-4-git-send-email-major@openvz.org> <1224285098-573-5-git-send-email-major@openvz.org> <1224285098-573-6-git-send-email-major@openvz.org> <1224285098-573-7-git-send-email-major@openvz.org> <1224285098-573-8-git-send-email-major@openvz.org> <1224285098-573-9-git-send-email-major@openvz.org> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Functions to restore mm, VMAs and mm context are added. 
Signed-off-by: Andrey Mirkin
---
Reviewer notes (not part of the commit message):
 - The #include targets of checkpoint/rst_mm.c were lost from this copy of
   the patch; they are rebuilt below from the symbols the file uses and
   should be checked against the original posting.
 - The blob ids on the "index" lines are those of the original posting.
 - Fixes relative to the original posting:
   * fput() was called on an ERR_PTR when filp_open() failed
   * the file name read from the image was not NUL-terminated and a zero
     length was accepted
   * the page reference leaked on the get_user_pages() error paths, and
     get_user_pages() was called without mmap_sem
   * page blocks were not checked against the range of their VMA
   * sys_mprotect() results were ignored and the saved protection was not
     put back on error
   * the new LDT buffer leaked on read error and its size was not validated
   * the do_munmap() result was ignored
   * cpt_ptr_import() dropped the __user annotation in its cast

 checkpoint/Makefile      |    2 +-
 checkpoint/checkpoint.h  |    1 +
 checkpoint/cpt_image.h   |    9 +
 checkpoint/rst_mm.c      |  440 ++++++++++++++++++++++++++++++++++++++++++++++
 checkpoint/rst_process.c |    3 +-
 mm/mmap.c                |    1 +
 mm/mprotect.c            |    2 +
 7 files changed, 456 insertions(+), 2 deletions(-)
 create mode 100644 checkpoint/rst_mm.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 689a0eb..19ca732 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -3,4 +3,4 @@ obj-y += sys_core.o
 obj-$(CONFIG_CHECKPOINT) += cptrst.o
 
 cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o \
-	       rst_process.o
+	       rst_process.o rst_mm.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 1d0ca49..195fdc6 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -65,3 +65,4 @@ int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
 int restart_container(struct cpt_context *ctx);
 int rst_get_object(int type, void *tmp, int size, struct cpt_context *ctx);
 int rst_restart_process(struct cpt_context *ctx);
+int rst_restore_mm(struct cpt_context *ctx);
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index 160cf85..e1fb483 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -233,6 +233,15 @@ struct cpt_x86_regs
 	__u32 cpt_ss;
 } __attribute__ ((aligned (8)));
 
+/*
+ * Convert a 64-bit address stored in the image into a user-space pointer
+ * of the running kernel.
+ */
+static inline void __user *cpt_ptr_import(__u64 ptr)
+{
+	return (void __user *)(unsigned long)ptr;
+}
+
 static inline __u64 cpt_timespec_export(struct timespec *tv)
 {
 	return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
diff --git a/checkpoint/rst_mm.c b/checkpoint/rst_mm.c
new file mode 100644
index 0000000..fe53c45
--- /dev/null
+++ b/checkpoint/rst_mm.c
@@ -0,0 +1,440 @@
+/*
+ *  Copyright (C) 2008 Parallels, Inc.
+ *
+ *  Author: Andrey Mirkin
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+/*
+ * Translate the VM_* bits saved in a VMA image into the PROT_* bits
+ * handed to do_mmap_pgoff() and sys_mprotect().
+ */
+static unsigned long make_prot(struct cpt_vma_image *vmai)
+{
+	unsigned long prot = 0;
+
+	if (vmai->cpt_flags & VM_READ)
+		prot |= PROT_READ;
+	if (vmai->cpt_flags & VM_WRITE)
+		prot |= PROT_WRITE;
+	if (vmai->cpt_flags & VM_EXEC)
+		prot |= PROT_EXEC;
+	if (vmai->cpt_flags & VM_GROWSDOWN)
+		prot |= PROT_GROWSDOWN;
+	if (vmai->cpt_flags & VM_GROWSUP)
+		prot |= PROT_GROWSUP;
+	return prot;
+}
+
+/*
+ * Translate the VM_* bits saved in a VMA image into MAP_* flags.
+ * MAP_FIXED is always set: the caller treats any address other than the
+ * saved one as a failure.
+ */
+static unsigned long make_flags(struct cpt_vma_image *vmai)
+{
+	unsigned long flags = MAP_FIXED;
+
+	if (vmai->cpt_flags & (VM_SHARED | VM_MAYSHARE))
+		flags |= MAP_SHARED;
+	else
+		flags |= MAP_PRIVATE;
+
+	if (vmai->cpt_file == CPT_NULL)
+		flags |= MAP_ANONYMOUS;
+	if (vmai->cpt_flags & VM_GROWSDOWN)
+		flags |= MAP_GROWSDOWN;
+#ifdef MAP_GROWSUP
+	if (vmai->cpt_flags & VM_GROWSUP)
+		flags |= MAP_GROWSUP;
+#endif
+	if (vmai->cpt_flags & VM_DENYWRITE)
+		flags |= MAP_DENYWRITE;
+	if (vmai->cpt_flags & VM_EXECUTABLE)
+		flags |= MAP_EXECUTABLE;
+	if (!(vmai->cpt_flags & VM_ACCOUNT))
+		flags |= MAP_NORESERVE;
+	return flags;
+}
+
+/*
+ * Read the CPT_OBJ_NAME object that follows a file-backed VMA image and
+ * open the file it names. On success the opened file is stored in *filp
+ * and the caller owns the reference.
+ */
+static int rst_open_vma_file(struct cpt_context *ctx, struct file **filp)
+{
+	struct cpt_object_hdr h;
+	struct file *file;
+	char *path;
+	int len;
+	int err;
+
+	err = rst_get_object(CPT_OBJ_NAME, &h, sizeof(h), ctx);
+	if (err)
+		return err;
+
+	len = h.cpt_len - sizeof(h);
+	if (len <= 0 || len > PATH_MAX)
+		return -EINVAL;
+
+	/* One spare byte: do not rely on the image to terminate the name */
+	path = kmalloc(len + 1, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	err = ctx->read(path, len, ctx);
+	if (err)
+		goto out_free;
+	path[len] = '\0';
+
+	/* Just open file
+	   TODO: open with correct flags */
+	file = filp_open(path, O_RDONLY, 0);
+	if (IS_ERR(file))
+		err = PTR_ERR(file);
+	else
+		*filp = file;
+
+out_free:
+	kfree(path);
+	return err;
+}
+
+/*
+ * Map the area described by @vmai at its saved address and make sure a
+ * VMA starts exactly at vmai->cpt_start.
+ */
+static int rst_map_vma(struct cpt_vma_image *vmai, struct file *file,
+		       unsigned long prot)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long addr;
+	int err = 0;
+
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, vmai->cpt_start,
+			     vmai->cpt_end - vmai->cpt_start,
+			     prot, make_flags(vmai), vmai->cpt_pgoff);
+	if (addr != vmai->cpt_start) {
+		err = IS_ERR_VALUE(addr) ? (int)addr : -EINVAL;
+		goto out_unlock;
+	}
+
+	vma = find_vma(mm, vmai->cpt_start);
+	if (vma == NULL) {
+		eprintk("cannot find mmapped vma\n");
+		err = -ESRCH;
+		goto out_unlock;
+	}
+
+	/* do_mmap_pgoff() can merge new area to previous one (not to the next,
+	 * we mmap in order, the rest of mm is still unmapped). This can happen
+	 * f.e. if flags are to be adjusted later, or if we had different
+	 * anon_vma on two adjacent regions. Split it by brute force. */
+	if (vma->vm_start != vmai->cpt_start) {
+		err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0);
+		if (err)
+			eprintk("cannot split vma\n");
+	}
+
+out_unlock:
+	up_write(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * A page block must lie inside the VMA it was saved with; otherwise
+ * restoring it would write outside the area that was just mapped.
+ */
+static int rst_check_page_block(struct cpt_vma_image *vmai,
+				struct cpt_page_block *pb)
+{
+	if (pb->cpt_start > pb->cpt_end)
+		return -EINVAL;
+	if (pb->cpt_start < vmai->cpt_start || pb->cpt_end > vmai->cpt_end)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Fill a page block of a read-only, unaccounted area through
+ * get_user_pages() with write and force set.
+ * I guess this is get_user_pages() messed things, this happens f.e. when
+ * gdb inserts breakpoints.
+ */
+static int rst_restore_pages_gup(struct cpt_page_block *pb,
+				 struct cpt_context *ctx)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long start = pb->cpt_start;
+	unsigned long nr_pages = (pb->cpt_end - pb->cpt_start) / PAGE_SIZE;
+	unsigned long n;
+
+	for (n = 0; n < nr_pages; n++) {
+		struct page *page;
+		void *maddr;
+		int err;
+
+		/* get_user_pages() has to be called with mmap_sem held */
+		down_read(&mm->mmap_sem);
+		err = get_user_pages(current, mm, start + n * PAGE_SIZE,
+				     1, 1, 1, &page, NULL);
+		up_read(&mm->mmap_sem);
+		if (err == 0)
+			err = -EFAULT;
+		if (err < 0) {
+			eprintk("get_user_pages: %d\n", err);
+			return err;
+		}
+
+		err = 0;
+		maddr = kmap(page);
+		if (pb->cpt_content == CPT_CONTENT_VOID)
+			memset(maddr, 0, PAGE_SIZE);
+		else if (pb->cpt_content == CPT_CONTENT_DATA)
+			err = ctx->read(maddr, PAGE_SIZE, ctx);
+		else
+			err = -EINVAL;
+		if (!err)
+			set_page_dirty_lock(page);
+		kunmap(page);
+		/* Drop the get_user_pages() reference on error paths too */
+		page_cache_release(page);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Fill a page block by writing through the user mapping. An area that is
+ * not writable is made writable for the duration of the copy and gets its
+ * saved protection back afterwards, whether or not the copy succeeded.
+ */
+static int rst_restore_pages_user(struct cpt_vma_image *vmai,
+				  struct cpt_page_block *pb,
+				  unsigned long prot,
+				  struct cpt_context *ctx)
+{
+	unsigned long vma_len = vmai->cpt_end - vmai->cpt_start;
+	unsigned long len = pb->cpt_end - pb->cpt_start;
+	void __user *dst = cpt_ptr_import(pb->cpt_start);
+	int readonly = !(prot & PROT_WRITE);
+	int err;
+
+	if (readonly) {
+		err = sys_mprotect(vmai->cpt_start, vma_len,
+				   prot | PROT_WRITE);
+		if (err)
+			return err;
+	}
+
+	if (pb->cpt_content == CPT_CONTENT_VOID) {
+		err = clear_user(dst, len) ? -EFAULT : 0;
+		if (err)
+			eprintk("clear_user: %d\n", err);
+	} else if (pb->cpt_content == CPT_CONTENT_DATA) {
+		err = ctx->read(dst, len, ctx);
+	} else {
+		err = -EINVAL;
+	}
+
+	if (readonly) {
+		int ret = sys_mprotect(vmai->cpt_start, vma_len, prot);
+
+		if (!err)
+			err = ret;
+	}
+	return err;
+}
+
+/*
+ * Restore one VMA: read its image, reopen the backing file if there is
+ * one, map the area and refill the pages saved for it.
+ */
+static int rst_restore_one_vma(struct cpt_context *ctx)
+{
+	struct cpt_vma_image vmai;
+	struct file *file = NULL;
+	unsigned long prot;
+	int err;
+	int i;
+
+	err = rst_get_object(CPT_OBJ_VMA, &vmai, sizeof(vmai), ctx);
+	if (err)
+		return err;
+
+	prot = make_prot(&vmai);
+
+	if (vmai.cpt_vma_type == CPT_VMA_FILE) {
+		err = rst_open_vma_file(ctx, &file);
+		if (err)
+			return err;
+	}
+
+	err = rst_map_vma(&vmai, file, prot);
+	if (err)
+		goto out;
+
+	for (i = 0; i < vmai.cpt_page_num; i++) {
+		struct cpt_page_block pb;
+
+		err = rst_get_object(CPT_OBJ_PAGES, &pb, sizeof(pb), ctx);
+		if (err)
+			goto out;
+		err = rst_check_page_block(&vmai, &pb);
+		if (err)
+			goto out;
+
+		if (!(vmai.cpt_flags & VM_ACCOUNT) && !(prot & PROT_WRITE))
+			err = rst_restore_pages_gup(&pb, ctx);
+		else
+			err = rst_restore_pages_user(&vmai, &pb, prot, ctx);
+		if (err)
+			goto out;
+	}
+
+out:
+	/* Drop our reference; an established mapping holds its own */
+	if (file)
+		fput(file);
+	return err;
+}
+
+/* Free an LDT buffer with the allocator that matches its size */
+static void rst_free_ldt(void *ldt, unsigned long bytes)
+{
+	if (bytes > PAGE_SIZE)
+		vfree(ldt);
+	else
+		kfree(ldt);
+}
+
+/*
+ * Read the saved LDT from the image and install it in place of the current
+ * one.
+ */
+static int rst_restore_mm_context(struct cpt_context *ctx)
+{
+	struct cpt_obj_bits b;
+	struct mm_struct *mm = current->mm;
+	int oldsize = mm->context.size;
+	u64 size;
+	unsigned long bytes;
+	void *oldldt;
+	void *newldt = NULL;
+	int err;
+
+	err = rst_get_object(CPT_OBJ_BITS, &b, sizeof(b), ctx);
+	if (err)
+		return err;
+
+	/* Whole entries only, and no more than an LDT can hold */
+	size = b.cpt_size;
+	if ((size & (LDT_ENTRY_SIZE - 1)) ||
+	    size > LDT_ENTRIES * LDT_ENTRY_SIZE)
+		return -EINVAL;
+	bytes = size;
+
+	if (bytes) {
+		if (bytes > PAGE_SIZE)
+			newldt = vmalloc(bytes);
+		else
+			newldt = kmalloc(bytes, GFP_KERNEL);
+		if (!newldt)
+			return -ENOMEM;
+
+		err = ctx->read(newldt, bytes, ctx);
+		if (err) {
+			rst_free_ldt(newldt, bytes);
+			return err;
+		}
+	}
+
+	oldldt = mm->context.ldt;
+	mm->context.ldt = newldt;
+	mm->context.size = bytes / LDT_ENTRY_SIZE;
+
+	load_LDT(&mm->context);
+
+	if (oldsize)
+		rst_free_ldt(oldldt, oldsize * LDT_ENTRY_SIZE);
+
+	return 0;
+}
+
+/*
+ * Rebuild the address space of the current task from the image: drop all
+ * existing mappings, restore the mm layout fields, then every VMA and
+ * finally the mm context.
+ */
+int rst_restore_mm(struct cpt_context *ctx)
+{
+	int err;
+	int i;
+	struct mm_struct *mm = current->mm;
+	struct cpt_mm_image m;
+
+	err = rst_get_object(CPT_OBJ_MM, &m, sizeof(m), ctx);
+	if (err)
+		return err;
+
+	down_write(&mm->mmap_sem);
+	err = do_munmap(mm, 0, TASK_SIZE);
+	if (err) {
+		/* Do not restore on top of leftover mappings */
+		up_write(&mm->mmap_sem);
+		return err;
+	}
+
+	mm->start_code = m.cpt_start_code;
+	mm->end_code = m.cpt_end_code;
+	mm->start_data = m.cpt_start_data;
+	mm->end_data = m.cpt_end_data;
+	mm->start_brk = m.cpt_start_brk;
+	mm->brk = m.cpt_brk;
+	mm->start_stack = m.cpt_start_stack;
+	mm->arg_start = m.cpt_start_arg;
+	mm->arg_end = m.cpt_end_arg;
+	mm->env_start = m.cpt_start_env;
+	mm->env_end = m.cpt_end_env;
+	mm->def_flags = m.cpt_def_flags;
+	mm->flags = m.cpt_flags;
+
+	up_write(&mm->mmap_sem);
+
+	for (i = 0; i < m.cpt_map_count; i++) {
+		err = rst_restore_one_vma(ctx);
+		if (err < 0)
+			goto out;
+	}
+
+	err = rst_restore_mm_context(ctx);
+out:
+	return err;
+}
diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
index b9f745e..9e448b2 100644
--- a/checkpoint/rst_process.c
+++ b/checkpoint/rst_process.c
@@ -210,7 +210,8 @@ static int restart_thread(void *arg)
 	err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
 	if (!err)
 		err = rst_restore_task_struct(current, ti, ctx);
-	/* Restore mm here */
+	if (!err)
+		err = rst_restore_mm(ctx);
 	if (!err)
 		err = rst_restore_fpustate(current, ti, ctx);
 	if (!err)
diff --git a/mm/mmap.c b/mm/mmap.c
index 971d0ed..98d1ba9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1858,6 +1858,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 
 	return 0;
 }
+EXPORT_SYMBOL(split_vma);
 
 /* Munmap is split into 2 main parts -- this part which finds
  * what needs doing, and the areas themselves, which do the
diff --git
a/mm/mprotect.c b/mm/mprotect.c index fded06f..47c7d75 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -317,3 +318,4 @@ out: up_write(&current->mm->mmap_sem); return error; } +EXPORT_SYMBOL(sys_mprotect); -- 1.5.6