From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S964854AbVI0IGq (ORCPT ); Tue, 27 Sep 2005 04:06:46 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S964855AbVI0IGq (ORCPT ); Tue, 27 Sep 2005 04:06:46 -0400 Received: from anf141.internetdsl.tpnet.pl ([83.17.87.141]:13518 "EHLO anf141.internetdsl.tpnet.pl") by vger.kernel.org with ESMTP id S964854AbVI0IGp (ORCPT ); Tue, 27 Sep 2005 04:06:45 -0400 From: "Rafael J. Wysocki" To: Pavel Machek Subject: [PATCH][Fix] Fix Bug #4959 (take 2) Date: Tue, 27 Sep 2005 10:07:03 +0200 User-Agent: KMail/1.8.2 Cc: LKML , Andi Kleen , Andrew Morton References: <200509241936.12214.rjw@sisk.pl> In-Reply-To: <200509241936.12214.rjw@sisk.pl> MIME-Version: 1.0 Content-Type: text/plain; charset="iso-8859-2" Content-Transfer-Encoding: 7bit Content-Disposition: inline Message-Id: <200509271007.03865.rjw@sisk.pl> Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Hi, The following patch fixes Bug #4959. For this purpose it creates temporary page translation tables including the kernel mapping (reused) and the direct mapping (created from scratch) and makes swsusp switch to these tables right before the image is restored. The code used to generate the direct mapping comes from arch/x86_64/mm/init.c. Please consider for applying. Greetings, Rafael PS As you can see, I really would like this bug to go officially, as swsusp is hardly usable on x86-64 with it, and I know some people using swsusp on this architecture. I just don't want them to be hurt by this bug. NOTES: (1) I'm quite sure that to fix the problem we need to use temporary page translation tables that won't be modified in the process of copying the image. 
(2) These page translation tables have to be present in memory before the image is copied, so there are two possible ways in which they can be created: (a) in the startup kernel code that is executed before calling swsusp on resume, in which case they have to be marked with PG_nosave, (b) in swsusp, after the image has been loaded from disk (to set up the tables we need to know which pages will be overwritten while copying the image). However, (a) is tricky, because it will only work if the tables are always located at the same physical addresses, which I think would be quite difficult to achieve. Moreover, such code would have to be executed on every boot and the temporary page tables would always be present in memory. For this reason I decided to choose (b). (3) To create the temporary page translation tables in swsusp I need to allocate some memory pages and theoretically this operation could fail. Fortunately the probability of this is extremely small, because the number of pages needed is low (4 pages for a machine with 2GB of RAM) and we reserve 512 spare pages on suspend (in principle they are reserved for I/O, but they should be available at the time the temporary page tables are created). Thus I think the memory allocations here are completely safe. Signed-off-by: Rafael J. 
Wysocki Index: linux-2.6.14-rc2-git6/arch/x86_64/kernel/suspend.c =================================================================== --- linux-2.6.14-rc2-git6.orig/arch/x86_64/kernel/suspend.c 2005-09-26 21:05:13.000000000 +0200 +++ linux-2.6.14-rc2-git6/arch/x86_64/kernel/suspend.c 2005-09-27 07:41:04.000000000 +0200 @@ -11,6 +11,17 @@ #include #include #include +#include +#include +#include + +/* Defined in kernel/power/swsusp.c */ +extern unsigned long get_usable_page(unsigned gfp_mask); +extern void free_eaten_memory(void); +/* Defined in arch/x86_64/kernel/suspend_asm.S */ +int restore_image(void); + +pgd_t *temp_level4_pgt; struct saved_context saved_context; @@ -140,4 +151,128 @@ } +static void **pages; + +static inline void *__add_page(void) +{ + void **c; + + c = (void **)get_usable_page(GFP_ATOMIC); + if (c) { + *c = pages; + pages = c; + } + return c; +} + +static inline void *__next_page(void) +{ + void **c; + + c = pages; + if (c) { + pages = *c; + *c = NULL; + } + return c; +} + +/* + * Try to allocate as many usable pages as needed and daisy chain them. 
+ * If one allocation fails, free the pages allocated so far + */ +static int alloc_usable_pages(unsigned long n) +{ + void *p; + + pages = NULL; + do + if (!__add_page()) + break; + while (--n); + if (n) { + p = __next_page(); + while (p) { + free_page((unsigned long)p); + p = __next_page(); + } + return -ENOMEM; + } + return 0; +} +static void phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) { + for (; i < PTRS_PER_PUD; i++, pud++) + set_pud(pud, __pud(0)); + break; + } + + pmd = (pmd_t *)__next_page(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) { + for (; j < PTRS_PER_PMD; j++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } +} + +static void set_up_temporary_mappings(void) +{ + unsigned long start, end, next; + + temp_level4_pgt = (pgd_t *)__next_page(); + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + init_level4_pgt[pgd_index(__START_KERNEL_map)]); + + /* Set up the direct mapping from scratch */ + start = (unsigned long)pfn_to_kaddr(0); + end = (unsigned long)pfn_to_kaddr(end_pfn); + + for (; start < end; start = next) { + pud_t *pud = (pud_t *)__next_page(); + next = start + PGDIR_SIZE; + if (next > end) + next = end; + phys_pud_init(pud, __pa(start), __pa(next)); + set_pgd(temp_level4_pgt + pgd_index(start), + mk_kernel_pgd(__pa(pud))); + } +} + +int swsusp_arch_resume(void) +{ + unsigned long n; + + n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT; + n += n / PTRS_PER_PUD + !!(n % PTRS_PER_PUD) + 1; + 
pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n); + if (alloc_usable_pages(n)) { + free_eaten_memory(); + return -ENOMEM; + } + /* We have enough memory and from now on we cannot recover */ + set_up_temporary_mappings(); + restore_image(); + return 0; +} Index: linux-2.6.14-rc2-git6/kernel/power/swsusp.c =================================================================== --- linux-2.6.14-rc2-git6.orig/kernel/power/swsusp.c 2005-09-26 21:05:13.000000000 +0200 +++ linux-2.6.14-rc2-git6/kernel/power/swsusp.c 2005-09-26 22:13:21.000000000 +0200 @@ -1095,7 +1095,7 @@ *eaten_memory = c; } -static unsigned long get_usable_page(unsigned gfp_mask) +unsigned long get_usable_page(unsigned gfp_mask) { unsigned long m; @@ -1109,7 +1109,7 @@ return m; } -static void free_eaten_memory(void) +void free_eaten_memory(void) { unsigned long m; void **c; @@ -1481,11 +1481,12 @@ /* Allocate memory for the image and read the data from swap */ error = check_pagedir(pagedir_nosave); - free_eaten_memory(); + if (!error) error = data_read(pagedir_nosave); if (error) { /* We fail cleanly */ + free_eaten_memory(); for_each_pbe (p, pagedir_nosave) if (p->address) { free_page(p->address); Index: linux-2.6.14-rc2-git6/arch/x86_64/kernel/suspend_asm.S =================================================================== --- linux-2.6.14-rc2-git6.orig/arch/x86_64/kernel/suspend_asm.S 2005-09-26 21:05:13.000000000 +0200 +++ linux-2.6.14-rc2-git6/arch/x86_64/kernel/suspend_asm.S 2005-09-27 00:33:35.000000000 +0200 @@ -39,12 +39,13 @@ call swsusp_save ret -ENTRY(swsusp_arch_resume) - /* set up cr3 */ - leaq init_level4_pgt(%rip),%rax - subq $__START_KERNEL_map,%rax - movq %rax,%cr3 - +ENTRY(restore_image) + /* switch to temporary page tables */ + movq $__PAGE_OFFSET, %rdx + movq temp_level4_pgt(%rip), %rax + subq %rdx, %rax + movq %rax, %cr3 + /* Flush TLB */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx andq $~(1<<7), %rdx # PGE @@ -69,6 +70,10 @@ movq pbe_next(%rdx), %rdx jmp loop done: + 
/* go back to the original page tables */ + leaq init_level4_pgt(%rip), %rax + subq $__START_KERNEL_map, %rax + movq %rax, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx