linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 00/11] xen: Initial kexec/kdump implementation
@ 2012-09-27 18:06 Daniel Kiper
  2012-09-27 18:06 ` [PATCH 01/11] kexec: introduce kexec_ops struct Daniel Kiper
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel

Hi,

This set of patches contains initial kexec/kdump implementation for Xen.
Currently only dom0 is supported, however, almost all infrustructure
required for domU support is ready.

Daniel

 arch/x86/include/asm/kexec.h         |   10 +-
 arch/x86/include/asm/xen/hypercall.h |    6 +
 arch/x86/include/asm/xen/kexec.h     |   83 +++++++++
 arch/x86/kernel/machine_kexec_64.c   |   12 +-
 arch/x86/kernel/vmlinux.lds.S        |    7 +-
 arch/x86/xen/Makefile                |    3 +
 arch/x86/xen/enlighten.c             |   12 ++
 arch/x86/xen/kexec.c                 |  150 ++++++++++++++++
 arch/x86/xen/machine_kexec_32.c      |  245 ++++++++++++++++++++++++++
 arch/x86/xen/machine_kexec_64.c      |  301 +++++++++++++++++++++++++++++++
 arch/x86/xen/relocate_kernel_32.S    |  323 ++++++++++++++++++++++++++++++++++
 arch/x86/xen/relocate_kernel_64.S    |  309 ++++++++++++++++++++++++++++++++
 drivers/xen/sys-hypervisor.c         |   42 +++++-
 include/linux/kexec.h                |   18 ++
 include/xen/interface/xen.h          |   33 ++++
 kernel/kexec.c                       |  125 ++++++++++----
 16 files changed, 1636 insertions(+), 43 deletions(-)

Daniel Kiper (11):
      kexec: introduce kexec_ops struct
      x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
      xen: Introduce architecture independent data for kexec/kdump
      x86/xen: Introduce architecture dependent data for kexec/kdump
      x86/xen: Register resources required by kexec-tools
      x86/xen: Add i386 kexec/kdump implementation
      x86/xen: Add x86_64 kexec/kdump implementation
      x86/xen: Add kexec/kdump makefile rules
      x86/xen/enlighten: Add init and crash kexec/kdump hooks
      drivers/xen: Export vmcoreinfo through sysfs
      x86: Add Xen kexec control code size check to linker script

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 01/11] kexec: introduce kexec_ops struct
  2012-09-27 18:06 [PATCH 00/11] xen: Initial kexec/kdump implementation Daniel Kiper
@ 2012-09-27 18:06 ` Daniel Kiper
  2012-09-27 18:06   ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Daniel Kiper
                     ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
not use default functions or require some changes in behavior of kexec/kdump
generic code. To cope with that problem kexec_ops struct was introduced.
It allows a developer to replace all or some functions and control some
functionality of kexec/kdump generic code.

Default behavior of kexec/kdump generic code is not changed.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 include/linux/kexec.h |   18 +++++++
 kernel/kexec.c        |  125 ++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 111 insertions(+), 32 deletions(-)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 37c5f72..beb08ca 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -165,7 +165,25 @@ struct kimage {
 #endif
 };
 
+struct kexec_ops {
+	bool always_use_normal_alloc;
+	struct page *(*kimage_alloc_pages)(gfp_t gfp_mask,
+						unsigned int order,
+						unsigned long limit);
+	void (*kimage_free_pages)(struct page *page);
+	unsigned long (*page_to_pfn)(struct page *page);
+	struct page *(*pfn_to_page)(unsigned long pfn);
+	unsigned long (*virt_to_phys)(volatile void *address);
+	void *(*phys_to_virt)(unsigned long address);
+	int (*machine_kexec_prepare)(struct kimage *image);
+	int (*machine_kexec_load)(struct kimage *image);
+	void (*machine_kexec_cleanup)(struct kimage *image);
+	void (*machine_kexec_unload)(struct kimage *image);
+	void (*machine_kexec_shutdown)(void);
+	void (*machine_kexec)(struct kimage *image);
+};
 
+extern struct kexec_ops kexec_ops;
 
 /* kexec interface functions */
 extern void machine_kexec(struct kimage *image);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0668d58..98556f3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -56,6 +56,47 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+static struct page *kimage_alloc_pages(gfp_t gfp_mask,
+					unsigned int order,
+					unsigned long limit);
+static void kimage_free_pages(struct page *page);
+
+static unsigned long generic_page_to_pfn(struct page *page)
+{
+	return page_to_pfn(page);
+}
+
+static struct page *generic_pfn_to_page(unsigned long pfn)
+{
+	return pfn_to_page(pfn);
+}
+
+static unsigned long generic_virt_to_phys(volatile void *address)
+{
+	return virt_to_phys(address);
+}
+
+static void *generic_phys_to_virt(unsigned long address)
+{
+	return phys_to_virt(address);
+}
+
+struct kexec_ops kexec_ops = {
+	.always_use_normal_alloc = false,
+	.kimage_alloc_pages = kimage_alloc_pages,
+	.kimage_free_pages = kimage_free_pages,
+	.page_to_pfn = generic_page_to_pfn,
+	.pfn_to_page = generic_pfn_to_page,
+	.virt_to_phys = generic_virt_to_phys,
+	.phys_to_virt = generic_phys_to_virt,
+	.machine_kexec_prepare = machine_kexec_prepare,
+	.machine_kexec_load = NULL,
+	.machine_kexec_cleanup = machine_kexec_cleanup,
+	.machine_kexec_unload = NULL,
+	.machine_kexec_shutdown = machine_shutdown,
+	.machine_kexec = machine_kexec
+};
+
 int kexec_should_crash(struct task_struct *p)
 {
 	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
@@ -355,7 +396,9 @@ static int kimage_is_destination_range(struct kimage *image,
 	return 0;
 }
 
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask,
+					unsigned int order,
+					unsigned long limit)
 {
 	struct page *pages;
 
@@ -392,7 +435,7 @@ static void kimage_free_page_list(struct list_head *list)
 
 		page = list_entry(pos, struct page, lru);
 		list_del(&page->lru);
-		kimage_free_pages(page);
+		(*kexec_ops.kimage_free_pages)(page);
 	}
 }
 
@@ -425,10 +468,11 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 	do {
 		unsigned long pfn, epfn, addr, eaddr;
 
-		pages = kimage_alloc_pages(GFP_KERNEL, order);
+		pages = (*kexec_ops.kimage_alloc_pages)(GFP_KERNEL, order,
+							KEXEC_CONTROL_MEMORY_LIMIT);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = (*kexec_ops.page_to_pfn)(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -515,7 +559,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 		}
 		/* If I don't overlap any segments I have found my hole! */
 		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			pages = (*kexec_ops.pfn_to_page)(hole_start >> PAGE_SHIFT);
 			break;
 		}
 	}
@@ -532,12 +576,13 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 	struct page *pages = NULL;
 
 	switch (image->type) {
+	case KEXEC_TYPE_CRASH:
+		if (!kexec_ops.always_use_normal_alloc) {
+			pages = kimage_alloc_crash_control_pages(image, order);
+			break;
+		}
 	case KEXEC_TYPE_DEFAULT:
 		pages = kimage_alloc_normal_control_pages(image, order);
-		break;
-	case KEXEC_TYPE_CRASH:
-		pages = kimage_alloc_crash_control_pages(image, order);
-		break;
 	}
 
 	return pages;
@@ -557,7 +602,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = (*kexec_ops.virt_to_phys)(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -616,14 +661,14 @@ static void kimage_terminate(struct kimage *image)
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION)? \
-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+			(*kexec_ops.phys_to_virt)((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
-	kimage_free_pages(page);
+	page = (*kexec_ops.pfn_to_page)(entry >> PAGE_SHIFT);
+	(*kexec_ops.kimage_free_pages)(page);
 }
 
 static void kimage_free(struct kimage *image)
@@ -653,7 +698,7 @@ static void kimage_free(struct kimage *image)
 		kimage_free_entry(ind);
 
 	/* Handle any machine specific cleanup */
-	machine_kexec_cleanup(image);
+	(*kexec_ops.machine_kexec_cleanup)(image);
 
 	/* Free the kexec control pages... */
 	kimage_free_page_list(&image->control_pages);
@@ -709,7 +754,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = (*kexec_ops.page_to_pfn)(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -720,16 +765,17 @@ static struct page *kimage_alloc_page(struct kimage *image,
 		kimage_entry_t *old;
 
 		/* Allocate a page, if we run out of memory give up */
-		page = kimage_alloc_pages(gfp_mask, 0);
+		page = (*kexec_ops.kimage_alloc_pages)(gfp_mask, 0,
+							KEXEC_SOURCE_MEMORY_LIMIT);
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if ((*kexec_ops.page_to_pfn)(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = (*kexec_ops.page_to_pfn)(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -752,7 +798,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = (*kexec_ops.pfn_to_page)(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -762,7 +808,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			 */
 			if (!(gfp_mask & __GFP_HIGHMEM) &&
 			    PageHighMem(old_page)) {
-				kimage_free_pages(old_page);
+				(*kexec_ops.kimage_free_pages)(old_page);
 				continue;
 			}
 			addr = old_addr;
@@ -808,7 +854,7 @@ static int kimage_load_normal_segment(struct kimage *image,
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, (*kexec_ops.page_to_pfn)(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -862,7 +908,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = (*kexec_ops.pfn_to_page)(maddr >> PAGE_SHIFT);
 		if (!page) {
 			result  = -ENOMEM;
 			goto out;
@@ -901,12 +947,13 @@ static int kimage_load_segment(struct kimage *image,
 	int result = -ENOMEM;
 
 	switch (image->type) {
+	case KEXEC_TYPE_CRASH:
+		if (!kexec_ops.always_use_normal_alloc) {
+			result = kimage_load_crash_segment(image, segment);
+			break;
+		}
 	case KEXEC_TYPE_DEFAULT:
 		result = kimage_load_normal_segment(image, segment);
-		break;
-	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
-		break;
 	}
 
 	return result;
@@ -994,6 +1041,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 			/* Free any current crash dump kernel before
 			 * we corrupt it.
 			 */
+			if (kexec_ops.machine_kexec_unload)
+				(*kexec_ops.machine_kexec_unload)(image);
 			kimage_free(xchg(&kexec_crash_image, NULL));
 			result = kimage_crash_alloc(&image, entry,
 						     nr_segments, segments);
@@ -1004,7 +1053,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 
 		if (flags & KEXEC_PRESERVE_CONTEXT)
 			image->preserve_context = 1;
-		result = machine_kexec_prepare(image);
+		result = (*kexec_ops.machine_kexec_prepare)(image);
 		if (result)
 			goto out;
 
@@ -1017,11 +1066,23 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		if (flags & KEXEC_ON_CRASH)
 			crash_unmap_reserved_pages();
 	}
+
+	if (kexec_ops.machine_kexec_load) {
+		result = (*kexec_ops.machine_kexec_load)(image);
+
+		if (result)
+			goto out;
+	}
+
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
 
 out:
 	mutex_unlock(&kexec_mutex);
+
+	if (kexec_ops.machine_kexec_unload)
+		(*kexec_ops.machine_kexec_unload)(image);
+
 	kimage_free(image);
 
 	return result;
@@ -1095,7 +1156,7 @@ void crash_kexec(struct pt_regs *regs)
 			crash_setup_regs(&fixed_regs, regs);
 			crash_save_vmcoreinfo();
 			machine_crash_shutdown(&fixed_regs);
-			machine_kexec(kexec_crash_image);
+			(*kexec_ops.machine_kexec)(kexec_crash_image);
 		}
 		mutex_unlock(&kexec_mutex);
 	}
@@ -1117,8 +1178,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
-		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+		ClearPageReserved((*kexec_ops.pfn_to_page)(addr >> PAGE_SHIFT));
+		init_page_count((*kexec_ops.pfn_to_page)(addr >> PAGE_SHIFT));
 		free_page((unsigned long)__va(addr));
 		totalram_pages++;
 	}
@@ -1572,10 +1633,10 @@ int kernel_kexec(void)
 	{
 		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
-		machine_shutdown();
+		(*kexec_ops.machine_kexec_shutdown)();
 	}
 
-	machine_kexec(kexec_image);
+	(*kexec_ops.machine_kexec)(kexec_image);
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
  2012-09-27 18:06 ` [PATCH 01/11] kexec: introduce kexec_ops struct Daniel Kiper
@ 2012-09-27 18:06   ` Daniel Kiper
  2012-09-27 18:06     ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Daniel Kiper
  2012-09-28  7:56     ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Jan Beulich
  2012-09-28  7:49   ` [PATCH 01/11] kexec: introduce kexec_ops struct Jan Beulich
  2012-09-28 16:07   ` Konrad Rzeszutek Wilk
  2 siblings, 2 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Some implementations (e.g. Xen PVOPS) could not use part of identity page table
to construct transition page table. It means that they require separate PUDs,
PMDs and PTEs for virtual and physical (identity) mapping. To satisfy that
requirement add extra pointer to PGD, PUD, PMD and PTE and align existing code.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/include/asm/kexec.h       |   10 +++++++---
 arch/x86/kernel/machine_kexec_64.c |   12 ++++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff17..3cf5600 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -157,9 +157,13 @@ struct kimage_arch {
 };
 #else
 struct kimage_arch {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	pgd_t *pgd;
+	pud_t *pud0;
+	pud_t *pud1;
+	pmd_t *pmd0;
+	pmd_t *pmd1;
+	pte_t *pte0;
+	pte_t *pte1;
 };
 #endif
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db..976e54b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -137,9 +137,9 @@ out:
 
 static void free_transition_pgtable(struct kimage *image)
 {
-	free_page((unsigned long)image->arch.pud);
-	free_page((unsigned long)image->arch.pmd);
-	free_page((unsigned long)image->arch.pte);
+	free_page((unsigned long)image->arch.pud0);
+	free_page((unsigned long)image->arch.pmd0);
+	free_page((unsigned long)image->arch.pte0);
 }
 
 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
@@ -157,7 +157,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pud)
 			goto err;
-		image->arch.pud = pud;
+		image->arch.pud0 = pud;
 		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
 	}
 	pud = pud_offset(pgd, vaddr);
@@ -165,7 +165,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pmd)
 			goto err;
-		image->arch.pmd = pmd;
+		image->arch.pmd0 = pmd;
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 	}
 	pmd = pmd_offset(pud, vaddr);
@@ -173,7 +173,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pte)
 			goto err;
-		image->arch.pte = pte;
+		image->arch.pte0 = pte;
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump
  2012-09-27 18:06   ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Daniel Kiper
@ 2012-09-27 18:06     ` Daniel Kiper
  2012-09-27 18:06       ` [PATCH 04/11] x86/xen: Introduce architecture dependent " Daniel Kiper
  2012-09-28 16:10       ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Konrad Rzeszutek Wilk
  2012-09-28  7:56     ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Jan Beulich
  1 sibling, 2 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Introduce architecture independent constants and structures
required by Xen kexec/kdump implementation.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 include/xen/interface/xen.h |   33 +++++++++++++++++++++++++++++++++
 1 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 0801468..ac19f9e 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -58,6 +58,7 @@
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
 #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
 
 /* Architecture-specific hypercall definitions. */
@@ -232,7 +233,39 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
 #define VMASST_TYPE_pae_extended_cr3     3
 #define MAX_VMASST_TYPE 3
 
+/*
+ * Commands to HYPERVISOR_kexec_op().
+ */
+#define KEXEC_CMD_kexec			0
+#define KEXEC_CMD_kexec_load		1
+#define KEXEC_CMD_kexec_unload		2
+#define KEXEC_CMD_kexec_get_range	3
+
+/*
+ * Memory ranges for kdump (utilized by HYPERVISOR_kexec_op()).
+ */
+#define KEXEC_RANGE_MA_CRASH		0
+#define KEXEC_RANGE_MA_XEN		1
+#define KEXEC_RANGE_MA_CPU		2
+#define KEXEC_RANGE_MA_XENHEAP		3
+#define KEXEC_RANGE_MA_BOOT_PARAM	4
+#define KEXEC_RANGE_MA_EFI_MEMMAP	5
+#define KEXEC_RANGE_MA_VMCOREINFO	6
+
 #ifndef __ASSEMBLY__
+struct xen_kexec_exec {
+	int type;
+};
+
+struct xen_kexec_range {
+	int range;
+	int nr;
+	unsigned long size;
+	unsigned long start;
+};
+
+extern unsigned long xen_vmcoreinfo_maddr;
+extern unsigned long xen_vmcoreinfo_max_size;
 
 typedef uint16_t domid_t;
 
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 04/11] x86/xen: Introduce architecture dependent data for kexec/kdump
  2012-09-27 18:06     ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Daniel Kiper
@ 2012-09-27 18:06       ` Daniel Kiper
  2012-09-27 18:06         ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Daniel Kiper
  2012-09-28 16:10       ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Introduce architecture dependent constants, structures and
functions required by Xen kexec/kdump implementation.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/include/asm/xen/hypercall.h |    6 +++
 arch/x86/include/asm/xen/kexec.h     |   83 ++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/kexec.h

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 59c226d..553544c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -466,6 +466,12 @@ HYPERVISOR_hvm_op(int op, void *arg)
 }
 
 static inline int
+HYPERVISOR_kexec_op(unsigned long op, void *args)
+{
+	return _hypercall2(int, kexec_op, op, args);
+}
+
+static inline int
 HYPERVISOR_tmem_op(
 	struct tmem_op *op)
 {
diff --git a/arch/x86/include/asm/xen/kexec.h b/arch/x86/include/asm/xen/kexec.h
new file mode 100644
index 0000000..3349031
--- /dev/null
+++ b/arch/x86/include/asm/xen/kexec.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ASM_X86_XEN_KEXEC_H
+#define _ASM_X86_XEN_KEXEC_H
+
+#include <linux/init.h>
+
+#define KEXEC_XEN_NO_PAGES	17
+
+#define XK_MA_CONTROL_PAGE	0
+#define XK_VA_CONTROL_PAGE	1
+#define XK_MA_PGD_PAGE		2
+#define XK_VA_PGD_PAGE		3
+#define XK_MA_PUD0_PAGE		4
+#define XK_VA_PUD0_PAGE		5
+#define XK_MA_PUD1_PAGE		6
+#define XK_VA_PUD1_PAGE		7
+#define XK_MA_PMD0_PAGE		8
+#define XK_VA_PMD0_PAGE		9
+#define XK_MA_PMD1_PAGE		10
+#define XK_VA_PMD1_PAGE		11
+#define XK_MA_PTE0_PAGE		12
+#define XK_VA_PTE0_PAGE		13
+#define XK_MA_PTE1_PAGE		14
+#define XK_VA_PTE1_PAGE		15
+#define XK_MA_TABLE_PAGE	16
+
+#ifndef __ASSEMBLY__
+struct xen_kexec_image {
+	unsigned long page_list[KEXEC_XEN_NO_PAGES];
+	unsigned long indirection_page;
+	unsigned long start_address;
+};
+
+struct xen_kexec_load {
+	int type;
+	struct xen_kexec_image image;
+};
+
+extern unsigned int xen_kexec_control_code_size;
+
+extern void __init xen_init_kexec_ops(void);
+
+#ifdef CONFIG_X86_32
+extern void xen_relocate_kernel(unsigned long indirection_page,
+				unsigned long *page_list,
+				unsigned long start_address,
+				unsigned int has_pae,
+				unsigned int preserve_context);
+#else
+extern void xen_relocate_kernel(unsigned long indirection_page,
+				unsigned long *page_list,
+				unsigned long start_address,
+				unsigned int preserve_context);
+#endif
+#endif
+#endif /* _ASM_X86_XEN_KEXEC_H */
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 05/11] x86/xen: Register resources required by kexec-tools
  2012-09-27 18:06       ` [PATCH 04/11] x86/xen: Introduce architecture dependent " Daniel Kiper
@ 2012-09-27 18:06         ` Daniel Kiper
  2012-09-27 18:06           ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Daniel Kiper
  2012-09-28 16:21           ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Register resources required by kexec-tools.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/xen/kexec.c |  150 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 150 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/xen/kexec.c

diff --git a/arch/x86/xen/kexec.c b/arch/x86/xen/kexec.c
new file mode 100644
index 0000000..eb0108b
--- /dev/null
+++ b/arch/x86/xen/kexec.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <xen/interface/platform.h>
+#include <xen/interface/xen.h>
+#include <xen/xen.h>
+
+#include <asm/xen/hypercall.h>
+
+unsigned long xen_vmcoreinfo_maddr = 0;
+unsigned long xen_vmcoreinfo_max_size = 0;
+
+static int __init xen_init_kexec_resources(void)
+{
+	int rc;
+	static struct resource xen_hypervisor_res = {
+		.name = "Hypervisor code and data",
+		.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+	};
+	struct resource *cpu_res;
+	struct xen_kexec_range xkr;
+	struct xen_platform_op cpuinfo_op;
+	uint32_t cpus, i;
+
+	if (!xen_initial_domain())
+		return 0;
+
+	if (strstr(boot_command_line, "crashkernel="))
+		pr_info("kexec: Ignoring crashkernel option. "
+			"It should be passed to Xen hypervisor.\n");
+
+	/* Register Crash kernel resource. */
+	xkr.range = KEXEC_RANGE_MA_CRASH;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
+
+	if (rc) {
+		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_CRASH)"
+			": %i\n", __func__, rc);
+		return rc;
+	}
+
+	if (!xkr.size)
+		return 0;
+
+	crashk_res.start = xkr.start;
+	crashk_res.end = xkr.start + xkr.size - 1;
+	insert_resource(&iomem_resource, &crashk_res);
+
+	/* Register Hypervisor code and data resource. */
+	xkr.range = KEXEC_RANGE_MA_XEN;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
+
+	if (rc) {
+		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_XEN)"
+			": %i\n", __func__, rc);
+		return rc;
+	}
+
+	xen_hypervisor_res.start = xkr.start;
+	xen_hypervisor_res.end = xkr.start + xkr.size - 1;
+	insert_resource(&iomem_resource, &xen_hypervisor_res);
+
+	/* Determine maximum number of physical CPUs. */
+	cpuinfo_op.cmd = XENPF_get_cpuinfo;
+	cpuinfo_op.u.pcpu_info.xen_cpuid = 0;
+	rc = HYPERVISOR_dom0_op(&cpuinfo_op);
+
+	if (rc) {
+		pr_info("kexec: %s: HYPERVISOR_dom0_op(): %i\n", __func__, rc);
+		return rc;
+	}
+
+	cpus = cpuinfo_op.u.pcpu_info.max_present + 1;
+
+	/* Register CPUs Crash note resources. */
+	cpu_res = kcalloc(cpus, sizeof(struct resource), GFP_KERNEL);
+
+	if (!cpu_res) {
+		pr_info("kexec: %s: kcalloc(): %i\n", __func__, -ENOMEM);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < cpus; ++i) {
+		xkr.range = KEXEC_RANGE_MA_CPU;
+		xkr.nr = i;
+		rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
+
+		if (rc) {
+			pr_info("kexec: %s: cpu: %u: HYPERVISOR_kexec_op"
+				"(KEXEC_RANGE_MA_XEN): %i\n", __func__, i, rc);
+			continue;
+		}
+
+		cpu_res->name = "Crash note";
+		cpu_res->start = xkr.start;
+		cpu_res->end = xkr.start + xkr.size - 1;
+		cpu_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+		insert_resource(&iomem_resource, cpu_res++);
+	}
+
+	/* Get vmcoreinfo address and maximum allowed size. */
+	xkr.range = KEXEC_RANGE_MA_VMCOREINFO;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
+
+	if (rc) {
+		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_VMCOREINFO)"
+			": %i\n", __func__, rc);
+		return rc;
+	}
+
+	xen_vmcoreinfo_maddr = xkr.start;
+	xen_vmcoreinfo_max_size = xkr.size;
+
+	return 0;
+}
+
+core_initcall(xen_init_kexec_resources);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-09-27 18:06         ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Daniel Kiper
@ 2012-09-27 18:06           ` Daniel Kiper
  2012-09-27 18:06             ` [PATCH 07/11] x86/xen: Add x86_64 " Daniel Kiper
                               ` (2 more replies)
  2012-09-28 16:21           ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Konrad Rzeszutek Wilk
  1 sibling, 3 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Add i386 kexec/kdump implementation.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/xen/machine_kexec_32.c   |  245 ++++++++++++++++++++++++++++
 arch/x86/xen/relocate_kernel_32.S |  323 +++++++++++++++++++++++++++++++++++++
 2 files changed, 568 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/xen/machine_kexec_32.c
 create mode 100644 arch/x86/xen/relocate_kernel_32.S

diff --git a/arch/x86/xen/machine_kexec_32.c b/arch/x86/xen/machine_kexec_32.c
new file mode 100644
index 0000000..6b5141e
--- /dev/null
+++ b/arch/x86/xen/machine_kexec_32.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/kexec.h>
+#include <asm/xen/page.h>
+
+#define __ma(vaddr)	(virt_to_machine(vaddr).maddr)
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask,
+					unsigned int order,
+					unsigned long limit)
+{
+	struct page *pages;
+	unsigned int address_bits, i;
+
+	pages = alloc_pages(gfp_mask, order);
+
+	if (!pages)
+		return NULL;
+
+	address_bits = (limit == ULONG_MAX) ? BITS_PER_LONG : ilog2(limit);
+
+	/* Relocate set of pages below given limit. */
+	if (xen_create_contiguous_region((unsigned long)page_address(pages),
+							order, address_bits)) {
+		__free_pages(pages, order);
+		return NULL;
+	}
+
+	pages->mapping = NULL;
+	set_page_private(pages, order);
+
+	for (i = 0; i < (1 << order); ++i)
+		SetPageReserved(pages + i);
+
+	return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+	unsigned int i, order;
+
+	order = page_private(page);
+
+	for (i = 0; i < (1 << order); ++i)
+		ClearPageReserved(page + i);
+
+	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
+	__free_pages(page, order);
+}
+
+static unsigned long xen_page_to_mfn(struct page *page)
+{
+	return pfn_to_mfn(page_to_pfn(page));
+}
+
+static struct page *xen_mfn_to_page(unsigned long mfn)
+{
+	return pfn_to_page(mfn_to_pfn(mfn));
+}
+
+static unsigned long xen_virt_to_machine(volatile void *address)
+{
+	return virt_to_machine(address).maddr;
+}
+
+static void *xen_machine_to_virt(unsigned long address)
+{
+	return phys_to_virt(machine_to_phys(XMADDR(address)).paddr);
+}
+
+static void free_transition_pgtable(struct kimage *image)
+{
+	free_page((unsigned long)image->arch.pgd);
+	free_page((unsigned long)image->arch.pmd0);
+	free_page((unsigned long)image->arch.pmd1);
+	free_page((unsigned long)image->arch.pte0);
+	free_page((unsigned long)image->arch.pte1);
+}
+
+static int alloc_transition_pgtable(struct kimage *image)
+{
+	image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pgd)
+		goto err;
+
+	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pmd0)
+		goto err;
+
+	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pmd1)
+		goto err;
+
+	image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pte0)
+		goto err;
+
+	image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pte1)
+		goto err;
+
+	return 0;
+
+err:
+	free_transition_pgtable(image);
+
+	return -ENOMEM;
+}
+
+static int machine_xen_kexec_prepare(struct kimage *image)
+{
+#ifdef CONFIG_KEXEC_JUMP
+	if (image->preserve_context) {
+		pr_info_once("kexec: Context preservation is not "
+				"supported in Xen domains.\n");
+		return -ENOSYS;
+	}
+#endif
+
+	return alloc_transition_pgtable(image);
+}
+
+static int machine_xen_kexec_load(struct kimage *image)
+{
+	void *control_page;
+	struct xen_kexec_load xkl = {};
+
+	if (!image)
+		return 0;
+
+	control_page = page_address(image->control_code_page);
+	memcpy(control_page, xen_relocate_kernel, xen_kexec_control_code_size);
+
+	xkl.type = image->type;
+	xkl.image.page_list[XK_MA_CONTROL_PAGE] = __ma(control_page);
+	xkl.image.page_list[XK_MA_TABLE_PAGE] = 0; /* Unused. */
+	xkl.image.page_list[XK_MA_PGD_PAGE] = __ma(image->arch.pgd);
+	xkl.image.page_list[XK_MA_PUD0_PAGE] = 0; /* Unused. */
+	xkl.image.page_list[XK_MA_PUD1_PAGE] = 0; /* Unused. */
+	xkl.image.page_list[XK_MA_PMD0_PAGE] = __ma(image->arch.pmd0);
+	xkl.image.page_list[XK_MA_PMD1_PAGE] = __ma(image->arch.pmd1);
+	xkl.image.page_list[XK_MA_PTE0_PAGE] = __ma(image->arch.pte0);
+	xkl.image.page_list[XK_MA_PTE1_PAGE] = __ma(image->arch.pte1);
+	xkl.image.indirection_page = image->head;
+	xkl.image.start_address = image->start;
+
+	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+static void machine_xen_kexec_cleanup(struct kimage *image)
+{
+	free_transition_pgtable(image);
+}
+
+static void machine_xen_kexec_unload(struct kimage *image)
+{
+	int rc;
+	struct xen_kexec_load xkl = {};
+
+	if (!image)
+		return;
+
+	xkl.type = image->type;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
+
+	WARN(rc, "kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
+}
+
+static void machine_xen_kexec_shutdown(void)
+{
+}
+
+static void machine_xen_kexec(struct kimage *image)
+{
+	int rc;
+	struct xen_kexec_exec xke = {};
+
+	xke.type = image->type;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+
+	pr_emerg("kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
+	BUG();
+}
+
+void __init xen_init_kexec_ops(void)
+{
+	if (!xen_initial_domain())
+		return;
+
+	kexec_ops.always_use_normal_alloc = true;
+	kexec_ops.kimage_alloc_pages = kimage_alloc_pages;
+	kexec_ops.kimage_free_pages = kimage_free_pages;
+	kexec_ops.page_to_pfn = xen_page_to_mfn;
+	kexec_ops.pfn_to_page = xen_mfn_to_page;
+	kexec_ops.virt_to_phys = xen_virt_to_machine;
+	kexec_ops.phys_to_virt = xen_machine_to_virt;
+	kexec_ops.machine_kexec_prepare = machine_xen_kexec_prepare;
+	kexec_ops.machine_kexec_load = machine_xen_kexec_load;
+	kexec_ops.machine_kexec_cleanup = machine_xen_kexec_cleanup;
+	kexec_ops.machine_kexec_unload = machine_xen_kexec_unload;
+	kexec_ops.machine_kexec_shutdown = machine_xen_kexec_shutdown;
+	kexec_ops.machine_kexec = machine_xen_kexec;
+}
diff --git a/arch/x86/xen/relocate_kernel_32.S b/arch/x86/xen/relocate_kernel_32.S
new file mode 100644
index 0000000..0e81830
--- /dev/null
+++ b/arch/x86/xen/relocate_kernel_32.S
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either veesion 2 of the License, or
+ * (at your option) any later veesion.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/cache.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+
+#include <asm/xen/kexec.h>
+
+#define ARG_INDIRECTION_PAGE	0x4
+#define ARG_PAGE_LIST		0x8
+#define ARG_START_ADDRESS	0xc
+
+#define PTR(x)	(x << 2)
+
+	.text
+	.align	PAGE_SIZE
+	.globl	xen_kexec_control_code_size, xen_relocate_kernel
+
+xen_relocate_kernel:
+	/*
+	 * Must be relocatable PIC code callable as a C function.
+	 *
+	 * This function is called by Xen but here hypervisor is dead.
+	 * We are playing on bare metal.
+	 *
+	 * Every machine address passed to this function through
+	 * page_list (e.g. XK_MA_CONTROL_PAGE) is established
+	 * by dom0 during kexec load phase.
+	 *
+	 * Every virtual address passed to this function through page_list
+	 * (e.g. XK_VA_CONTROL_PAGE) is established by hypervisor during
+	 * HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load) hypercall.
+	 *
+	 * 0x4(%esp) - indirection_page,
+	 * 0x8(%esp) - page_list,
+	 * 0xc(%esp) - start_address,
+	 * 0x10(%esp) - cpu_has_pae (ignored),
+	 * 0x14(%esp) - preserve_context (ignored).
+	 */
+
+	/* Zero out flags, and disable interrupts. */
+	pushl	$0
+	popfl
+
+	/* Get page_list address. */
+	movl	ARG_PAGE_LIST(%esp), %esi
+
+	/*
+	 * Map the control page at its virtual address
+	 * in transition page table.
+	 */
+	movl	PTR(XK_VA_CONTROL_PAGE)(%esi), %eax
+
+	/* Get PGD address and PGD entry index. */
+	movl	PTR(XK_VA_PGD_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PGDIR_SHIFT, %ecx
+	andl	$(PTRS_PER_PGD - 1), %ecx
+
+	/* Fill PGD entry with PMD0 reference. */
+	movl	PTR(XK_MA_PMD0_PAGE)(%esi), %edx
+	orl	$_PAGE_PRESENT, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PMD0 address and PMD0 entry index. */
+	movl	PTR(XK_VA_PMD0_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PMD_SHIFT, %ecx
+	andl	$(PTRS_PER_PMD - 1), %ecx
+
+	/* Fill PMD0 entry with PTE0 reference. */
+	movl	PTR(XK_MA_PTE0_PAGE)(%esi), %edx
+	orl	$_KERNPG_TABLE, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PTE0 address and PTE0 entry index. */
+	movl	PTR(XK_VA_PTE0_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PAGE_SHIFT, %ecx
+	andl	$(PTRS_PER_PTE - 1), %ecx
+
+	/* Fill PTE0 entry with control page reference. */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %edx
+	orl	$__PAGE_KERNEL_EXEC, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/*
+	 * Identity map the control page at its machine address
+	 * in transition page table.
+	 */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %eax
+
+	/* Get PGD address and PGD entry index. */
+	movl	PTR(XK_VA_PGD_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PGDIR_SHIFT, %ecx
+	andl	$(PTRS_PER_PGD - 1), %ecx
+
+	/* Fill PGD entry with PMD1 reference. */
+	movl	PTR(XK_MA_PMD1_PAGE)(%esi), %edx
+	orl	$_PAGE_PRESENT, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PMD1 address and PMD1 entry index. */
+	movl	PTR(XK_VA_PMD1_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PMD_SHIFT, %ecx
+	andl	$(PTRS_PER_PMD - 1), %ecx
+
+	/* Fill PMD1 entry with PTE1 reference. */
+	movl	PTR(XK_MA_PTE1_PAGE)(%esi), %edx
+	orl	$_KERNPG_TABLE, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PTE1 address and PTE1 entry index. */
+	movl	PTR(XK_VA_PTE1_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PAGE_SHIFT, %ecx
+	andl	$(PTRS_PER_PTE - 1), %ecx
+
+	/* Fill PTE1 entry with control page reference. */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %edx
+	orl	$__PAGE_KERNEL_EXEC, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/*
+	 * Get machine address of control page now.
+	 * This is impossible after page table switch.
+	 */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %ebx
+
+	/* Get machine address of transition page table now too. */
+	movl	PTR(XK_MA_PGD_PAGE)(%esi), %ecx
+
+	/* Get start_address too. */
+	movl	ARG_START_ADDRESS(%esp), %edx
+
+	/* Get indirection_page address too. */
+	movl	ARG_INDIRECTION_PAGE(%esp), %edi
+
+	/* Switch to transition page table. */
+	movl	%ecx, %cr3
+
+	/* Load IDT. */
+	lidtl	(idt_48 - xen_relocate_kernel)(%ebx)
+
+	/* Load GDT. */
+	leal	(gdt - xen_relocate_kernel)(%ebx), %eax
+	movl	%eax, (gdt_48 - xen_relocate_kernel + 2)(%ebx)
+	lgdtl	(gdt_48 - xen_relocate_kernel)(%ebx)
+
+	/* Load data segment registers. */
+	movl	$(gdt_ds - gdt), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
+
+	/* Setup a new stack at the end of machine address of control page. */
+	leal	PAGE_SIZE(%ebx), %esp
+
+	/* Store start_address on the stack. */
+	pushl   %edx
+
+	/* Jump to identity mapped page. */
+	pushl	$0
+	pushl	$(gdt_cs - gdt)
+	addl	$(identity_mapped - xen_relocate_kernel), %ebx
+	pushl	%ebx
+	iretl
+
+identity_mapped:
+	/*
+	 * Set %cr0 to a known state:
+	 *   - disable alignment check,
+	 *   - disable floating point emulation,
+	 *   - disable paging,
+	 *   - no task switch,
+	 *   - disable write protect,
+	 *   - enable protected mode.
+	 */
+	movl	%cr0, %eax
+	andl	$~(X86_CR0_AM | X86_CR0_EM | X86_CR0_PG | X86_CR0_TS | X86_CR0_WP), %eax
+	orl	$(X86_CR0_PE), %eax
+	movl	%eax, %cr0
+
+	/* Set %cr4 to a known state. */
+	xorl	%eax, %eax
+	movl	%eax, %cr4
+
+	jmp	1f
+
+1:
+	/* Flush the TLB (needed?). */
+	movl	%eax, %cr3
+
+	/* Do the copies. */
+	movl	%edi, %ecx	/* Put the indirection_page in %ecx. */
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	jmp	1f
+
+0:
+	/*
+	 * Top, read another doubleword from the indirection page.
+	 * Indirection page is an array which contains source
+	 * and destination address pairs. If all pairs could
+	 * not fit in one page then at the end of given
+	 * indirection page is pointer to next one.
+	 * Copy is stopped when done indicator
+	 * is found in indirection page.
+	 */
+	movl	(%ebx), %ecx
+	addl	$4, %ebx
+
+1:
+	testl	$0x1, %ecx	/* Is it a destination page? */
+	jz	2f
+
+	movl	%ecx, %edi
+	andl	$PAGE_MASK, %edi
+	jmp	0b
+
+2:
+	testl	$0x2, %ecx	/* Is it an indirection page? */
+	jz	2f
+
+	movl	%ecx, %ebx
+	andl	$PAGE_MASK, %ebx
+	jmp	0b
+
+2:
+	testl	$0x4, %ecx	/* Is it the done indicator? */
+	jz	2f
+	jmp	3f
+
+2:
+	testl	$0x8, %ecx	/* Is it the source indicator? */
+	jz	0b		/* Ignore it otherwise. */
+
+	movl	%ecx, %esi
+	andl	$PAGE_MASK, %esi
+	movl	$1024, %ecx
+
+	/* Copy page. */
+	rep	movsl
+	jmp	0b
+
+3:
+	/*
+	 * To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/*
+	 * Set all of the registers to known values.
+	 * Leave %esp alone.
+	 */
+	xorl	%ebx, %ebx
+	xorl    %ecx, %ecx
+	xorl    %edx, %edx
+	xorl    %esi, %esi
+	xorl    %edi, %edi
+	xorl    %ebp, %ebp
+
+	/* Jump to start_address. */
+	retl
+
+	.align	L1_CACHE_BYTES
+
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor. */
+
+gdt_cs:
+	.quad	0x00cf9a000000ffff	/* 4 GiB code segment at 0x00000000. */
+
+gdt_ds:
+	.quad	0x00cf92000000ffff	/* 4 GiB data segment at 0x00000000. */
+gdt_end:
+
+gdt_48:
+	.word	gdt_end - gdt - 1	/* GDT limit. */
+	.long	0			/* GDT base - filled in by code above. */
+
+idt_48:
+	.word	0			/* IDT limit. */
+	.long	0			/* IDT base. */
+
+xen_kexec_control_code_size:
+	.long	. - xen_relocate_kernel
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 07/11] x86/xen: Add x86_64 kexec/kdump implementation
  2012-09-27 18:06           ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Daniel Kiper
@ 2012-09-27 18:06             ` Daniel Kiper
  2012-09-27 18:06               ` [PATCH 08/11] x86/xen: Add kexec/kdump makefile rules Daniel Kiper
  2012-09-28  8:11             ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Jan Beulich
  2012-09-28 16:39             ` Konrad Rzeszutek Wilk
  2 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Add x86_64 kexec/kdump implementation.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/xen/machine_kexec_64.c   |  301 ++++++++++++++++++++++++++++++++++++
 arch/x86/xen/relocate_kernel_64.S |  309 +++++++++++++++++++++++++++++++++++++
 2 files changed, 610 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/xen/machine_kexec_64.c
 create mode 100644 arch/x86/xen/relocate_kernel_64.S

diff --git a/arch/x86/xen/machine_kexec_64.c b/arch/x86/xen/machine_kexec_64.c
new file mode 100644
index 0000000..f87ffe0
--- /dev/null
+++ b/arch/x86/xen/machine_kexec_64.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+
+#include <xen/interface/memory.h>
+#include <xen/xen.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/kexec.h>
+#include <asm/xen/page.h>
+
+#define __ma(vaddr)	(virt_to_machine(vaddr).maddr)
+
+static unsigned long xen_page_to_mfn(struct page *page)
+{
+	return pfn_to_mfn(page_to_pfn(page));
+}
+
+static struct page *xen_mfn_to_page(unsigned long mfn)
+{
+	return pfn_to_page(mfn_to_pfn(mfn));
+}
+
+static unsigned long xen_virt_to_machine(volatile void *address)
+{
+	return virt_to_machine(address).maddr;
+}
+
+static void *xen_machine_to_virt(unsigned long address)
+{
+	return phys_to_virt(machine_to_phys(XMADDR(address)).paddr);
+}
+
+static void init_level2_page(pmd_t *pmd, unsigned long addr)
+{
+	unsigned long end_addr = addr + PUD_SIZE;
+
+	while (addr < end_addr) {
+		native_set_pmd(pmd++, native_make_pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+		addr += PMD_SIZE;
+	}
+}
+
+static int init_level3_page(struct kimage *image, pud_t *pud,
+				unsigned long addr, unsigned long last_addr)
+{
+	pmd_t *pmd;
+	struct page *page;
+	unsigned long end_addr = addr + PGDIR_SIZE;
+
+	while ((addr < last_addr) && (addr < end_addr)) {
+		page = kimage_alloc_control_pages(image, 0);
+
+		if (!page)
+			return -ENOMEM;
+
+		pmd = page_address(page);
+		init_level2_page(pmd, addr);
+		native_set_pud(pud++, native_make_pud(__ma(pmd) | _KERNPG_TABLE));
+		addr += PUD_SIZE;
+	}
+
+	/* Clear the unused entries. */
+	while (addr < end_addr) {
+		native_pud_clear(pud++);
+		addr += PUD_SIZE;
+	}
+
+	return 0;
+}
+
+
+static int init_level4_page(struct kimage *image, pgd_t *pgd,
+				unsigned long addr, unsigned long last_addr)
+{
+	int rc;
+	pud_t *pud;
+	struct page *page;
+	unsigned long end_addr = addr + PTRS_PER_PGD * PGDIR_SIZE;
+
+	while ((addr < last_addr) && (addr < end_addr)) {
+		page = kimage_alloc_control_pages(image, 0);
+
+		if (!page)
+			return -ENOMEM;
+
+		pud = page_address(page);
+		rc = init_level3_page(image, pud, addr, last_addr);
+
+		if (rc)
+			return rc;
+
+		native_set_pgd(pgd++, native_make_pgd(__ma(pud) | _KERNPG_TABLE));
+		addr += PGDIR_SIZE;
+	}
+
+	/* Clear the unused entries. */
+	while (addr < end_addr) {
+		native_pgd_clear(pgd++);
+		addr += PGDIR_SIZE;
+	}
+
+	return 0;
+}
+
+static void free_transition_pgtable(struct kimage *image)
+{
+	free_page((unsigned long)image->arch.pgd);
+	free_page((unsigned long)image->arch.pud0);
+	free_page((unsigned long)image->arch.pud1);
+	free_page((unsigned long)image->arch.pmd0);
+	free_page((unsigned long)image->arch.pmd1);
+	free_page((unsigned long)image->arch.pte0);
+	free_page((unsigned long)image->arch.pte1);
+}
+
+static int alloc_transition_pgtable(struct kimage *image)
+{
+	image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pgd)
+		goto err;
+
+	image->arch.pud0 = (pud_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pud0)
+		goto err;
+
+	image->arch.pud1 = (pud_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pud1)
+		goto err;
+
+	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pmd0)
+		goto err;
+
+	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pmd1)
+		goto err;
+
+	image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pte0)
+		goto err;
+
+	image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+
+	if (!image->arch.pte1)
+		goto err;
+
+	return 0;
+
+err:
+	free_transition_pgtable(image);
+
+	return -ENOMEM;
+}
+
+static int init_pgtable(struct kimage *image, pgd_t *pgd)
+{
+	int rc;
+	unsigned long max_mfn;
+
+	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+
+	rc = init_level4_page(image, pgd, 0, PFN_PHYS(max_mfn));
+
+	if (rc)
+		return rc;
+
+	return alloc_transition_pgtable(image);
+}
+
+static int machine_xen_kexec_prepare(struct kimage *image)
+{
+#ifdef CONFIG_KEXEC_JUMP
+	if (image->preserve_context) {
+		pr_info_once("kexec: Context preservation is not "
+				"supported in Xen domains.\n");
+		return -ENOSYS;
+	}
+#endif
+
+	return init_pgtable(image, page_address(image->control_code_page));
+}
+
+static int machine_xen_kexec_load(struct kimage *image)
+{
+	void *control_page, *table_page;
+	struct xen_kexec_load xkl = {};
+
+	if (!image)
+		return 0;
+
+	table_page = page_address(image->control_code_page);
+	control_page = table_page + PAGE_SIZE;
+
+	memcpy(control_page, xen_relocate_kernel, xen_kexec_control_code_size);
+
+	xkl.type = image->type;
+	xkl.image.page_list[XK_MA_CONTROL_PAGE] = __ma(control_page);
+	xkl.image.page_list[XK_MA_TABLE_PAGE] = __ma(table_page);
+	xkl.image.page_list[XK_MA_PGD_PAGE] = __ma(image->arch.pgd);
+	xkl.image.page_list[XK_MA_PUD0_PAGE] = __ma(image->arch.pud0);
+	xkl.image.page_list[XK_MA_PUD1_PAGE] = __ma(image->arch.pud1);
+	xkl.image.page_list[XK_MA_PMD0_PAGE] = __ma(image->arch.pmd0);
+	xkl.image.page_list[XK_MA_PMD1_PAGE] = __ma(image->arch.pmd1);
+	xkl.image.page_list[XK_MA_PTE0_PAGE] = __ma(image->arch.pte0);
+	xkl.image.page_list[XK_MA_PTE1_PAGE] = __ma(image->arch.pte1);
+	xkl.image.indirection_page = image->head;
+	xkl.image.start_address = image->start;
+
+	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+static void machine_xen_kexec_cleanup(struct kimage *image)
+{
+	free_transition_pgtable(image);
+}
+
+static void machine_xen_kexec_unload(struct kimage *image)
+{
+	int rc;
+	struct xen_kexec_load xkl = {};
+
+	if (!image)
+		return;
+
+	xkl.type = image->type;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
+
+	WARN(rc, "kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
+}
+
+static void machine_xen_kexec_shutdown(void)
+{
+}
+
+static void machine_xen_kexec(struct kimage *image)
+{
+	int rc;
+	struct xen_kexec_exec xke = {};
+
+	xke.type = image->type;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+
+	pr_emerg("kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
+	BUG();
+}
+
+void __init xen_init_kexec_ops(void)
+{
+	if (!xen_initial_domain())
+		return;
+
+	kexec_ops.always_use_normal_alloc = true;
+	kexec_ops.page_to_pfn = xen_page_to_mfn;
+	kexec_ops.pfn_to_page = xen_mfn_to_page;
+	kexec_ops.virt_to_phys = xen_virt_to_machine;
+	kexec_ops.phys_to_virt = xen_machine_to_virt;
+	kexec_ops.machine_kexec_prepare = machine_xen_kexec_prepare;
+	kexec_ops.machine_kexec_load = machine_xen_kexec_load;
+	kexec_ops.machine_kexec_cleanup = machine_xen_kexec_cleanup;
+	kexec_ops.machine_kexec_unload = machine_xen_kexec_unload;
+	kexec_ops.machine_kexec_shutdown = machine_xen_kexec_shutdown;
+	kexec_ops.machine_kexec = machine_xen_kexec;
+}
diff --git a/arch/x86/xen/relocate_kernel_64.S b/arch/x86/xen/relocate_kernel_64.S
new file mode 100644
index 0000000..8f641f1
--- /dev/null
+++ b/arch/x86/xen/relocate_kernel_64.S
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+
+#include <asm/xen/kexec.h>
+
+#define PTR(x)	(x << 3)
+
+	.text
+	.code64
+	.globl	xen_kexec_control_code_size, xen_relocate_kernel
+
+xen_relocate_kernel:
+	/*
+	 * Must be relocatable PIC code callable as a C function.
+	 *
+	 * This function is called by Xen but here hypervisor is dead.
+	 * We are playing on bare metal.
+	 *
+	 * Every machine address passed to this function through
+	 * page_list (e.g. XK_MA_CONTROL_PAGE) is established
+	 * by dom0 during kexec load phase.
+	 *
+	 * Every virtual address passed to this function through page_list
+	 * (e.g. XK_VA_CONTROL_PAGE) is established by hypervisor during
+	 * HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load) hypercall.
+	 *
+	 * %rdi - indirection_page,
+	 * %rsi - page_list,
+	 * %rdx - start_address,
+	 * %ecx - preserve_context (ignored).
+	 */
+
+	/* Zero out flags, and disable interrupts. */
+	pushq	$0
+	popfq
+
+	/*
+	 * Map the control page at its virtual address
+	 * in transition page table.
+	 */
+	movq	PTR(XK_VA_CONTROL_PAGE)(%rsi), %r8
+
+	/* Get PGD address and PGD entry index. */
+	movq	PTR(XK_VA_PGD_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PGDIR_SHIFT, %r10
+	andq	$(PTRS_PER_PGD - 1), %r10
+
+	/* Fill PGD entry with PUD0 reference. */
+	movq	PTR(XK_MA_PUD0_PAGE)(%rsi), %r11
+	orq	$_KERNPG_TABLE, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/* Get PUD0 address and PUD0 entry index. */
+	movq	PTR(XK_VA_PUD0_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PUD_SHIFT, %r10
+	andq	$(PTRS_PER_PUD - 1), %r10
+
+	/* Fill PUD0 entry with PMD0 reference. */
+	movq	PTR(XK_MA_PMD0_PAGE)(%rsi), %r11
+	orq	$_KERNPG_TABLE, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/* Get PMD0 address and PMD0 entry index. */
+	movq	PTR(XK_VA_PMD0_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PMD_SHIFT, %r10
+	andq	$(PTRS_PER_PMD - 1), %r10
+
+	/* Fill PMD0 entry with PTE0 reference. */
+	movq	PTR(XK_MA_PTE0_PAGE)(%rsi), %r11
+	orq	$_KERNPG_TABLE, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/* Get PTE0 address and PTE0 entry index. */
+	movq	PTR(XK_VA_PTE0_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PAGE_SHIFT, %r10
+	andq	$(PTRS_PER_PTE - 1), %r10
+
+	/* Fill PTE0 entry with control page reference. */
+	movq	PTR(XK_MA_CONTROL_PAGE)(%rsi), %r11
+	orq	$__PAGE_KERNEL_EXEC, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/*
+	 * Identity map the control page at its machine address
+	 * in transition page table.
+	 */
+	movq	PTR(XK_MA_CONTROL_PAGE)(%rsi), %r8
+
+	/* Get PGD address and PGD entry index. */
+	movq	PTR(XK_VA_PGD_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PGDIR_SHIFT, %r10
+	andq	$(PTRS_PER_PGD - 1), %r10
+
+	/* Fill PGD entry with PUD1 reference. */
+	movq	PTR(XK_MA_PUD1_PAGE)(%rsi), %r11
+	orq	$_KERNPG_TABLE, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/* Get PUD1 address and PUD1 entry index. */
+	movq	PTR(XK_VA_PUD1_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PUD_SHIFT, %r10
+	andq	$(PTRS_PER_PUD - 1), %r10
+
+	/* Fill PUD1 entry with PMD1 reference. */
+	movq	PTR(XK_MA_PMD1_PAGE)(%rsi), %r11
+	orq	$_KERNPG_TABLE, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/* Get PMD1 address and PMD1 entry index. */
+	movq	PTR(XK_VA_PMD1_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PMD_SHIFT, %r10
+	andq	$(PTRS_PER_PMD - 1), %r10
+
+	/* Fill PMD1 entry with PTE1 reference. */
+	movq	PTR(XK_MA_PTE1_PAGE)(%rsi), %r11
+	orq	$_KERNPG_TABLE, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/* Get PTE1 address and PTE1 entry index. */
+	movq	PTR(XK_VA_PTE1_PAGE)(%rsi), %r9
+	movq	%r8, %r10
+	shrq	$PAGE_SHIFT, %r10
+	andq	$(PTRS_PER_PTE - 1), %r10
+
+	/* Fill PTE1 entry with control page reference. */
+	movq	PTR(XK_MA_CONTROL_PAGE)(%rsi), %r11
+	orq	$__PAGE_KERNEL_EXEC, %r11
+	movq	%r11, (%r9, %r10, 8)
+
+	/*
+	 * Get machine address of control page now.
+	 * This is impossible after page table switch.
+	 */
+	movq	PTR(XK_MA_CONTROL_PAGE)(%rsi), %r8
+
+	/* Get machine address of identity page table now too. */
+	movq	PTR(XK_MA_TABLE_PAGE)(%rsi), %r9
+
+	/* Get machine address of transition page table now too. */
+	movq	PTR(XK_MA_PGD_PAGE)(%rsi), %r10
+
+	/* Switch to transition page table. */
+	movq	%r10, %cr3
+
+	/* Setup a new stack at the end of machine address of control page. */
+	leaq	PAGE_SIZE(%r8), %rsp
+
+	/* Store start_address on the stack. */
+	pushq   %rdx
+
+	/* Jump to identity mapped page. */
+	addq	$(identity_mapped - xen_relocate_kernel), %r8
+	jmpq	*%r8
+
+identity_mapped:
+	/* Switch to identity page table. */
+	movq	%r9, %cr3
+
+	/*
+	 * Set %cr0 to a known state:
+	 *   - disable alignment check,
+	 *   - disable floating point emulation,
+	 *   - no task switch,
+	 *   - disable write protect,
+	 *   - enable protected mode,
+	 *   - enable paging.
+	 */
+	movq	%cr0, %rax
+	andq	$~(X86_CR0_AM | X86_CR0_EM | X86_CR0_TS | X86_CR0_WP), %rax
+	orl	$(X86_CR0_PE | X86_CR0_PG), %eax
+	movq	%rax, %cr0
+
+	/*
+	 * Set %cr4 to a known state:
+	 *   - enable physical address extension.
+	 */
+	movq	$X86_CR4_PAE, %rax
+	movq	%rax, %cr4
+
+	jmp	1f
+
+1:
+	/* Flush the TLB (needed?). */
+	movq	%r9, %cr3
+
+	/* Do the copies. */
+	movq	%rdi, %rcx	/* Put the indirection_page in %rcx. */
+	xorq	%rdi, %rdi
+	xorq	%rsi, %rsi
+	jmp	1f
+
+0:
+	/*
+	 * Top, read another quadword from the indirection page.
+	 * Indirection page is an array which contains source
+	 * and destination address pairs. If all pairs could
+	 * not fit in one page then at the end of given
+	 * indirection page is pointer to next one.
+	 * Copy is stopped when done indicator
+	 * is found in indirection page.
+	 */
+	movq	(%rbx), %rcx
+	addq	$8, %rbx
+
+1:
+	testq	$0x1, %rcx	/* Is it a destination page? */
+	jz	2f
+
+	movq	%rcx, %rdi
+	andq	$PAGE_MASK, %rdi
+	jmp	0b
+
+2:
+	testq	$0x2, %rcx	/* Is it an indirection page? */
+	jz	2f
+
+	movq	%rcx, %rbx
+	andq	$PAGE_MASK, %rbx
+	jmp	0b
+
+2:
+	testq	$0x4, %rcx	/* Is it the done indicator? */
+	jz	2f
+	jmp	3f
+
+2:
+	testq	$0x8, %rcx	/* Is it the source indicator? */
+	jz	0b		/* Ignore it otherwise. */
+
+	movq	%rcx, %rsi
+	andq	$PAGE_MASK, %rsi
+	movq	$512, %rcx
+
+	/* Copy page. */
+	rep	movsq
+	jmp	0b
+
+3:
+	/*
+	 * To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/*
+	 * Set all of the registers to known values.
+	 * Leave %rsp alone.
+	 */
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq    %rcx, %rcx
+	xorq    %rdx, %rdx
+	xorq    %rsi, %rsi
+	xorq    %rdi, %rdi
+	xorq    %rbp, %rbp
+	xorq	%r8, %r8
+	xorq	%r9, %r9
+	xorq	%r10, %r10
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	/* Jump to start_address. */
+	retq
+
+xen_kexec_control_code_size:
+	.long	. - xen_relocate_kernel
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 08/11] x86/xen: Add kexec/kdump makefile rules
  2012-09-27 18:06             ` [PATCH 07/11] x86/xen: Add x86_64 " Daniel Kiper
@ 2012-09-27 18:06               ` Daniel Kiper
  2012-09-27 18:06                 ` [PATCH 09/11] x86/xen/enlighten: Add init and crash kexec/kdump hooks Daniel Kiper
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Add kexec/kdump makefile rules.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/xen/Makefile |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..7a5db44 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_XEN_DOM0)		+= apic.o vga.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
+obj-$(CONFIG_KEXEC)		+= kexec.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 09/11] x86/xen/enlighten: Add init and crash kexec/kdump hooks
  2012-09-27 18:06               ` [PATCH 08/11] x86/xen: Add kexec/kdump makefile rules Daniel Kiper
@ 2012-09-27 18:06                 ` Daniel Kiper
  2012-09-27 18:06                   ` [PATCH 10/11] drivers/xen: Export vmcoreinfo through sysfs Daniel Kiper
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Add init and crash kexec/kdump hooks.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/xen/enlighten.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 1fbe75a..5043d77 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/kexec.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -66,6 +67,7 @@
 #include <asm/hypervisor.h>
 #include <asm/mwait.h>
 #include <asm/pci_x86.h>
+#include <asm/xen/kexec.h>
 
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
@@ -1237,6 +1239,12 @@ static void xen_machine_power_off(void)
 
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
+#ifdef CONFIG_KEXEC
+	if (kexec_crash_image) {
+		crash_save_cpu(regs, safe_smp_processor_id());
+		return;
+	}
+#endif
 	xen_reboot(SHUTDOWN_crash);
 }
 
@@ -1315,6 +1323,10 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_init_mmu_ops();
 
+#ifdef CONFIG_KEXEC
+	xen_init_kexec_ops();
+#endif
+
 	/* Prevent unwanted bits from being set in PTEs. */
 	__supported_pte_mask &= ~_PAGE_GLOBAL;
 #if 0
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 10/11] drivers/xen: Export vmcoreinfo through sysfs
  2012-09-27 18:06                 ` [PATCH 09/11] x86/xen/enlighten: Add init and crash kexec/kdump hooks Daniel Kiper
@ 2012-09-27 18:06                   ` Daniel Kiper
  2012-09-27 18:06                     ` [PATCH 11/11] x86: Add Xen kexec control code size check to linker script Daniel Kiper
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Export vmcoreinfo through sysfs.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 drivers/xen/sys-hypervisor.c |   42 +++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 41 insertions(+), 1 deletions(-)

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index fdb6d22..0111ad0 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -355,6 +355,41 @@ static void xen_properties_destroy(void)
 	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
+#ifdef CONFIG_KEXEC
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return sprintf(buffer, "%lx %lx\n", xen_vmcoreinfo_maddr,
+						xen_vmcoreinfo_max_size);
+}
+
+HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+static int __init xen_vmcoreinfo_init(void)
+{
+	if (!xen_vmcoreinfo_max_size)
+		return 0;
+
+	return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+static void xen_vmcoreinfo_destroy(void)
+{
+	if (!xen_vmcoreinfo_max_size)
+		return;
+
+	sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+#else
+static int __init xen_vmcoreinfo_init(void)
+{
+	return 0;
+}
+
+static void xen_vmcoreinfo_destroy(void)
+{
+}
+#endif
+
 static int __init hyper_sysfs_init(void)
 {
 	int ret;
@@ -377,9 +412,14 @@ static int __init hyper_sysfs_init(void)
 	ret = xen_properties_init();
 	if (ret)
 		goto prop_out;
+	ret = xen_vmcoreinfo_init();
+	if (ret)
+		goto vmcoreinfo_out;
 
 	goto out;
 
+vmcoreinfo_out:
+	xen_properties_destroy();
 prop_out:
 	xen_sysfs_uuid_destroy();
 uuid_out:
@@ -394,12 +434,12 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
+	xen_vmcoreinfo_destroy();
 	xen_properties_destroy();
 	xen_compilation_destroy();
 	xen_sysfs_uuid_destroy();
 	xen_sysfs_version_destroy();
 	xen_sysfs_type_destroy();
-
 }
 module_init(hyper_sysfs_init);
 module_exit(hyper_sysfs_exit);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 11/11] x86: Add Xen kexec control code size check to linker script
  2012-09-27 18:06                   ` [PATCH 10/11] drivers/xen: Export vmcoreinfo through sysfs Daniel Kiper
@ 2012-09-27 18:06                     ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-09-27 18:06 UTC (permalink / raw)
  To: konrad.wilk, andrew.cooper3, jbeulich, linux-kernel, xen-devel
  Cc: Daniel Kiper

Add Xen kexec control code size check to linker script.

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
 arch/x86/kernel/vmlinux.lds.S |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 22a1530..f18786a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -360,5 +360,10 @@ INIT_PER_CPU(irq_stack_union);
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
            "kexec control code size is too big");
-#endif
 
+#ifdef CONFIG_XEN
+. = ASSERT(xen_kexec_control_code_size - xen_relocate_kernel <=
+		KEXEC_CONTROL_CODE_MAX_SIZE,
+		"Xen kexec control code size is too big");
+#endif
+#endif
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 01/11] kexec: introduce kexec_ops struct
  2012-09-27 18:06 ` [PATCH 01/11] kexec: introduce kexec_ops struct Daniel Kiper
  2012-09-27 18:06   ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Daniel Kiper
@ 2012-09-28  7:49   ` Jan Beulich
  2012-10-01 11:36     ` Daniel Kiper
  2012-09-28 16:07   ` Konrad Rzeszutek Wilk
  2 siblings, 1 reply; 32+ messages in thread
From: Jan Beulich @ 2012-09-28  7:49 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

>>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
> not use default functions or require some changes in behavior of kexec/kdump
> generic code. To cope with that problem kexec_ops struct was introduced.
> It allows a developer to replace all or some functions and control some
> functionality of kexec/kdump generic code.

I'm not convinced that doing this at the architecture independent
layer is really necessary/desirable. Nevertheless, if that's the right
place, then everything else looks good to me, except for a
cosmetic thing:

> @@ -392,7 +435,7 @@ static void kimage_free_page_list(struct list_head *list)
>  
>  		page = list_entry(pos, struct page, lru);
>  		list_del(&page->lru);
> -		kimage_free_pages(page);
> +		(*kexec_ops.kimage_free_pages)(page);

These constructs are generally better readable without the
explicit yet redundant indirection:

		kexec_ops.kimage_free_pages(page);

Jan



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
  2012-09-27 18:06   ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Daniel Kiper
  2012-09-27 18:06     ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Daniel Kiper
@ 2012-09-28  7:56     ` Jan Beulich
  2012-10-01 13:01       ` Daniel Kiper
  1 sibling, 1 reply; 32+ messages in thread
From: Jan Beulich @ 2012-09-28  7:56 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

>>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> Some implementations (e.g. Xen PVOPS) could not use part of identity page 
> table
> to construct transition page table. It means that they require separate 
> PUDs,
> PMDs and PTEs for virtual and physical (identity) mapping. To satisfy that
> requirement add extra pointer to PGD, PUD, PMD and PTE and align existing 
> code.

I'm puzzled by this - why would you need to reintroduce what had
been dropped a long time ago, when the forward ported kernels
don't need it? Xen itself doesn't need the extra entries, their
presence is purely a requirement of the specific kernel
implementation afaict.

Jan


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-09-27 18:06           ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Daniel Kiper
  2012-09-27 18:06             ` [PATCH 07/11] x86/xen: Add x86_64 " Daniel Kiper
@ 2012-09-28  8:11             ` Jan Beulich
  2012-10-01 12:52               ` Daniel Kiper
  2012-09-28 16:39             ` Konrad Rzeszutek Wilk
  2 siblings, 1 reply; 32+ messages in thread
From: Jan Beulich @ 2012-09-28  8:11 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1210 bytes --]

>>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> Add i386 kexec/kdump implementation.

So this as well as the subsequent patch introduces quite a bit of
duplicate code. The old 2.6.18 kernel had an initial pair of cleanup
patches (attached in their forward ported form for 3.6-rc6) that
would allow reducing the amount of duplication, particularly by
eliminating the need to clone relocate_kernel_??.S altogether.

Additionally, in the PAE case (which is the only relevant one for
a 32-bit Xen kernel) I'm missing the address restriction
enforcement for the PGD, without which the __ma() conversion
result may not fit into the field it gets stored into.

Finally, as noticed in an earlier patch already, you appear to
re-introduce stuff long dropped from the kernel - the forward
ported kernels get away with just setting PA_CONTROL_PAGE,
PA_PGD, and PA_SWAP_PAGE in the page list. Since the number
and purpose of the pages is established entirely by the guest
kernel, all you need to obey is that the hypervisor expects
alternating PA_/VA_ pairs (where the VA_ ones can be left
unpopulated). Perhaps taking a look at a recent SLES kernel
would help...

Jan


[-- Attachment #2: kexec-move-segment-code-x86_64.patch --]
[-- Type: text/plain, Size: 4365 bytes --]

Subject: kexec: Move asm segment handling code to the assembly file (x86_64)
From: http://xenbits.xensource.com/xen-unstable.hg (tip 13816)
Patch-mainline: n/a

This patch moves the idt, gdt, and segment handling code from machine_kexec.c
to relocate_kernel.S.  The main reason behind this move is to avoid code 
duplication in the Xen hypervisor. With this patch all code required to kexec
is put on the control page.

On top of that this patch also counts as a cleanup - I think it is much
nicer to write assembly directly in assembly files than wrap inline assembly
in C functions for no apparent reason.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Acked-by: jbeulich@novell.com

 Applies to 2.6.19-rc1.
 jb: fixed up register usage for 2.6.30 (bnc#545206)

--- head-2011-08-09.orig/arch/x86/kernel/machine_kexec_64.c	2011-08-09 10:21:04.000000000 +0200
+++ head-2011-08-09/arch/x86/kernel/machine_kexec_64.c	2010-04-15 09:38:56.000000000 +0200
@@ -203,47 +203,6 @@ static int init_pgtable(struct kimage *i
 	return init_transition_pgtable(image, level4p);
 }
 
-static void set_idt(void *newidt, u16 limit)
-{
-	struct desc_ptr curidt;
-
-	/* x86-64 supports unaliged loads & stores */
-	curidt.size    = limit;
-	curidt.address = (unsigned long)newidt;
-
-	__asm__ __volatile__ (
-		"lidtq %0\n"
-		: : "m" (curidt)
-		);
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
-	struct desc_ptr curgdt;
-
-	/* x86-64 supports unaligned loads & stores */
-	curgdt.size    = limit;
-	curgdt.address = (unsigned long)newgdt;
-
-	__asm__ __volatile__ (
-		"lgdtq %0\n"
-		: : "m" (curgdt)
-		);
-};
-
-static void load_segments(void)
-{
-	__asm__ __volatile__ (
-		"\tmovl %0,%%ds\n"
-		"\tmovl %0,%%es\n"
-		"\tmovl %0,%%ss\n"
-		"\tmovl %0,%%fs\n"
-		"\tmovl %0,%%gs\n"
-		: : "a" (__KERNEL_DS) : "memory"
-		);
-}
-
 int machine_kexec_prepare(struct kimage *image)
 {
 	unsigned long start_pgtable;
@@ -311,24 +270,6 @@ void machine_kexec(struct kimage *image)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
 						<< PAGE_SHIFT);
 
-	/*
-	 * The segment registers are funny things, they have both a
-	 * visible and an invisible part.  Whenever the visible part is
-	 * set to a specific selector, the invisible part is loaded
-	 * with from a table in memory.  At no other time is the
-	 * descriptor table in memory accessed.
-	 *
-	 * I take advantage of this here by force loading the
-	 * segments, before I zap the gdt with an invalid value.
-	 */
-	load_segments();
-	/*
-	 * The gdt & idt are now invalid.
-	 * If you want to load them you must set up your own idt & gdt.
-	 */
-	set_gdt(phys_to_virt(0), 0);
-	set_idt(phys_to_virt(0), 0);
-
 	/* now call it */
 	image->start = relocate_kernel((unsigned long)image->head,
 				       (unsigned long)page_list,
--- head-2011-08-09.orig/arch/x86/kernel/relocate_kernel_64.S	2011-08-09 10:19:07.000000000 +0200
+++ head-2011-08-09/arch/x86/kernel/relocate_kernel_64.S	2011-08-09 10:21:24.000000000 +0200
@@ -91,13 +91,30 @@ relocate_kernel:
 	/* Switch to the identity mapped page tables */
 	movq	%r9, %cr3
 
+	/* setup idt */
+	lidtq	idt_80 - relocate_kernel(%r8)
+
+	/* setup gdt */
+	leaq	gdt - relocate_kernel(%r8), %rax
+	movq	%rax, (gdt_80 - relocate_kernel) + 2(%r8)
+	lgdtq	gdt_80 - relocate_kernel(%r8)
+
+	/* setup data segment registers */
+	xorl	%eax, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
 
-	/* jump to identity mapped page */
+	/* load new code segment and jump to identity mapped page */
 	addq	$(identity_mapped - relocate_kernel), %r8
+	pushq	$(gdt_cs - gdt)
 	pushq	%r8
-	ret
+	lretq
 
 identity_mapped:
 	/* set return address to 0 if not preserving context */
@@ -264,5 +281,20 @@ swap_pages:
 3:
 	ret
 
+	.align  16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad   0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+	.word	gdt_end - gdt - 1	/* limit */
+	.quad	0			/* base - filled in by code above */
+
+idt_80:
+	.word	0			/* limit */
+	.quad	0			/* base */
+
 	.globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel

[-- Attachment #3: kexec-move-segment-code-i386.patch --]
[-- Type: text/plain, Size: 4554 bytes --]

Subject: kexec: Move asm segment handling code to the assembly file (i386)
From: http://xenbits.xensource.com/xen-unstable.hg (tip 13816)
Patch-mainline: n/a

This patch moves the idt, gdt, and segment handling code from machine_kexec.c
to relocate_kernel.S. The main reason behind this move is to avoid code 
duplication in the Xen hypervisor. With this patch all code required to kexec
is put on the control page.

On top of that this patch also counts as a cleanup - I think it is much
nicer to write assembly directly in assembly files than wrap inline assembly
in C functions for no apparent reason.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Acked-by: jbeulich@novell.com

 Applies to 2.6.19-rc1.
 jb: fixed up register usage (paralleling what's needed for 2.6.30 on x86-64)

--- head.orig/arch/x86/kernel/machine_kexec_32.c	2012-04-10 14:24:22.000000000 +0200
+++ head/arch/x86/kernel/machine_kexec_32.c	2012-04-10 14:50:08.000000000 +0200
@@ -26,48 +26,6 @@
 #include <asm/cacheflush.h>
 #include <asm/debugreg.h>
 
-static void set_idt(void *newidt, __u16 limit)
-{
-	struct desc_ptr curidt;
-
-	/* ia32 supports unaliged loads & stores */
-	curidt.size    = limit;
-	curidt.address = (unsigned long)newidt;
-
-	load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
-	struct desc_ptr curgdt;
-
-	/* ia32 supports unaligned loads & stores */
-	curgdt.size    = limit;
-	curgdt.address = (unsigned long)newgdt;
-
-	load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
-	__asm__ __volatile__ (
-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
-		"\t1:\n"
-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
-		"\tmovl %%eax,%%ds\n"
-		"\tmovl %%eax,%%es\n"
-		"\tmovl %%eax,%%fs\n"
-		"\tmovl %%eax,%%gs\n"
-		"\tmovl %%eax,%%ss\n"
-		: : : "eax", "memory");
-#undef STR
-#undef __STR
-}
-
 static void machine_kexec_free_page_tables(struct kimage *image)
 {
 	free_page((unsigned long)image->arch.pgd);
@@ -227,24 +185,6 @@ void machine_kexec(struct kimage *image)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
 						<< PAGE_SHIFT);
 
-	/*
-	 * The segment registers are funny things, they have both a
-	 * visible and an invisible part.  Whenever the visible part is
-	 * set to a specific selector, the invisible part is loaded
-	 * with from a table in memory.  At no other time is the
-	 * descriptor table in memory accessed.
-	 *
-	 * I take advantage of this here by force loading the
-	 * segments, before I zap the gdt with an invalid value.
-	 */
-	load_segments();
-	/*
-	 * The gdt & idt are now invalid.
-	 * If you want to load them you must set up your own idt & gdt.
-	 */
-	set_gdt(phys_to_virt(0), 0);
-	set_idt(phys_to_virt(0), 0);
-
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
 					   (unsigned long)page_list,
--- head.orig/arch/x86/kernel/relocate_kernel_32.S	2011-10-24 09:10:05.000000000 +0200
+++ head/arch/x86/kernel/relocate_kernel_32.S	2011-08-09 10:21:17.000000000 +0200
@@ -87,14 +87,32 @@ relocate_kernel:
 	movl	PTR(PA_PGD)(%ebp), %eax
 	movl	%eax, %cr3
 
+	/* setup idt */
+	lidtl	idt_48 - relocate_kernel(%edi)
+
+	/* setup gdt */
+	leal	gdt - relocate_kernel(%edi), %eax
+	movl	%eax, (gdt_48 - relocate_kernel) + 2(%edi)
+	lgdtl	gdt_48 - relocate_kernel(%edi)
+
+	/* setup data segment registers */
+	mov	$(gdt_ds - gdt), %eax
+	mov	%eax, %ds
+	mov	%eax, %es
+	mov	%eax, %fs
+	mov	%eax, %gs
+	mov	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%edi), %esp
 
-	/* jump to identity mapped page */
+	/* load new code segment and jump to identity mapped page */
+	pushl	$0
+	pushl	$(gdt_cs - gdt)
 	movl    %edi, %eax
 	addl    $(identity_mapped - relocate_kernel), %eax
 	pushl   %eax
-	ret
+	iretl
 
 identity_mapped:
 	/* set return address to 0 if not preserving context */
@@ -273,5 +291,22 @@ swap_pages:
 	popl	%ebp
 	ret
 
+	.align	16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad	0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
+gdt_ds:
+	.quad	0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+	.word	gdt_end - gdt - 1	/* limit */
+	.long	0			/* base - filled in by code above */
+
+idt_48:
+	.word	0			/* limit */
+	.long	0			/* base */
+
 	.globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 01/11] kexec: introduce kexec_ops struct
  2012-09-27 18:06 ` [PATCH 01/11] kexec: introduce kexec_ops struct Daniel Kiper
  2012-09-27 18:06   ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Daniel Kiper
  2012-09-28  7:49   ` [PATCH 01/11] kexec: introduce kexec_ops struct Jan Beulich
@ 2012-09-28 16:07   ` Konrad Rzeszutek Wilk
  2012-10-01 13:40     ` Daniel Kiper
  2 siblings, 1 reply; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-09-28 16:07 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Thu, Sep 27, 2012 at 08:06:28PM +0200, Daniel Kiper wrote:
> Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
> not use default functions or require some changes in behavior of kexec/kdump
> generic code. To cope with that problem kexec_ops struct was introduced.
> It allows a developer to replace all or some functions and control some
> functionality of kexec/kdump generic code.
> 
> Default behavior of kexec/kdump generic code is not changed.
> 
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> ---
>  include/linux/kexec.h |   18 +++++++
>  kernel/kexec.c        |  125 ++++++++++++++++++++++++++++++++++++-------------
>  2 files changed, 111 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 37c5f72..beb08ca 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -165,7 +165,25 @@ struct kimage {
>  #endif
>  };
>  
> +struct kexec_ops {
> +	bool always_use_normal_alloc;

So most of these are self-explanatory. But the bool is not that clear
to me. Could you include a documentation comment explaining
its purpose and its implications?

> +	struct page *(*kimage_alloc_pages)(gfp_t gfp_mask,
> +						unsigned int order,
> +						unsigned long limit);
> +	void (*kimage_free_pages)(struct page *page);
> +	unsigned long (*page_to_pfn)(struct page *page);
> +	struct page *(*pfn_to_page)(unsigned long pfn);
> +	unsigned long (*virt_to_phys)(volatile void *address);
> +	void *(*phys_to_virt)(unsigned long address);
> +	int (*machine_kexec_prepare)(struct kimage *image);
> +	int (*machine_kexec_load)(struct kimage *image);
> +	void (*machine_kexec_cleanup)(struct kimage *image);
> +	void (*machine_kexec_unload)(struct kimage *image);
> +	void (*machine_kexec_shutdown)(void);
> +	void (*machine_kexec)(struct kimage *image);
> +};
>  
> +extern struct kexec_ops kexec_ops;

Is this neccessary?

>  
>  /* kexec interface functions */
>  extern void machine_kexec(struct kimage *image);
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index 0668d58..98556f3 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -56,6 +56,47 @@ struct resource crashk_res = {
>  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
>  };
>  
> +static struct page *kimage_alloc_pages(gfp_t gfp_mask,
> +					unsigned int order,
> +					unsigned long limit);
> +static void kimage_free_pages(struct page *page);
> +
> +static unsigned long generic_page_to_pfn(struct page *page)
> +{
> +	return page_to_pfn(page);
> +}
> +
> +static struct page *generic_pfn_to_page(unsigned long pfn)
> +{
> +	return pfn_to_page(pfn);
> +}
> +
> +static unsigned long generic_virt_to_phys(volatile void *address)
> +{
> +	return virt_to_phys(address);
> +}
> +
> +static void *generic_phys_to_virt(unsigned long address)
> +{
> +	return phys_to_virt(address);
> +}
> +
> +struct kexec_ops kexec_ops = {
> +	.always_use_normal_alloc = false,
> +	.kimage_alloc_pages = kimage_alloc_pages,
> +	.kimage_free_pages = kimage_free_pages,
> +	.page_to_pfn = generic_page_to_pfn,
> +	.pfn_to_page = generic_pfn_to_page,
> +	.virt_to_phys = generic_virt_to_phys,
> +	.phys_to_virt = generic_phys_to_virt,
> +	.machine_kexec_prepare = machine_kexec_prepare,
> +	.machine_kexec_load = NULL,

Instead of NULL should they just point to some nop function?

> +	.machine_kexec_cleanup = machine_kexec_cleanup,
> +	.machine_kexec_unload = NULL,
> +	.machine_kexec_shutdown = machine_shutdown,
> +	.machine_kexec = machine_kexec
> +};
> +
>  int kexec_should_crash(struct task_struct *p)
>  {
>  	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
> @@ -355,7 +396,9 @@ static int kimage_is_destination_range(struct kimage *image,
>  	return 0;
>  }
>  
> -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
> +static struct page *kimage_alloc_pages(gfp_t gfp_mask,
> +					unsigned int order,
> +					unsigned long limit)
>  {
>  	struct page *pages;
>  
> @@ -392,7 +435,7 @@ static void kimage_free_page_list(struct list_head *list)
>  
>  		page = list_entry(pos, struct page, lru);
>  		list_del(&page->lru);
> -		kimage_free_pages(page);
> +		(*kexec_ops.kimage_free_pages)(page);
>  	}
>  }
>  
> @@ -425,10 +468,11 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
>  	do {
>  		unsigned long pfn, epfn, addr, eaddr;
>  
> -		pages = kimage_alloc_pages(GFP_KERNEL, order);
> +		pages = (*kexec_ops.kimage_alloc_pages)(GFP_KERNEL, order,
> +							KEXEC_CONTROL_MEMORY_LIMIT);
>  		if (!pages)
>  			break;
> -		pfn   = page_to_pfn(pages);
> +		pfn   = (*kexec_ops.page_to_pfn)(pages);
>  		epfn  = pfn + count;
>  		addr  = pfn << PAGE_SHIFT;
>  		eaddr = epfn << PAGE_SHIFT;
> @@ -515,7 +559,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>  		}
>  		/* If I don't overlap any segments I have found my hole! */
>  		if (i == image->nr_segments) {
> -			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
> +			pages = (*kexec_ops.pfn_to_page)(hole_start >> PAGE_SHIFT);
>  			break;
>  		}
>  	}
> @@ -532,12 +576,13 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
>  	struct page *pages = NULL;
>  
>  	switch (image->type) {
> +	case KEXEC_TYPE_CRASH:
> +		if (!kexec_ops.always_use_normal_alloc) {
> +			pages = kimage_alloc_crash_control_pages(image, order);
> +			break;
> +		}
>  	case KEXEC_TYPE_DEFAULT:
>  		pages = kimage_alloc_normal_control_pages(image, order);
> -		break;
> -	case KEXEC_TYPE_CRASH:
> -		pages = kimage_alloc_crash_control_pages(image, order);
> -		break;
>  	}
>  
>  	return pages;
> @@ -557,7 +602,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
>  			return -ENOMEM;
>  
>  		ind_page = page_address(page);
> -		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
> +		*image->entry = (*kexec_ops.virt_to_phys)(ind_page) | IND_INDIRECTION;
>  		image->entry = ind_page;
>  		image->last_entry = ind_page +
>  				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
> @@ -616,14 +661,14 @@ static void kimage_terminate(struct kimage *image)
>  #define for_each_kimage_entry(image, ptr, entry) \
>  	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
>  		ptr = (entry & IND_INDIRECTION)? \
> -			phys_to_virt((entry & PAGE_MASK)): ptr +1)
> +			(*kexec_ops.phys_to_virt)((entry & PAGE_MASK)): ptr +1)
>  
>  static void kimage_free_entry(kimage_entry_t entry)
>  {
>  	struct page *page;
>  
> -	page = pfn_to_page(entry >> PAGE_SHIFT);
> -	kimage_free_pages(page);
> +	page = (*kexec_ops.pfn_to_page)(entry >> PAGE_SHIFT);
> +	(*kexec_ops.kimage_free_pages)(page);
>  }
>  
>  static void kimage_free(struct kimage *image)
> @@ -653,7 +698,7 @@ static void kimage_free(struct kimage *image)
>  		kimage_free_entry(ind);
>  
>  	/* Handle any machine specific cleanup */
> -	machine_kexec_cleanup(image);
> +	(*kexec_ops.machine_kexec_cleanup)(image);
>  
>  	/* Free the kexec control pages... */
>  	kimage_free_page_list(&image->control_pages);
> @@ -709,7 +754,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
>  	 * have a match.
>  	 */
>  	list_for_each_entry(page, &image->dest_pages, lru) {
> -		addr = page_to_pfn(page) << PAGE_SHIFT;
> +		addr = (*kexec_ops.page_to_pfn)(page) << PAGE_SHIFT;
>  		if (addr == destination) {
>  			list_del(&page->lru);
>  			return page;
> @@ -720,16 +765,17 @@ static struct page *kimage_alloc_page(struct kimage *image,
>  		kimage_entry_t *old;
>  
>  		/* Allocate a page, if we run out of memory give up */
> -		page = kimage_alloc_pages(gfp_mask, 0);
> +		page = (*kexec_ops.kimage_alloc_pages)(gfp_mask, 0,
> +							KEXEC_SOURCE_MEMORY_LIMIT);
>  		if (!page)
>  			return NULL;
>  		/* If the page cannot be used file it away */
> -		if (page_to_pfn(page) >
> +		if ((*kexec_ops.page_to_pfn)(page) >
>  				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
>  			list_add(&page->lru, &image->unuseable_pages);
>  			continue;
>  		}
> -		addr = page_to_pfn(page) << PAGE_SHIFT;
> +		addr = (*kexec_ops.page_to_pfn)(page) << PAGE_SHIFT;
>  
>  		/* If it is the destination page we want use it */
>  		if (addr == destination)
> @@ -752,7 +798,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
>  			struct page *old_page;
>  
>  			old_addr = *old & PAGE_MASK;
> -			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
> +			old_page = (*kexec_ops.pfn_to_page)(old_addr >> PAGE_SHIFT);
>  			copy_highpage(page, old_page);
>  			*old = addr | (*old & ~PAGE_MASK);
>  
> @@ -762,7 +808,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
>  			 */
>  			if (!(gfp_mask & __GFP_HIGHMEM) &&
>  			    PageHighMem(old_page)) {
> -				kimage_free_pages(old_page);
> +				(*kexec_ops.kimage_free_pages)(old_page);
>  				continue;
>  			}
>  			addr = old_addr;
> @@ -808,7 +854,7 @@ static int kimage_load_normal_segment(struct kimage *image,
>  			result  = -ENOMEM;
>  			goto out;
>  		}
> -		result = kimage_add_page(image, page_to_pfn(page)
> +		result = kimage_add_page(image, (*kexec_ops.page_to_pfn)(page)
>  								<< PAGE_SHIFT);
>  		if (result < 0)
>  			goto out;
> @@ -862,7 +908,7 @@ static int kimage_load_crash_segment(struct kimage *image,
>  		char *ptr;
>  		size_t uchunk, mchunk;
>  
> -		page = pfn_to_page(maddr >> PAGE_SHIFT);
> +		page = (*kexec_ops.pfn_to_page)(maddr >> PAGE_SHIFT);
>  		if (!page) {
>  			result  = -ENOMEM;
>  			goto out;
> @@ -901,12 +947,13 @@ static int kimage_load_segment(struct kimage *image,
>  	int result = -ENOMEM;
>  
>  	switch (image->type) {
> +	case KEXEC_TYPE_CRASH:
> +		if (!kexec_ops.always_use_normal_alloc) {
> +			result = kimage_load_crash_segment(image, segment);
> +			break;
> +		}
>  	case KEXEC_TYPE_DEFAULT:
>  		result = kimage_load_normal_segment(image, segment);
> -		break;
> -	case KEXEC_TYPE_CRASH:
> -		result = kimage_load_crash_segment(image, segment);
> -		break;
>  	}
>  
>  	return result;
> @@ -994,6 +1041,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
>  			/* Free any current crash dump kernel before
>  			 * we corrupt it.
>  			 */
> +			if (kexec_ops.machine_kexec_unload)
> +				(*kexec_ops.machine_kexec_unload)(image);
>  			kimage_free(xchg(&kexec_crash_image, NULL));
>  			result = kimage_crash_alloc(&image, entry,
>  						     nr_segments, segments);
> @@ -1004,7 +1053,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
>  
>  		if (flags & KEXEC_PRESERVE_CONTEXT)
>  			image->preserve_context = 1;
> -		result = machine_kexec_prepare(image);
> +		result = (*kexec_ops.machine_kexec_prepare)(image);
>  		if (result)
>  			goto out;
>  
> @@ -1017,11 +1066,23 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
>  		if (flags & KEXEC_ON_CRASH)
>  			crash_unmap_reserved_pages();
>  	}
> +
> +	if (kexec_ops.machine_kexec_load) {
> +		result = (*kexec_ops.machine_kexec_load)(image);
> +
> +		if (result)
> +			goto out;
> +	}
> +
>  	/* Install the new kernel, and  Uninstall the old */
>  	image = xchg(dest_image, image);
>  
>  out:
>  	mutex_unlock(&kexec_mutex);
> +
> +	if (kexec_ops.machine_kexec_unload)
> +		(*kexec_ops.machine_kexec_unload)(image);
> +
>  	kimage_free(image);
>  
>  	return result;
> @@ -1095,7 +1156,7 @@ void crash_kexec(struct pt_regs *regs)
>  			crash_setup_regs(&fixed_regs, regs);
>  			crash_save_vmcoreinfo();
>  			machine_crash_shutdown(&fixed_regs);
> -			machine_kexec(kexec_crash_image);
> +			(*kexec_ops.machine_kexec)(kexec_crash_image);
>  		}
>  		mutex_unlock(&kexec_mutex);
>  	}
> @@ -1117,8 +1178,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
>  	unsigned long addr;
>  
>  	for (addr = begin; addr < end; addr += PAGE_SIZE) {
> -		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
> -		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
> +		ClearPageReserved((*kexec_ops.pfn_to_page)(addr >> PAGE_SHIFT));
> +		init_page_count((*kexec_ops.pfn_to_page)(addr >> PAGE_SHIFT));
>  		free_page((unsigned long)__va(addr));
>  		totalram_pages++;
>  	}
> @@ -1572,10 +1633,10 @@ int kernel_kexec(void)
>  	{
>  		kernel_restart_prepare(NULL);
>  		printk(KERN_EMERG "Starting new kernel\n");
> -		machine_shutdown();
> +		(*kexec_ops.machine_kexec_shutdown)();
>  	}
>  
> -	machine_kexec(kexec_image);
> +	(*kexec_ops.machine_kexec)(kexec_image);
>  
>  #ifdef CONFIG_KEXEC_JUMP
>  	if (kexec_image->preserve_context) {
> -- 
> 1.5.6.5

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump
  2012-09-27 18:06     ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Daniel Kiper
  2012-09-27 18:06       ` [PATCH 04/11] x86/xen: Introduce architecture dependent " Daniel Kiper
@ 2012-09-28 16:10       ` Konrad Rzeszutek Wilk
  2012-10-01 13:34         ` Daniel Kiper
  1 sibling, 1 reply; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-09-28 16:10 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Thu, Sep 27, 2012 at 08:06:30PM +0200, Daniel Kiper wrote:
> Introduce architecture independent constants and structures

Don't you mean 'dependent constants'?

> required by Xen kexec/kdump implementation.
> 
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> ---
>  include/xen/interface/xen.h |   33 +++++++++++++++++++++++++++++++++
>  1 files changed, 33 insertions(+), 0 deletions(-)
> 
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 0801468..ac19f9e 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -58,6 +58,7 @@
>  #define __HYPERVISOR_event_channel_op     32
>  #define __HYPERVISOR_physdev_op           33
>  #define __HYPERVISOR_hvm_op               34
> +#define __HYPERVISOR_kexec_op             37
>  #define __HYPERVISOR_tmem_op              38
>  
>  /* Architecture-specific hypercall definitions. */
> @@ -232,7 +233,39 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
>  #define VMASST_TYPE_pae_extended_cr3     3
>  #define MAX_VMASST_TYPE 3
>  
> +/*
> + * Commands to HYPERVISOR_kexec_op().
> + */
> +#define KEXEC_CMD_kexec			0
> +#define KEXEC_CMD_kexec_load		1
> +#define KEXEC_CMD_kexec_unload		2
> +#define KEXEC_CMD_kexec_get_range	3
> +
> +/*
> + * Memory ranges for kdump (utilized by HYPERVISOR_kexec_op()).
> + */
> +#define KEXEC_RANGE_MA_CRASH		0
> +#define KEXEC_RANGE_MA_XEN		1
> +#define KEXEC_RANGE_MA_CPU		2
> +#define KEXEC_RANGE_MA_XENHEAP		3
> +#define KEXEC_RANGE_MA_BOOT_PARAM	4
> +#define KEXEC_RANGE_MA_EFI_MEMMAP	5
> +#define KEXEC_RANGE_MA_VMCOREINFO	6
> +
>  #ifndef __ASSEMBLY__
> +struct xen_kexec_exec {
> +	int type;
> +};
> +
> +struct xen_kexec_range {
> +	int range;
> +	int nr;
> +	unsigned long size;
> +	unsigned long start;
> +};

Might want to include an little blurb saying what we expect
in case of running a 32-bit domain on a 64-bit hypervisor
where the start might be past 4GB?

> +
> +extern unsigned long xen_vmcoreinfo_maddr;
> +extern unsigned long xen_vmcoreinfo_max_size;
>  
>  typedef uint16_t domid_t;
>  
> -- 
> 1.5.6.5

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 05/11] x86/xen: Register resources required by kexec-tools
  2012-09-27 18:06         ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Daniel Kiper
  2012-09-27 18:06           ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Daniel Kiper
@ 2012-09-28 16:21           ` Konrad Rzeszutek Wilk
  2012-10-01  9:40             ` Jan Beulich
  2012-10-01 13:21             ` Daniel Kiper
  1 sibling, 2 replies; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-09-28 16:21 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Thu, Sep 27, 2012 at 08:06:32PM +0200, Daniel Kiper wrote:
> Register resources required by kexec-tools.
> 
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> ---
>  arch/x86/xen/kexec.c |  150 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 150 insertions(+), 0 deletions(-)
>  create mode 100644 arch/x86/xen/kexec.c
> 
> diff --git a/arch/x86/xen/kexec.c b/arch/x86/xen/kexec.c
> new file mode 100644
> index 0000000..eb0108b
> --- /dev/null
> +++ b/arch/x86/xen/kexec.c
> @@ -0,0 +1,150 @@
> +/*
> + * Copyright (c) 2011 Daniel Kiper
> + * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
> + *
> + * kexec/kdump implementation for Xen was written by Daniel Kiper.
> + * Initial work on it was sponsored by Google under Google Summer
> + * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
> + * was the mentor for this project.
> + *
> + * Some ideas are taken from:
> + *   - native kexec/kdump implementation,
> + *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
> + *   - PV-GRUB.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/errno.h>
> +#include <linux/init.h>
> +#include <linux/ioport.h>
> +#include <linux/kernel.h>
> +#include <linux/kexec.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +
> +#include <xen/interface/platform.h>
> +#include <xen/interface/xen.h>
> +#include <xen/xen.h>
> +
> +#include <asm/xen/hypercall.h>
> +
> +unsigned long xen_vmcoreinfo_maddr = 0;
> +unsigned long xen_vmcoreinfo_max_size = 0;
> +
> +static int __init xen_init_kexec_resources(void)
> +{
> +	int rc;
> +	static struct resource xen_hypervisor_res = {
> +		.name = "Hypervisor code and data",
> +		.flags = IORESOURCE_BUSY | IORESOURCE_MEM
> +	};
> +	struct resource *cpu_res;
> +	struct xen_kexec_range xkr;
> +	struct xen_platform_op cpuinfo_op;
> +	uint32_t cpus, i;
> +
> +	if (!xen_initial_domain())
> +		return 0;
> +
> +	if (strstr(boot_command_line, "crashkernel="))
> +		pr_info("kexec: Ignoring crashkernel option. "

pr_warn?

> +			"It should be passed to Xen hypervisor.\n");
> +
> +	/* Register Crash kernel resource. */
> +	xkr.range = KEXEC_RANGE_MA_CRASH;
> +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
> +
> +	if (rc) {
> +		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_CRASH)"
> +			": %i\n", __func__, rc);

Perhaps pr_warn?
> +		return rc;
> +	}
> +
> +	if (!xkr.size)
> +		return 0;
> +
> +	crashk_res.start = xkr.start;
> +	crashk_res.end = xkr.start + xkr.size - 1;
> +	insert_resource(&iomem_resource, &crashk_res);
> +
> +	/* Register Hypervisor code and data resource. */
> +	xkr.range = KEXEC_RANGE_MA_XEN;
> +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
> +
> +	if (rc) {
> +		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_XEN)"

pr_warn
> +			": %i\n", __func__, rc);
> +		return rc;
> +	}
> +
> +	xen_hypervisor_res.start = xkr.start;
> +	xen_hypervisor_res.end = xkr.start + xkr.size - 1;
> +	insert_resource(&iomem_resource, &xen_hypervisor_res);
> +
> +	/* Determine maximum number of physical CPUs. */
> +	cpuinfo_op.cmd = XENPF_get_cpuinfo;
> +	cpuinfo_op.u.pcpu_info.xen_cpuid = 0;
> +	rc = HYPERVISOR_dom0_op(&cpuinfo_op);
> +
> +	if (rc) {
> +		pr_info("kexec: %s: HYPERVISOR_dom0_op(): %i\n", __func__, rc);

pr_warn.
> +		return rc;
> +	}
> +
> +	cpus = cpuinfo_op.u.pcpu_info.max_present + 1;

Do we care about the hotplug CPUs?
> +
> +	/* Register CPUs Crash note resources. */
> +	cpu_res = kcalloc(cpus, sizeof(struct resource), GFP_KERNEL);
> +
> +	if (!cpu_res) {
> +		pr_info("kexec: %s: kcalloc(): %i\n", __func__, -ENOMEM);
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < cpus; ++i) {

Any specific reason for using '++i' instead of 'i++' ?

> +		xkr.range = KEXEC_RANGE_MA_CPU;
> +		xkr.nr = i;
> +		rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
> +
> +		if (rc) {
> +			pr_info("kexec: %s: cpu: %u: HYPERVISOR_kexec_op"
> +				"(KEXEC_RANGE_MA_XEN): %i\n", __func__, i, rc);
> +			continue;
> +		}
> +
> +		cpu_res->name = "Crash note";
> +		cpu_res->start = xkr.start;
> +		cpu_res->end = xkr.start + xkr.size - 1;
> +		cpu_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
> +		insert_resource(&iomem_resource, cpu_res++);
> +	}
> +
> +	/* Get vmcoreinfo address and maximum allowed size. */
> +	xkr.range = KEXEC_RANGE_MA_VMCOREINFO;
> +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
> +
> +	if (rc) {
> +		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_VMCOREINFO)"
> +			": %i\n", __func__, rc);
> +		return rc;
> +	}
> +
> +	xen_vmcoreinfo_maddr = xkr.start;
> +	xen_vmcoreinfo_max_size = xkr.size;
> +
> +	return 0;
> +}
> +
> +core_initcall(xen_init_kexec_resources);
> -- 
> 1.5.6.5

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-09-27 18:06           ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Daniel Kiper
  2012-09-27 18:06             ` [PATCH 07/11] x86/xen: Add x86_64 " Daniel Kiper
  2012-09-28  8:11             ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Jan Beulich
@ 2012-09-28 16:39             ` Konrad Rzeszutek Wilk
  2012-10-01 13:16               ` Daniel Kiper
  2 siblings, 1 reply; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-09-28 16:39 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Thu, Sep 27, 2012 at 08:06:33PM +0200, Daniel Kiper wrote:
> Add i386 kexec/kdump implementation.
> 
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> ---
>  arch/x86/xen/machine_kexec_32.c   |  245 ++++++++++++++++++++++++++++
>  arch/x86/xen/relocate_kernel_32.S |  323 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 568 insertions(+), 0 deletions(-)
>  create mode 100644 arch/x86/xen/machine_kexec_32.c
>  create mode 100644 arch/x86/xen/relocate_kernel_32.S
> 
> diff --git a/arch/x86/xen/machine_kexec_32.c b/arch/x86/xen/machine_kexec_32.c
> new file mode 100644
> index 0000000..6b5141e
> --- /dev/null
> +++ b/arch/x86/xen/machine_kexec_32.c
> @@ -0,0 +1,245 @@
> +/*
> + * Copyright (c) 2011 Daniel Kiper
> + * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
> + *
> + * kexec/kdump implementation for Xen was written by Daniel Kiper.
> + * Initial work on it was sponsored by Google under Google Summer
> + * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
> + * was the mentor for this project.
> + *
> + * Some ideas are taken from:
> + *   - native kexec/kdump implementation,
> + *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
> + *   - PV-GRUB.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/errno.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/kexec.h>
> +#include <linux/mm.h>
> +#include <linux/string.h>
> +
> +#include <xen/xen.h>
> +#include <xen/xen-ops.h>
> +
> +#include <asm/xen/hypercall.h>
> +#include <asm/xen/kexec.h>
> +#include <asm/xen/page.h>
> +
> +#define __ma(vaddr)	(virt_to_machine(vaddr).maddr)
> +
> +static struct page *kimage_alloc_pages(gfp_t gfp_mask,
> +					unsigned int order,
> +					unsigned long limit)
> +{
> +	struct page *pages;
> +	unsigned int address_bits, i;
> +
> +	pages = alloc_pages(gfp_mask, order);
> +
> +	if (!pages)
> +		return NULL;
> +
> +	address_bits = (limit == ULONG_MAX) ? BITS_PER_LONG : ilog2(limit);
> +
> +	/* Relocate set of pages below given limit. */
> +	if (xen_create_contiguous_region((unsigned long)page_address(pages),
> +							order, address_bits)) {
> +		__free_pages(pages, order);
> +		return NULL;
> +	}
> +
> +	pages->mapping = NULL;

It shouldn't matter (as you did the alloc_page) but could you
add:
	BUG_ON(PagePrivate(pages))
in case somebody did do something weird beforehand.

> +	set_page_private(pages, order);
> +
> +	for (i = 0; i < (1 << order); ++i)
> +		SetPageReserved(pages + i);
> +
> +	return pages;
> +}
> +
> +static void kimage_free_pages(struct page *page)
> +{
> +	unsigned int i, order;
> +
> +	order = page_private(page);
> +
> +	for (i = 0; i < (1 << order); ++i)
> +		ClearPageReserved(page + i);
> +
> +	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
> +	__free_pages(page, order);
> +}
> +
> +static unsigned long xen_page_to_mfn(struct page *page)
> +{
> +	return pfn_to_mfn(page_to_pfn(page));
> +}
> +
> +static struct page *xen_mfn_to_page(unsigned long mfn)
> +{
> +	return pfn_to_page(mfn_to_pfn(mfn));
> +}
> +
> +static unsigned long xen_virt_to_machine(volatile void *address)
> +{
> +	return virt_to_machine(address).maddr;
> +}
> +
> +static void *xen_machine_to_virt(unsigned long address)
> +{
> +	return phys_to_virt(machine_to_phys(XMADDR(address)).paddr);
> +}
> +
> +static void free_transition_pgtable(struct kimage *image)
> +{
> +	free_page((unsigned long)image->arch.pgd);
> +	free_page((unsigned long)image->arch.pmd0);
> +	free_page((unsigned long)image->arch.pmd1);
> +	free_page((unsigned long)image->arch.pte0);
> +	free_page((unsigned long)image->arch.pte1);
> +}
> +
> +static int alloc_transition_pgtable(struct kimage *image)
> +{
> +	image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
> +
> +	if (!image->arch.pgd)
> +		goto err;
> +
> +	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
> +
> +	if (!image->arch.pmd0)
> +		goto err;
> +
> +	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
> +
> +	if (!image->arch.pmd1)
> +		goto err;
> +
> +	image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
> +
> +	if (!image->arch.pte0)
> +		goto err;
> +
> +	image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
> +
> +	if (!image->arch.pte1)
> +		goto err;
> +
> +	return 0;
> +
> +err:
> +	free_transition_pgtable(image);
> +
> +	return -ENOMEM;
> +}
> +
> +static int machine_xen_kexec_prepare(struct kimage *image)
> +{
> +#ifdef CONFIG_KEXEC_JUMP
> +	if (image->preserve_context) {
> +		pr_info_once("kexec: Context preservation is not "
> +				"supported in Xen domains.\n");
> +		return -ENOSYS;
> +	}
> +#endif
> +
> +	return alloc_transition_pgtable(image);
> +}
> +
> +static int machine_xen_kexec_load(struct kimage *image)
> +{
> +	void *control_page;
> +	struct xen_kexec_load xkl = {};
> +
> +	if (!image)
> +		return 0;

Not -EINVAL?

> +
> +	control_page = page_address(image->control_code_page);
> +	memcpy(control_page, xen_relocate_kernel, xen_kexec_control_code_size);
> +
> +	xkl.type = image->type;
> +	xkl.image.page_list[XK_MA_CONTROL_PAGE] = __ma(control_page);
> +	xkl.image.page_list[XK_MA_TABLE_PAGE] = 0; /* Unused. */
> +	xkl.image.page_list[XK_MA_PGD_PAGE] = __ma(image->arch.pgd);
> +	xkl.image.page_list[XK_MA_PUD0_PAGE] = 0; /* Unused. */
> +	xkl.image.page_list[XK_MA_PUD1_PAGE] = 0; /* Unused. */
> +	xkl.image.page_list[XK_MA_PMD0_PAGE] = __ma(image->arch.pmd0);
> +	xkl.image.page_list[XK_MA_PMD1_PAGE] = __ma(image->arch.pmd1);
> +	xkl.image.page_list[XK_MA_PTE0_PAGE] = __ma(image->arch.pte0);
> +	xkl.image.page_list[XK_MA_PTE1_PAGE] = __ma(image->arch.pte1);
> +	xkl.image.indirection_page = image->head;
> +	xkl.image.start_address = image->start;
> +
> +	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
> +}
> +
> +static void machine_xen_kexec_cleanup(struct kimage *image)
> +{
> +	free_transition_pgtable(image);
> +}
> +
> +static void machine_xen_kexec_unload(struct kimage *image)
> +{
> +	int rc;
> +	struct xen_kexec_load xkl = {};
> +
> +	if (!image)
> +		return;
> +
> +	xkl.type = image->type;
> +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
> +
> +	WARN(rc, "kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
> +}
> +
> +static void machine_xen_kexec_shutdown(void)
> +{
> +}
> +
> +static void machine_xen_kexec(struct kimage *image)
> +{
> +	int rc;
> +	struct xen_kexec_exec xke = {};
> +
> +	xke.type = image->type;
> +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
> +
> +	pr_emerg("kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
> +	BUG();
> +}
> +
> +void __init xen_init_kexec_ops(void)
> +{
> +	if (!xen_initial_domain())
> +		return;
> +
> +	kexec_ops.always_use_normal_alloc = true;
> +	kexec_ops.kimage_alloc_pages = kimage_alloc_pages;
> +	kexec_ops.kimage_free_pages = kimage_free_pages;
> +	kexec_ops.page_to_pfn = xen_page_to_mfn;
> +	kexec_ops.pfn_to_page = xen_mfn_to_page;
> +	kexec_ops.virt_to_phys = xen_virt_to_machine;
> +	kexec_ops.phys_to_virt = xen_machine_to_virt;
> +	kexec_ops.machine_kexec_prepare = machine_xen_kexec_prepare;
> +	kexec_ops.machine_kexec_load = machine_xen_kexec_load;
> +	kexec_ops.machine_kexec_cleanup = machine_xen_kexec_cleanup;
> +	kexec_ops.machine_kexec_unload = machine_xen_kexec_unload;
> +	kexec_ops.machine_kexec_shutdown = machine_xen_kexec_shutdown;
> +	kexec_ops.machine_kexec = machine_xen_kexec;
> +}
> diff --git a/arch/x86/xen/relocate_kernel_32.S b/arch/x86/xen/relocate_kernel_32.S
> new file mode 100644
> index 0000000..0e81830
> --- /dev/null
> +++ b/arch/x86/xen/relocate_kernel_32.S
> @@ -0,0 +1,323 @@
> +/*
> + * Copyright (c) 2002-2005 Eric Biederman <ebiederm@xmission.com>
> + * Copyright (c) 2011 Daniel Kiper
> + * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
> + *
> + * kexec/kdump implementation for Xen was written by Daniel Kiper.
> + * Initial work on it was sponsored by Google under Google Summer
> + * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
> + * was the mentor for this project.
> + *
> + * Some ideas are taken from:
> + *   - native kexec/kdump implementation,
> + *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
> + *   - PV-GRUB.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either veesion 2 of the License, or
> + * (at your option) any later veesion.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <asm/cache.h>
> +#include <asm/page_types.h>
> +#include <asm/pgtable_types.h>
> +#include <asm/processor-flags.h>
> +
> +#include <asm/xen/kexec.h>
> +
> +#define ARG_INDIRECTION_PAGE	0x4
> +#define ARG_PAGE_LIST		0x8
> +#define ARG_START_ADDRESS	0xc
> +
> +#define PTR(x)	(x << 2)
> +
> +	.text
> +	.align	PAGE_SIZE
> +	.globl	xen_kexec_control_code_size, xen_relocate_kernel
> +
> +xen_relocate_kernel:
> +	/*
> +	 * Must be relocatable PIC code callable as a C function.
> +	 *
> +	 * This function is called by Xen but here hypervisor is dead.
> +	 * We are playing on bare metal.
> +	 *
> +	 * Every machine address passed to this function through
> +	 * page_list (e.g. XK_MA_CONTROL_PAGE) is established
> +	 * by dom0 during kexec load phase.
> +	 *
> +	 * Every virtual address passed to this function through page_list
> +	 * (e.g. XK_VA_CONTROL_PAGE) is established by hypervisor during
> +	 * HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load) hypercall.
> +	 *
> +	 * 0x4(%esp) - indirection_page,
> +	 * 0x8(%esp) - page_list,
> +	 * 0xc(%esp) - start_address,
> +	 * 0x10(%esp) - cpu_has_pae (ignored),
> +	 * 0x14(%esp) - preserve_context (ignored).
> +	 */
> +
> +	/* Zero out flags, and disable interrupts. */
> +	pushl	$0
> +	popfl
> +
> +	/* Get page_list address. */
> +	movl	ARG_PAGE_LIST(%esp), %esi
> +
> +	/*
> +	 * Map the control page at its virtual address
> +	 * in transition page table.
> +	 */
> +	movl	PTR(XK_VA_CONTROL_PAGE)(%esi), %eax
> +
> +	/* Get PGD address and PGD entry index. */
> +	movl	PTR(XK_VA_PGD_PAGE)(%esi), %ebx
> +	movl	%eax, %ecx
> +	shrl	$PGDIR_SHIFT, %ecx
> +	andl	$(PTRS_PER_PGD - 1), %ecx
> +
> +	/* Fill PGD entry with PMD0 reference. */
> +	movl	PTR(XK_MA_PMD0_PAGE)(%esi), %edx
> +	orl	$_PAGE_PRESENT, %edx
> +	movl	%edx, (%ebx, %ecx, 8)
> +
> +	/* Get PMD0 address and PMD0 entry index. */
> +	movl	PTR(XK_VA_PMD0_PAGE)(%esi), %ebx
> +	movl	%eax, %ecx
> +	shrl	$PMD_SHIFT, %ecx
> +	andl	$(PTRS_PER_PMD - 1), %ecx
> +
> +	/* Fill PMD0 entry with PTE0 reference. */
> +	movl	PTR(XK_MA_PTE0_PAGE)(%esi), %edx
> +	orl	$_KERNPG_TABLE, %edx
> +	movl	%edx, (%ebx, %ecx, 8)
> +
> +	/* Get PTE0 address and PTE0 entry index. */
> +	movl	PTR(XK_VA_PTE0_PAGE)(%esi), %ebx
> +	movl	%eax, %ecx
> +	shrl	$PAGE_SHIFT, %ecx
> +	andl	$(PTRS_PER_PTE - 1), %ecx
> +
> +	/* Fill PTE0 entry with control page reference. */
> +	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %edx
> +	orl	$__PAGE_KERNEL_EXEC, %edx
> +	movl	%edx, (%ebx, %ecx, 8)
> +
> +	/*
> +	 * Identity map the control page at its machine address
> +	 * in transition page table.
> +	 */
> +	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %eax
> +
> +	/* Get PGD address and PGD entry index. */
> +	movl	PTR(XK_VA_PGD_PAGE)(%esi), %ebx
> +	movl	%eax, %ecx
> +	shrl	$PGDIR_SHIFT, %ecx
> +	andl	$(PTRS_PER_PGD - 1), %ecx
> +
> +	/* Fill PGD entry with PMD1 reference. */
> +	movl	PTR(XK_MA_PMD1_PAGE)(%esi), %edx
> +	orl	$_PAGE_PRESENT, %edx
> +	movl	%edx, (%ebx, %ecx, 8)
> +
> +	/* Get PMD1 address and PMD1 entry index. */
> +	movl	PTR(XK_VA_PMD1_PAGE)(%esi), %ebx
> +	movl	%eax, %ecx
> +	shrl	$PMD_SHIFT, %ecx
> +	andl	$(PTRS_PER_PMD - 1), %ecx
> +
> +	/* Fill PMD1 entry with PTE1 reference. */
> +	movl	PTR(XK_MA_PTE1_PAGE)(%esi), %edx
> +	orl	$_KERNPG_TABLE, %edx
> +	movl	%edx, (%ebx, %ecx, 8)
> +
> +	/* Get PTE1 address and PTE1 entry index. */
> +	movl	PTR(XK_VA_PTE1_PAGE)(%esi), %ebx
> +	movl	%eax, %ecx
> +	shrl	$PAGE_SHIFT, %ecx
> +	andl	$(PTRS_PER_PTE - 1), %ecx
> +
> +	/* Fill PTE1 entry with control page reference. */
> +	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %edx
> +	orl	$__PAGE_KERNEL_EXEC, %edx
> +	movl	%edx, (%ebx, %ecx, 8)
> +
> +	/*
> +	 * Get machine address of control page now.
> +	 * This is impossible after page table switch.
> +	 */
> +	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %ebx
> +
> +	/* Get machine address of transition page table now too. */
> +	movl	PTR(XK_MA_PGD_PAGE)(%esi), %ecx
> +
> +	/* Get start_address too. */
> +	movl	ARG_START_ADDRESS(%esp), %edx
> +
> +	/* Get indirection_page address too. */
> +	movl	ARG_INDIRECTION_PAGE(%esp), %edi
> +
> +	/* Switch to transition page table. */
> +	movl	%ecx, %cr3
> +
> +	/* Load IDT. */
> +	lidtl	(idt_48 - xen_relocate_kernel)(%ebx)
> +
> +	/* Load GDT. */
> +	leal	(gdt - xen_relocate_kernel)(%ebx), %eax
> +	movl	%eax, (gdt_48 - xen_relocate_kernel + 2)(%ebx)
> +	lgdtl	(gdt_48 - xen_relocate_kernel)(%ebx)
> +
> +	/* Load data segment registers. */
> +	movl	$(gdt_ds - gdt), %eax
> +	movl	%eax, %ds
> +	movl	%eax, %es
> +	movl	%eax, %fs
> +	movl	%eax, %gs
> +	movl	%eax, %ss
> +
> +	/* Setup a new stack at the end of machine address of control page. */
> +	leal	PAGE_SIZE(%ebx), %esp
> +
> +	/* Store start_address on the stack. */
> +	pushl   %edx
> +
> +	/* Jump to identity mapped page. */
> +	pushl	$0
> +	pushl	$(gdt_cs - gdt)
> +	addl	$(identity_mapped - xen_relocate_kernel), %ebx
> +	pushl	%ebx
> +	iretl
> +
> +identity_mapped:
> +	/*
> +	 * Set %cr0 to a known state:
> +	 *   - disable alignment check,
> +	 *   - disable floating point emulation,
> +	 *   - disable paging,
> +	 *   - no task switch,
> +	 *   - disable write protect,
> +	 *   - enable protected mode.
> +	 */
> +	movl	%cr0, %eax
> +	andl	$~(X86_CR0_AM | X86_CR0_EM | X86_CR0_PG | X86_CR0_TS | X86_CR0_WP), %eax
> +	orl	$(X86_CR0_PE), %eax
> +	movl	%eax, %cr0
> +
> +	/* Set %cr4 to a known state. */
> +	xorl	%eax, %eax
> +	movl	%eax, %cr4
> +
> +	jmp	1f
> +
> +1:
> +	/* Flush the TLB (needed?). */
> +	movl	%eax, %cr3
> +
> +	/* Do the copies. */
> +	movl	%edi, %ecx	/* Put the indirection_page in %ecx. */
> +	xorl	%edi, %edi
> +	xorl	%esi, %esi
> +	jmp	1f
> +
> +0:
> +	/*
> +	 * Top, read another doubleword from the indirection page.
> +	 * Indirection page is an array which contains source
> +	 * and destination address pairs. If all pairs could
> +	 * not fit in one page then at the end of given
> +	 * indirection page is pointer to next one.
> +	 * Copy is stopped when done indicator
> +	 * is found in indirection page.
> +	 */
> +	movl	(%ebx), %ecx
> +	addl	$4, %ebx
> +
> +1:
> +	testl	$0x1, %ecx	/* Is it a destination page? */
> +	jz	2f
> +
> +	movl	%ecx, %edi
> +	andl	$PAGE_MASK, %edi
> +	jmp	0b
> +
> +2:
> +	testl	$0x2, %ecx	/* Is it an indirection page? */
> +	jz	2f
> +
> +	movl	%ecx, %ebx
> +	andl	$PAGE_MASK, %ebx
> +	jmp	0b
> +
> +2:
> +	testl	$0x4, %ecx	/* Is it the done indicator? */
> +	jz	2f
> +	jmp	3f
> +
> +2:
> +	testl	$0x8, %ecx	/* Is it the source indicator? */
> +	jz	0b		/* Ignore it otherwise. */
> +
> +	movl	%ecx, %esi
> +	andl	$PAGE_MASK, %esi
> +	movl	$1024, %ecx
> +
> +	/* Copy page. */
> +	rep	movsl
> +	jmp	0b
> +
> +3:
> +	/*
> +	 * To be certain of avoiding problems with self-modifying code
> +	 * I need to execute a serializing instruction here.
> +	 * So I flush the TLB by reloading %cr3 here, it's handy,
> +	 * and not processor dependent.
> +	 */
> +	xorl	%eax, %eax
> +	movl	%eax, %cr3
> +
> +	/*
> +	 * Set all of the registers to known values.
> +	 * Leave %esp alone.
> +	 */
> +	xorl	%ebx, %ebx
> +	xorl    %ecx, %ecx
> +	xorl    %edx, %edx
> +	xorl    %esi, %esi
> +	xorl    %edi, %edi
> +	xorl    %ebp, %ebp
> +
> +	/* Jump to start_address. */
> +	retl
> +
> +	.align	L1_CACHE_BYTES
> +
> +gdt:
> +	.quad	0x0000000000000000	/* NULL descriptor. */
> +
> +gdt_cs:
> +	.quad	0x00cf9a000000ffff	/* 4 GiB code segment at 0x00000000. */
> +
> +gdt_ds:
> +	.quad	0x00cf92000000ffff	/* 4 GiB data segment at 0x00000000. */
> +gdt_end:
> +
> +gdt_48:
> +	.word	gdt_end - gdt - 1	/* GDT limit. */
> +	.long	0			/* GDT base - filled in by code above. */
> +
> +idt_48:
> +	.word	0			/* IDT limit. */
> +	.long	0			/* IDT base. */
> +
> +xen_kexec_control_code_size:
> +	.long	. - xen_relocate_kernel
> -- 
> 1.5.6.5

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 05/11] x86/xen: Register resources required by kexec-tools
  2012-09-28 16:21           ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Konrad Rzeszutek Wilk
@ 2012-10-01  9:40             ` Jan Beulich
  2012-10-01 13:28               ` Daniel Kiper
  2012-10-01 13:21             ` Daniel Kiper
  1 sibling, 1 reply; 32+ messages in thread
From: Jan Beulich @ 2012-10-01  9:40 UTC (permalink / raw)
  To: Daniel Kiper, Konrad Rzeszutek Wilk
  Cc: andrew.cooper3, xen-devel, linux-kernel

>>> On 28.09.12 at 18:21, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Thu, Sep 27, 2012 at 08:06:32PM +0200, Daniel Kiper wrote:
>> +	for (i = 0; i < cpus; ++i) {
> 
> Any specific reason for using '++i' instead of 'i++' ?

For people occasionally also writing C++ code this is the
canonical form.

Jan


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 01/11] kexec: introduce kexec_ops struct
  2012-09-28  7:49   ` [PATCH 01/11] kexec: introduce kexec_ops struct Jan Beulich
@ 2012-10-01 11:36     ` Daniel Kiper
  2012-10-05 13:27       ` [Xen-devel] " Ian Campbell
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 11:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

On Fri, Sep 28, 2012 at 08:49:16AM +0100, Jan Beulich wrote:
> >>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> > Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
> > not use default functions or require some changes in behavior of kexec/kdump
> > generic code. To cope with that problem kexec_ops struct was introduced.
> > It allows a developer to replace all or some functions and control some
> > functionality of kexec/kdump generic code.
>
> I'm not convinced that doing this at the architecture independent
> layer is really necessary/desirable. Nevertheless, if that's the right
> place, then everything else looks good to me, except for a
> cosmetic thing:

I do not like this patch, too. However, this is the simplest
solution. If you do not do that in that way then you must
duplicate most of kernel/kexec.c functionality in architecture
depndent files.

> > @@ -392,7 +435,7 @@ static void kimage_free_page_list(struct list_head *list)
> >
> >  		page = list_entry(pos, struct page, lru);
> >  		list_del(&page->lru);
> > -		kimage_free_pages(page);
> > +		(*kexec_ops.kimage_free_pages)(page);
>
> These constructs are generally better readable without the
> explicit yet redundant indirection:
>
> 		kexec_ops.kimage_free_pages(page);

I have done that in that way because during my work on memory hotplug
Andrew Morton aligned my patches to that syntax. However,
I do not insist on staying with it.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-09-28  8:11             ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Jan Beulich
@ 2012-10-01 12:52               ` Daniel Kiper
  2012-10-01 13:55                 ` Jan Beulich
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 12:52 UTC (permalink / raw)
  To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

On Fri, Sep 28, 2012 at 09:11:47AM +0100, Jan Beulich wrote:
> >>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> > Add i386 kexec/kdump implementation.
>
> So this as well as the subsequent patch introduces quite a bit of
> duplicate code. The old 2.6.18 kernel had an initial pair of cleanup
> patches (attached in their forward ported form for 3.6-rc6) that
> would allow reducing the amount of duplication, particularly by
> eliminating the need to clone relocate_kernel_??.S altogether.

Thanks. Please look below for more details.

> Additionally, in the PAE case (which is the only relevant one for
> a 32-bit Xen kernel) I'm missing the address restriction
> enforcement for the PGD, without which the __ma() conversion
> result may not fit into the field it gets stored into.

Right.

> Finally, as noticed in an earlier patch already, you appear to
> re-introduce stuff long dropped from the kernel - the forward
> ported kernels get away with just setting PA_CONTROL_PAGE,
> PA_PGD, and PA_SWAP_PAGE in the page list. Since the number
> and purpose of the pages is established entirely by the guest
> kernel, all you need to obey is that the hypervisor expects
> alternating PA_/VA_ pairs (where the VA_ ones can be left
> unpopulated). Perhaps taking a look at a recent SLES kernel
> would help...

I have got ftp://ftp.suse.com/pub/projects/kernel/kotd/SLE11-SP2/src/kernel-source-3.0.43-6.1.src.rpm.
Does kexec/kdump work in your environment? In my it does not.
At least there is wrong assumption that
vaddr = (unsigned long)relocate_kernel
gets virtual address of relocate_kernel in Xen
(I have tested only x86_64 implementation but
as I saw i386 has similar problem). In real it is
fix mapped in hypervisor which is completely
different than address calculated in dom0 kernel.
Virtual address of control page (and others) is
only known by hypervisor kexec/kdump functions.
It means that transition page table could be
established by relocate_kernel code only.
If you would like to do optimistation as you
mentioned above you must reintroduce code
for page table establishment into generic
relocate_kernel_??.S. However, another
problem arises. New generic code utilizes
additional arguments such as swap page
(and potentially could use others in the future).
As I saw it is not possible to pass extra addresses
through page_list[] in struct xen_kexec_image
because its has insufficient size (I mean
x86_64 because i386 is a bit different story).
That is why relocate kernel code for Xen
should stay (sadly) in separate files.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
  2012-09-28  7:56     ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Jan Beulich
@ 2012-10-01 13:01       ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 13:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

On Fri, Sep 28, 2012 at 08:56:50AM +0100, Jan Beulich wrote:
> >>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> > Some implementations (e.g. Xen PVOPS) could not use part of identity page
> > table to construct transition page table. It means that they require separate
> > PUDs, PMDs and PTEs for virtual and physical (identity) mapping. To satisfy that
> > requirement add extra pointer to PGD, PUD, PMD and PTE and align existing
> > code.
>
> I'm puzzled by this - why would you need to reintroduce what had
> been dropped a long time ago, when the forward ported kernels
> don't need it? Xen itself doesn't need the extra entries, their
> presence is purely a requirement of the specific kernel
> implementation afaict.

Hmmm... I will try to optimize that a bit.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-09-28 16:39             ` Konrad Rzeszutek Wilk
@ 2012-10-01 13:16               ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 13:16 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Fri, Sep 28, 2012 at 12:39:44PM -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Sep 27, 2012 at 08:06:33PM +0200, Daniel Kiper wrote:
> > Add i386 kexec/kdump implementation.
> >
> > Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> > ---
> >  arch/x86/xen/machine_kexec_32.c   |  245 ++++++++++++++++++++++++++++
> >  arch/x86/xen/relocate_kernel_32.S |  323 +++++++++++++++++++++++++++++++++++++
> >  2 files changed, 568 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/x86/xen/machine_kexec_32.c
> >  create mode 100644 arch/x86/xen/relocate_kernel_32.S
> >
> > diff --git a/arch/x86/xen/machine_kexec_32.c b/arch/x86/xen/machine_kexec_32.c
> > new file mode 100644
> > index 0000000..6b5141e
> > --- /dev/null
> > +++ b/arch/x86/xen/machine_kexec_32.c
> > @@ -0,0 +1,245 @@
> > +/*
> > + * Copyright (c) 2011 Daniel Kiper
> > + * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
> > + *
> > + * kexec/kdump implementation for Xen was written by Daniel Kiper.
> > + * Initial work on it was sponsored by Google under Google Summer
> > + * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
> > + * was the mentor for this project.
> > + *
> > + * Some ideas are taken from:
> > + *   - native kexec/kdump implementation,
> > + *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
> > + *   - PV-GRUB.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include <linux/errno.h>
> > +#include <linux/init.h>
> > +#include <linux/kernel.h>
> > +#include <linux/kexec.h>
> > +#include <linux/mm.h>
> > +#include <linux/string.h>
> > +
> > +#include <xen/xen.h>
> > +#include <xen/xen-ops.h>
> > +
> > +#include <asm/xen/hypercall.h>
> > +#include <asm/xen/kexec.h>
> > +#include <asm/xen/page.h>
> > +
> > +#define __ma(vaddr)	(virt_to_machine(vaddr).maddr)
> > +
> > +static struct page *kimage_alloc_pages(gfp_t gfp_mask,
> > +					unsigned int order,
> > +					unsigned long limit)
> > +{
> > +	struct page *pages;
> > +	unsigned int address_bits, i;
> > +
> > +	pages = alloc_pages(gfp_mask, order);
> > +
> > +	if (!pages)
> > +		return NULL;
> > +
> > +	address_bits = (limit == ULONG_MAX) ? BITS_PER_LONG : ilog2(limit);
> > +
> > +	/* Relocate set of pages below given limit. */
> > +	if (xen_create_contiguous_region((unsigned long)page_address(pages),
> > +							order, address_bits)) {
> > +		__free_pages(pages, order);
> > +		return NULL;
> > +	}
> > +
> > +	pages->mapping = NULL;
>
> It shouldn't matter (as you did the alloc_page) but could you
> add:
> 	BUG_ON(PagePrivate(pages))
> in case somebody did do something weird beforehand.

OK.

> > +	set_page_private(pages, order);
> > +
> > +	for (i = 0; i < (1 << order); ++i)
> > +		SetPageReserved(pages + i);
> > +
> > +	return pages;
> > +}
> > +
> > +static void kimage_free_pages(struct page *page)
> > +{
> > +	unsigned int i, order;
> > +
> > +	order = page_private(page);
> > +
> > +	for (i = 0; i < (1 << order); ++i)
> > +		ClearPageReserved(page + i);
> > +
> > +	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
> > +	__free_pages(page, order);
> > +}
> > +
> > +static unsigned long xen_page_to_mfn(struct page *page)
> > +{
> > +	return pfn_to_mfn(page_to_pfn(page));
> > +}
> > +
> > +static struct page *xen_mfn_to_page(unsigned long mfn)
> > +{
> > +	return pfn_to_page(mfn_to_pfn(mfn));
> > +}
> > +
> > +static unsigned long xen_virt_to_machine(volatile void *address)
> > +{
> > +	return virt_to_machine(address).maddr;
> > +}
> > +
> > +static void *xen_machine_to_virt(unsigned long address)
> > +{
> > +	return phys_to_virt(machine_to_phys(XMADDR(address)).paddr);
> > +}
> > +
> > +static void free_transition_pgtable(struct kimage *image)
> > +{
> > +	free_page((unsigned long)image->arch.pgd);
> > +	free_page((unsigned long)image->arch.pmd0);
> > +	free_page((unsigned long)image->arch.pmd1);
> > +	free_page((unsigned long)image->arch.pte0);
> > +	free_page((unsigned long)image->arch.pte1);
> > +}
> > +
> > +static int alloc_transition_pgtable(struct kimage *image)
> > +{
> > +	image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
> > +
> > +	if (!image->arch.pgd)
> > +		goto err;
> > +
> > +	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
> > +
> > +	if (!image->arch.pmd0)
> > +		goto err;
> > +
> > +	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
> > +
> > +	if (!image->arch.pmd1)
> > +		goto err;
> > +
> > +	image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
> > +
> > +	if (!image->arch.pte0)
> > +		goto err;
> > +
> > +	image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
> > +
> > +	if (!image->arch.pte1)
> > +		goto err;
> > +
> > +	return 0;
> > +
> > +err:
> > +	free_transition_pgtable(image);
> > +
> > +	return -ENOMEM;
> > +}
> > +
> > +static int machine_xen_kexec_prepare(struct kimage *image)
> > +{
> > +#ifdef CONFIG_KEXEC_JUMP
> > +	if (image->preserve_context) {
> > +		pr_info_once("kexec: Context preservation is not "
> > +				"supported in Xen domains.\n");
> > +		return -ENOSYS;
> > +	}
> > +#endif
> > +
> > +	return alloc_transition_pgtable(image);
> > +}
> > +
> > +static int machine_xen_kexec_load(struct kimage *image)
> > +{
> > +	void *control_page;
> > +	struct xen_kexec_load xkl = {};
> > +
> > +	if (!image)
> > +		return 0;
>
> Not -EINVAL?

No, if image == NULL then it means that image is unloaded from memory
and there is nothing to do by machine_xen_kexec_load().

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 05/11] x86/xen: Register resources required by kexec-tools
  2012-09-28 16:21           ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Konrad Rzeszutek Wilk
  2012-10-01  9:40             ` Jan Beulich
@ 2012-10-01 13:21             ` Daniel Kiper
  1 sibling, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 13:21 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Fri, Sep 28, 2012 at 12:21:35PM -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Sep 27, 2012 at 08:06:32PM +0200, Daniel Kiper wrote:
> > Register resources required by kexec-tools.
> >
> > Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> > ---
> >  arch/x86/xen/kexec.c |  150 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 files changed, 150 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/x86/xen/kexec.c
> >
> > diff --git a/arch/x86/xen/kexec.c b/arch/x86/xen/kexec.c
> > new file mode 100644
> > index 0000000..eb0108b
> > --- /dev/null
> > +++ b/arch/x86/xen/kexec.c
> > @@ -0,0 +1,150 @@
> > +/*
> > + * Copyright (c) 2011 Daniel Kiper
> > + * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
> > + *
> > + * kexec/kdump implementation for Xen was written by Daniel Kiper.
> > + * Initial work on it was sponsored by Google under Google Summer
> > + * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
> > + * was the mentor for this project.
> > + *
> > + * Some ideas are taken from:
> > + *   - native kexec/kdump implementation,
> > + *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
> > + *   - PV-GRUB.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include <linux/errno.h>
> > +#include <linux/init.h>
> > +#include <linux/ioport.h>
> > +#include <linux/kernel.h>
> > +#include <linux/kexec.h>
> > +#include <linux/slab.h>
> > +#include <linux/string.h>
> > +
> > +#include <xen/interface/platform.h>
> > +#include <xen/interface/xen.h>
> > +#include <xen/xen.h>
> > +
> > +#include <asm/xen/hypercall.h>
> > +
> > +unsigned long xen_vmcoreinfo_maddr = 0;
> > +unsigned long xen_vmcoreinfo_max_size = 0;
> > +
> > +static int __init xen_init_kexec_resources(void)
> > +{
> > +	int rc;
> > +	static struct resource xen_hypervisor_res = {
> > +		.name = "Hypervisor code and data",
> > +		.flags = IORESOURCE_BUSY | IORESOURCE_MEM
> > +	};
> > +	struct resource *cpu_res;
> > +	struct xen_kexec_range xkr;
> > +	struct xen_platform_op cpuinfo_op;
> > +	uint32_t cpus, i;
> > +
> > +	if (!xen_initial_domain())
> > +		return 0;
> > +
> > +	if (strstr(boot_command_line, "crashkernel="))
> > +		pr_info("kexec: Ignoring crashkernel option. "
>
> pr_warn?

OK.

> > +			"It should be passed to Xen hypervisor.\n");
> > +
> > +	/* Register Crash kernel resource. */
> > +	xkr.range = KEXEC_RANGE_MA_CRASH;
> > +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
> > +
> > +	if (rc) {
> > +		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_CRASH)"
> > +			": %i\n", __func__, rc);
>
> Perhaps pr_warn?

Ditto.

> > +		return rc;
> > +	}
> > +
> > +	if (!xkr.size)
> > +		return 0;
> > +
> > +	crashk_res.start = xkr.start;
> > +	crashk_res.end = xkr.start + xkr.size - 1;
> > +	insert_resource(&iomem_resource, &crashk_res);
> > +
> > +	/* Register Hypervisor code and data resource. */
> > +	xkr.range = KEXEC_RANGE_MA_XEN;
> > +	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &xkr);
> > +
> > +	if (rc) {
> > +		pr_info("kexec: %s: HYPERVISOR_kexec_op(KEXEC_RANGE_MA_XEN)"
>
> pr_warn

Ditto.

> > +			": %i\n", __func__, rc);
> > +		return rc;
> > +	}
> > +
> > +	xen_hypervisor_res.start = xkr.start;
> > +	xen_hypervisor_res.end = xkr.start + xkr.size - 1;
> > +	insert_resource(&iomem_resource, &xen_hypervisor_res);
> > +
> > +	/* Determine maximum number of physical CPUs. */
> > +	cpuinfo_op.cmd = XENPF_get_cpuinfo;
> > +	cpuinfo_op.u.pcpu_info.xen_cpuid = 0;
> > +	rc = HYPERVISOR_dom0_op(&cpuinfo_op);
> > +
> > +	if (rc) {
> > +		pr_info("kexec: %s: HYPERVISOR_dom0_op(): %i\n", __func__, rc);
>
> pr_warn.

Ditto.

> > +		return rc;
> > +	}
> > +
> > +	cpus = cpuinfo_op.u.pcpu_info.max_present + 1;
>
> Do we care about the hotplug CPUs?

Good point. I have not considered it yet.

> > +
> > +	/* Register CPUs Crash note resources. */
> > +	cpu_res = kcalloc(cpus, sizeof(struct resource), GFP_KERNEL);
> > +
> > +	if (!cpu_res) {
> > +		pr_info("kexec: %s: kcalloc(): %i\n", __func__, -ENOMEM);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	for (i = 0; i < cpus; ++i) {
>
> Any specific reason for using '++i' instead of 'i++' ?

It does not matter here but I prefer '++i' form.
That is it.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 05/11] x86/xen: Register resources required by kexec-tools
  2012-10-01  9:40             ` Jan Beulich
@ 2012-10-01 13:28               ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 13:28 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Konrad Rzeszutek Wilk, andrew.cooper3, xen-devel, linux-kernel

On Mon, Oct 01, 2012 at 10:40:01AM +0100, Jan Beulich wrote:
> >>> On 28.09.12 at 18:21, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > On Thu, Sep 27, 2012 at 08:06:32PM +0200, Daniel Kiper wrote:
> >> +	for (i = 0; i < cpus; ++i) {
> >
> > Any specific reason for using '++i' instead of 'i++' ?
>
> For people occasionally also writing C++ code this is the
> canonical form.

Heh... I have not written any C++ code since the end
of my C++ course at my university (~18 years).
I am just prefer '++i' instead of 'i++'.
That is it.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump
  2012-09-28 16:10       ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Konrad Rzeszutek Wilk
@ 2012-10-01 13:34         ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 13:34 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Fri, Sep 28, 2012 at 12:10:17PM -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Sep 27, 2012 at 08:06:30PM +0200, Daniel Kiper wrote:
> > Introduce architecture independent constants and structures
>
> Don't you mean 'dependent constants'?

Right.

> > required by Xen kexec/kdump implementation.
> >
> > Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> > ---
> >  include/xen/interface/xen.h |   33 +++++++++++++++++++++++++++++++++
> >  1 files changed, 33 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> > index 0801468..ac19f9e 100644
> > --- a/include/xen/interface/xen.h
> > +++ b/include/xen/interface/xen.h
> > @@ -58,6 +58,7 @@
> >  #define __HYPERVISOR_event_channel_op     32
> >  #define __HYPERVISOR_physdev_op           33
> >  #define __HYPERVISOR_hvm_op               34
> > +#define __HYPERVISOR_kexec_op             37
> >  #define __HYPERVISOR_tmem_op              38
> >
> >  /* Architecture-specific hypercall definitions. */
> > @@ -232,7 +233,39 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
> >  #define VMASST_TYPE_pae_extended_cr3     3
> >  #define MAX_VMASST_TYPE 3
> >
> > +/*
> > + * Commands to HYPERVISOR_kexec_op().
> > + */
> > +#define KEXEC_CMD_kexec			0
> > +#define KEXEC_CMD_kexec_load		1
> > +#define KEXEC_CMD_kexec_unload		2
> > +#define KEXEC_CMD_kexec_get_range	3
> > +
> > +/*
> > + * Memory ranges for kdump (utilized by HYPERVISOR_kexec_op()).
> > + */
> > +#define KEXEC_RANGE_MA_CRASH		0
> > +#define KEXEC_RANGE_MA_XEN		1
> > +#define KEXEC_RANGE_MA_CPU		2
> > +#define KEXEC_RANGE_MA_XENHEAP		3
> > +#define KEXEC_RANGE_MA_BOOT_PARAM	4
> > +#define KEXEC_RANGE_MA_EFI_MEMMAP	5
> > +#define KEXEC_RANGE_MA_VMCOREINFO	6
> > +
> >  #ifndef __ASSEMBLY__
> > +struct xen_kexec_exec {
> > +	int type;
> > +};
> > +
> > +struct xen_kexec_range {
> > +	int range;
> > +	int nr;
> > +	unsigned long size;
> > +	unsigned long start;
> > +};
>
> Might want to include an little blurb saying what we expect
> in case of running a 32-bit domain on a 64-bit hypervisor
> where the start might be past 4GB?

This is not true. All needed pages used by i386 relocate kernel
code are always below 4 GiB.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 01/11] kexec: introduce kexec_ops struct
  2012-09-28 16:07   ` Konrad Rzeszutek Wilk
@ 2012-10-01 13:40     ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 13:40 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: andrew.cooper3, jbeulich, linux-kernel, xen-devel

On Fri, Sep 28, 2012 at 12:07:42PM -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Sep 27, 2012 at 08:06:28PM +0200, Daniel Kiper wrote:
> > Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
> > not use default functions or require some changes in behavior of kexec/kdump
> > generic code. To cope with that problem kexec_ops struct was introduced.
> > It allows a developer to replace all or some functions and control some
> > functionality of kexec/kdump generic code.
> >
> > Default behavior of kexec/kdump generic code is not changed.
> >
> > Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> > ---
> >  include/linux/kexec.h |   18 +++++++
> >  kernel/kexec.c        |  125 ++++++++++++++++++++++++++++++++++++-------------
> >  2 files changed, 111 insertions(+), 32 deletions(-)
> >
> > diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> > index 37c5f72..beb08ca 100644
> > --- a/include/linux/kexec.h
> > +++ b/include/linux/kexec.h
> > @@ -165,7 +165,25 @@ struct kimage {
> >  #endif
> >  };
> >
> > +struct kexec_ops {
> > +	bool always_use_normal_alloc;
>
> So most of these are self-explanatory. But the bool is not that clear
> to me. Could you include a documentation comment explaining
> its purpose and its implications?

OK.

> > +	struct page *(*kimage_alloc_pages)(gfp_t gfp_mask,
> > +						unsigned int order,
> > +						unsigned long limit);
> > +	void (*kimage_free_pages)(struct page *page);
> > +	unsigned long (*page_to_pfn)(struct page *page);
> > +	struct page *(*pfn_to_page)(unsigned long pfn);
> > +	unsigned long (*virt_to_phys)(volatile void *address);
> > +	void *(*phys_to_virt)(unsigned long address);
> > +	int (*machine_kexec_prepare)(struct kimage *image);
> > +	int (*machine_kexec_load)(struct kimage *image);
> > +	void (*machine_kexec_cleanup)(struct kimage *image);
> > +	void (*machine_kexec_unload)(struct kimage *image);
> > +	void (*machine_kexec_shutdown)(void);
> > +	void (*machine_kexec)(struct kimage *image);
> > +};
> >
> > +extern struct kexec_ops kexec_ops;
>
> Is this neccessary?

Yes, because it is used by Xen machine_kexec_??.c files.

> >  /* kexec interface functions */
> >  extern void machine_kexec(struct kimage *image);
> > diff --git a/kernel/kexec.c b/kernel/kexec.c
> > index 0668d58..98556f3 100644
> > --- a/kernel/kexec.c
> > +++ b/kernel/kexec.c
> > @@ -56,6 +56,47 @@ struct resource crashk_res = {
> >  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
> >  };
> >
> > +static struct page *kimage_alloc_pages(gfp_t gfp_mask,
> > +					unsigned int order,
> > +					unsigned long limit);
> > +static void kimage_free_pages(struct page *page);
> > +
> > +static unsigned long generic_page_to_pfn(struct page *page)
> > +{
> > +	return page_to_pfn(page);
> > +}
> > +
> > +static struct page *generic_pfn_to_page(unsigned long pfn)
> > +{
> > +	return pfn_to_page(pfn);
> > +}
> > +
> > +static unsigned long generic_virt_to_phys(volatile void *address)
> > +{
> > +	return virt_to_phys(address);
> > +}
> > +
> > +static void *generic_phys_to_virt(unsigned long address)
> > +{
> > +	return phys_to_virt(address);
> > +}
> > +
> > +struct kexec_ops kexec_ops = {
> > +	.always_use_normal_alloc = false,
> > +	.kimage_alloc_pages = kimage_alloc_pages,
> > +	.kimage_free_pages = kimage_free_pages,
> > +	.page_to_pfn = generic_page_to_pfn,
> > +	.pfn_to_page = generic_pfn_to_page,
> > +	.virt_to_phys = generic_virt_to_phys,
> > +	.phys_to_virt = generic_phys_to_virt,
> > +	.machine_kexec_prepare = machine_kexec_prepare,
> > +	.machine_kexec_load = NULL,
>
> Instead of NULL should they just point to some nop function?

OK.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-10-01 12:52               ` Daniel Kiper
@ 2012-10-01 13:55                 ` Jan Beulich
  2012-10-01 17:33                   ` Daniel Kiper
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Beulich @ 2012-10-01 13:55 UTC (permalink / raw)
  To: Daniel Kiper; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

>>> On 01.10.12 at 14:52, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> On Fri, Sep 28, 2012 at 09:11:47AM +0100, Jan Beulich wrote:
>> Finally, as noticed in an earlier patch already, you appear to
>> re-introduce stuff long dropped from the kernel - the forward
>> ported kernels get away with just setting PA_CONTROL_PAGE,
>> PA_PGD, and PA_SWAP_PAGE in the page list. Since the number
>> and purpose of the pages is established entirely by the guest
>> kernel, all you need to obey is that the hypervisor expects
>> alternating PA_/VA_ pairs (where the VA_ ones can be left
>> unpopulated). Perhaps taking a look at a recent SLES kernel
>> would help...
> 
> I have got 
> ftp://ftp.suse.com/pub/projects/kernel/kotd/SLE11-SP2/src/kernel-source-3.0.
> 43-6.1.src.rpm.
> Does kexec/kdump work in your environment? In my it does not.

While I never ran it myself, I know kdump has been working on
SLE for quite a long while (leaving aside hardware or firmware
quirks requiring extra workarounds).

> At least there is wrong assumption that
> vaddr = (unsigned long)relocate_kernel
> gets virtual address of relocate_kernel in Xen

Where did you spot that? Afaics the only thing done with
relocate_kernel is that it gets copied into the control page.

> (I have tested only x86_64 implementation but
> as I saw i386 has similar problem). In real it is
> fix mapped in hypervisor which is completely
> different than address calculated in dom0 kernel.
> Virtual address of control page (and others) is
> only known by hypervisor kexec/kdump functions.
> It means that transition page table could be
> established by relocate_kernel code only.
> If you would like to do optimistation as you
> mentioned above you must reintroduce code
> for page table establishment into generic
> relocate_kernel_??.S. However, another
> problem arises. New generic code utilizes
> additional arguments such as swap page
> (and potentially could use others in the future).
> As I saw it is not possible to pass extra addresses
> through page_list[] in struct xen_kexec_image
> because its has insufficient size (I mean
> x86_64 because i386 is a bit different story).

No - there's no meaning assigned by Xen to any of the slots,
except for said assumption about them representing alternating
PA_/VA_ pairs. Hence, as long as old entries get removed, new
slots can easily be added (and the number of slots currently
needed is far lower than what was used originally [2.6.18]).

Jan


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation
  2012-10-01 13:55                 ` Jan Beulich
@ 2012-10-01 17:33                   ` Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-01 17:33 UTC (permalink / raw)
  To: Jan Beulich; +Cc: andrew.cooper3, xen-devel, konrad.wilk, linux-kernel

On Mon, Oct 01, 2012 at 02:55:01PM +0100, Jan Beulich wrote:
> >>> On 01.10.12 at 14:52, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> > On Fri, Sep 28, 2012 at 09:11:47AM +0100, Jan Beulich wrote:
> >> Finally, as noticed in an earlier patch already, you appear to
> >> re-introduce stuff long dropped from the kernel - the forward
> >> ported kernels get away with just setting PA_CONTROL_PAGE,
> >> PA_PGD, and PA_SWAP_PAGE in the page list. Since the number
> >> and purpose of the pages is established entirely by the guest
> >> kernel, all you need to obey is that the hypervisor expects
> >> alternating PA_/VA_ pairs (where the VA_ ones can be left
> >> unpopulated). Perhaps taking a look at a recent SLES kernel
> >> would help...
> >
> > I have got ftp://ftp.suse.com/pub/projects/kernel/kotd/SLE11-SP2/src/kernel-source-3.0.43-6.1.src.rpm.
> > Does kexec/kdump work in your environment? In my it does not.
>
> While I never ran it myself, I know kdump has been working on
> SLE for quite a long while (leaving aside hardware or firmware
> quirks requiring extra workarounds).

It should work on baremetal without any issue. However,
I am almost sure that it does not work on Xen dom0
in any way. But maybe I missed something.

> > At least there is wrong assumption that
> > vaddr = (unsigned long)relocate_kernel
> > gets virtual address of relocate_kernel in Xen
>
> Where did you spot that? Afaics the only thing done with
> relocate_kernel is that it gets copied into the control page.

arch/x86/kernel/machine_kexec_64.c:270
There is also similar thing in i386 code.

> > (I have tested only x86_64 implementation but
> > as I saw i386 has similar problem). In real it is
> > fix mapped in hypervisor which is completely
> > different than address calculated in dom0 kernel.
> > Virtual address of control page (and others) is
> > only known by hypervisor kexec/kdump functions.
> > It means that transition page table could be
> > established by relocate_kernel code only.
> > If you would like to do optimistation as you
> > mentioned above you must reintroduce code
> > for page table establishment into generic
> > relocate_kernel_??.S. However, another
> > problem arises. New generic code utilizes
> > additional arguments such as swap page
> > (and potentially could use others in the future).
> > As I saw it is not possible to pass extra addresses
> > through page_list[] in struct xen_kexec_image
> > because its has insufficient size (I mean
> > x86_64 because i386 is a bit different story).
>
> No - there's no meaning assigned by Xen to any of the slots,
> except for said assumption about them representing alternating

I know that.

> PA_/VA_ pairs. Hence, as long as old entries get removed, new
> slots can easily be added (and the number of slots currently
> needed is far lower than what was used originally [2.6.18]).

As I said in other email I will try to optimize page table code.
We will see what could be done.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Xen-devel] [PATCH 01/11] kexec: introduce kexec_ops struct
  2012-10-01 11:36     ` Daniel Kiper
@ 2012-10-05 13:27       ` Ian Campbell
  0 siblings, 0 replies; 32+ messages in thread
From: Ian Campbell @ 2012-10-05 13:27 UTC (permalink / raw)
  To: Daniel Kiper
  Cc: Jan Beulich, Andrew Cooper, konrad.wilk, linux-kernel, xen-devel

On Mon, 2012-10-01 at 12:36 +0100, Daniel Kiper wrote:
> On Fri, Sep 28, 2012 at 08:49:16AM +0100, Jan Beulich wrote:
> > >>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> > > Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
> > > not use default functions or require some changes in behavior of kexec/kdump
> > > generic code. To cope with that problem kexec_ops struct was introduced.
> > > It allows a developer to replace all or some functions and control some
> > > functionality of kexec/kdump generic code.
> >
> > I'm not convinced that doing this at the architecture independent
> > layer is really necessary/desirable. Nevertheless, if that's the right
> > place, then everything else looks good to me, except for a
> > cosmetic thing:
> 
> I do not like this patch, too. However, this is the simplest
> solution. If you do not do that in that way then you must
> duplicate most of kernel/kexec.c functionality in architecture
> depndent files.

It would have been a good idea to CC the maintainer of those files
directly with at least this patch if not the whole series.

If they don't like this approach then there not much point in doing a
thorough reviewing of the other 10 patches I don't think, since I would
expect they will be required to change pretty substantially under those
circumstances.

Ian.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Xen-devel] [PATCH 01/11] kexec: introduce kexec_ops struct
@ 2012-10-08 11:54 Daniel Kiper
  0 siblings, 0 replies; 32+ messages in thread
From: Daniel Kiper @ 2012-10-08 11:54 UTC (permalink / raw)
  To: Ian.Campbell
  Cc: konrad.wilk, Andrew.Cooper3, xen-devel, linux-kernel, JBeulich

Hi,

> On Mon, 2012-10-01 at 12:36 +0100, Daniel Kiper wrote:
> > On Fri, Sep 28, 2012 at 08:49:16AM +0100, Jan Beulich wrote:
> > > >>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@oracle.com> wrote:
> > > > Some kexec/kdump implementations (e.g. Xen PVOPS) on different archs could
> > > > not use default functions or require some changes in behavior of kexec/kdump
> > > > generic code. To cope with that problem kexec_ops struct was introduced.
> > > > It allows a developer to replace all or some functions and control some
> > > > functionality of kexec/kdump generic code.
> > >
> > > I'm not convinced that doing this at the architecture independent
> > > layer is really necessary/desirable. Nevertheless, if that's the right
> > > place, then everything else looks good to me, except for a
> > > cosmetic thing:
> >
> > I do not like this patch, too. However, this is the simplest
> > solution. If you do not do that in that way then you must
> > duplicate most of kernel/kexec.c functionality in architecture
> > depndent files.
>
> It would have been a good idea to CC the maintainer of those files
> directly with at least this patch if not the whole series.

Thanks. I spotted later that maintainers are not in CC.
I am going to prepare next version of patches with
minor suggested fixes and repost them once again
for review next week.

> If they don't like this approach then there not much point in doing a
> thorough reviewing of the other 10 patches I don't think, since I would
> expect they will be required to change pretty substantially under those
> circumstances.

Sure.

Daniel

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2012-10-08 11:54 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-27 18:06 [PATCH 00/11] xen: Initial kexec/kdump implementation Daniel Kiper
2012-09-27 18:06 ` [PATCH 01/11] kexec: introduce kexec_ops struct Daniel Kiper
2012-09-27 18:06   ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Daniel Kiper
2012-09-27 18:06     ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Daniel Kiper
2012-09-27 18:06       ` [PATCH 04/11] x86/xen: Introduce architecture dependent " Daniel Kiper
2012-09-27 18:06         ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Daniel Kiper
2012-09-27 18:06           ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Daniel Kiper
2012-09-27 18:06             ` [PATCH 07/11] x86/xen: Add x86_64 " Daniel Kiper
2012-09-27 18:06               ` [PATCH 08/11] x86/xen: Add kexec/kdump makefile rules Daniel Kiper
2012-09-27 18:06                 ` [PATCH 09/11] x86/xen/enlighten: Add init and crash kexec/kdump hooks Daniel Kiper
2012-09-27 18:06                   ` [PATCH 10/11] drivers/xen: Export vmcoreinfo through sysfs Daniel Kiper
2012-09-27 18:06                     ` [PATCH 11/11] x86: Add Xen kexec control code size check to linker script Daniel Kiper
2012-09-28  8:11             ` [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation Jan Beulich
2012-10-01 12:52               ` Daniel Kiper
2012-10-01 13:55                 ` Jan Beulich
2012-10-01 17:33                   ` Daniel Kiper
2012-09-28 16:39             ` Konrad Rzeszutek Wilk
2012-10-01 13:16               ` Daniel Kiper
2012-09-28 16:21           ` [PATCH 05/11] x86/xen: Register resources required by kexec-tools Konrad Rzeszutek Wilk
2012-10-01  9:40             ` Jan Beulich
2012-10-01 13:28               ` Daniel Kiper
2012-10-01 13:21             ` Daniel Kiper
2012-09-28 16:10       ` [PATCH 03/11] xen: Introduce architecture independent data for kexec/kdump Konrad Rzeszutek Wilk
2012-10-01 13:34         ` Daniel Kiper
2012-09-28  7:56     ` [PATCH 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE Jan Beulich
2012-10-01 13:01       ` Daniel Kiper
2012-09-28  7:49   ` [PATCH 01/11] kexec: introduce kexec_ops struct Jan Beulich
2012-10-01 11:36     ` Daniel Kiper
2012-10-05 13:27       ` [Xen-devel] " Ian Campbell
2012-09-28 16:07   ` Konrad Rzeszutek Wilk
2012-10-01 13:40     ` Daniel Kiper
2012-10-08 11:54 [Xen-devel] " Daniel Kiper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).