* [PATCH/RFC 2/2] s390 virtualization interface.
@ 2007-04-27 13:40 Carsten Otte
From: Carsten Otte @ 2007-04-27 13:40 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
  Cc: cborntra-tA70FqPdS9bQT0dZR+AlfA, schwidefsky-tA70FqPdS9bQT0dZR+AlfA

Add interface which allows a process to start a virtual machine.

To keep things simple, each thread group is allowed to have only one
virtual machine, and each thread of the thread group can control only
one virtual cpu of that machine. All information about the virtual
machines/cpus can be found via the thread_info structures of the
participating threads.

This patch adds three new s390 specific system calls:

long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
			  struct sie_block __user *sie_template)

Adds a new cpu to the virtual machine that belongs to the current
thread group. If no virtual machine exists yet, it will be created. In
addition, two pages will be allocated and mapped at <addr> into the
address space of the process. These two pages are used so user space
and kernel space can easily exchange/modify the state of the
corresponding virtual cpu without a ton of copy_from/to_user calls.
sie_template is a pointer to a data structure that contains initial
information about how the virtual cpu should be set up. The resulting
block will be used as the parameter of the sie (start interpretive
execution) instruction, which runs the virtual cpu.

int sys_s390host_remove_cpu(void)

Removes a virtual cpu from a virtual machine.

int sys_s390host_sie(unsigned long action)

Starts / re-enters the virtual cpu of the virtual machine that the
thread belongs to, if any.
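
For illustration, a minimal user space sequence could look roughly like
the sketch below. This is not part of the patch; it assumes the syscall
numbers from the unistd.h hunk further down, copies of the structures
from <asm/sie64.h>, and leaves out all error handling:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/sie64.h>			/* struct sie_block, struct sie_io */

#define __NR_s390host_add_cpu		314
#define __NR_s390host_remove_cpu	315
#define __NR_s390host_sie		316

static void run_vcpu(void)
{
	struct sie_block template;
	struct sie_io *io;		/* the mapped r/o + r/w page pair */
	long addr;

	memset(&template, 0, sizeof(template));
	template.icpua = 0;		/* number of this virtual cpu */

	/* create the cpu and map its two state pages */
	addr = syscall(__NR_s390host_add_cpu, 0UL, 0UL, &template);
	io = (struct sie_io *)addr;

	for (;;) {
		/* (re-)enter the virtual cpu, no action flags set */
		if (syscall(__NR_s390host_sie, 0UL) < 0)
			break;
		/* inspect io->sie_kernel.sie_block.icptcode and handle
		 * the intercept before re-entering */
	}
	syscall(__NR_s390host_remove_cpu);
}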

Please note that this patch is nothing more than a proof of concept
and may contain quite a few bugs.
Since we want to convert this to the kvm interface instead, most of it
will be dropped anyway. But maybe it is of interest to others as well.
---
 arch/s390/Kconfig              |    7 
 arch/s390/Makefile             |    2 
 arch/s390/host/Makefile        |    5 
 arch/s390/host/s390host.c      |  411 +++++++++++++++++++++++++++++++++++++++++
 arch/s390/host/sie64a.S        |   37 +++
 arch/s390/kernel/asm-offsets.c |    2 
 arch/s390/kernel/process.c     |   15 +
 arch/s390/kernel/setup.c       |    4 
 arch/s390/kernel/syscalls.S    |    3 
 include/asm-s390/sie64.h       |  276 +++++++++++++++++++++++++++
 include/asm-s390/thread_info.h |    8 
 include/asm-s390/unistd.h      |    5 
 kernel/sys_ni.c                |    3 
 13 files changed, 773 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/s390/kernel/asm-offsets.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/asm-offsets.c
+++ linux-2.6/arch/s390/kernel/asm-offsets.c
@@ -44,5 +44,7 @@ int main(void)
 	DEFINE(__SF_BACKCHAIN, offsetof(struct stack_frame, back_chain),);
 	DEFINE(__SF_GPRS, offsetof(struct stack_frame, gprs),);
 	DEFINE(__SF_EMPTY, offsetof(struct stack_frame, empty1),);
+	BLANK();
+	DEFINE(__SIE_USER_gprs, offsetof(struct sie_user, gprs),);
 	return 0;
 }
Index: linux-2.6/arch/s390/kernel/syscalls.S
===================================================================
--- linux-2.6.orig/arch/s390/kernel/syscalls.S
+++ linux-2.6/arch/s390/kernel/syscalls.S
@@ -322,3 +322,6 @@ NI_SYSCALL							/* 310 sys_move_pages *
 SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper)
 SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper)
 SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper)
+SYSCALL(sys_ni_syscall,sys_s390host_add_cpu,sys_ni_syscall)
+SYSCALL(sys_ni_syscall,sys_s390host_remove_cpu,sys_ni_syscall)
+SYSCALL(sys_ni_syscall,sys_s390host_sie,sys_ni_syscall)
Index: linux-2.6/arch/s390/host/Makefile
===================================================================
--- /dev/null
+++ linux-2.6/arch/s390/host/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the s390host components.
+#
+
+obj-$(CONFIG_S390_HOST)	+= s390host.o sie64a.o
Index: linux-2.6/arch/s390/host/sie64a.S
===================================================================
--- /dev/null
+++ linux-2.6/arch/s390/host/sie64a.S
@@ -0,0 +1,37 @@
+/*
+ *  arch/s390/host/sie64a.S
+ *    low level sie call
+ *
+ *    Copyright IBM Corp. 2007
+ *    Author(s): Heiko Carstens <heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
+ */
+
+#include <linux/errno.h>
+#include <asm/asm-offsets.h>
+
+SP_R6 =	6 * 8	# offset into stackframe
+
+	.globl	sie64a
+sie64a:
+	stmg	%r6,%r15,SP_R6(%r15)		# save register on entry
+	lgr	%r14,%r2			# pointer to program parms
+	aghi	%r2,4096
+	lmg	%r0,%r13,__SIE_USER_gprs(%r2)	# load guest gprs 0-13
+sie_inst:
+	sie	0(%r14)
+	aghi	%r14,4096
+	stmg	%r0,%r13,__SIE_USER_gprs(%r14)	# save guest gprs 0-13
+	lghi	%r2,0
+	lmg	%r6,%r15,SP_R6(%r15)
+	br	%r14
+
+sie_err:
+	aghi	%r14,4096
+	stmg	%r0,%r13,__SIE_USER_gprs(%r14)	# save guest gprs 0-13
+	lghi	%r2,-EFAULT
+	lmg	%r6,%r15,SP_R6(%r15)
+	br	%r14
+
+	.section __ex_table,"a"
+	.quad	sie_inst,sie_err
+	.previous
Index: linux-2.6/arch/s390/host/s390host.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/s390/host/s390host.c
@@ -0,0 +1,411 @@
+/*
+ * s390host.c --  hosting zSeries Linux virtual engines
+ *
+ * Copyright IBM Corp. 2007
+ *   Author(s): Carsten Otte <cotte-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ *		Christian Borntraeger <cborntra-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ *		Heiko Carstens <heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
+ */
+
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/semaphore.h>
+#include <asm/sie64.h>
+
+static int s390host_do_action(unsigned long, struct sie_io *);
+
+static DEFINE_MUTEX(s390host_init_mutex);
+
+static void s390host_get_data(struct s390host_data *data)
+{
+	atomic_inc(&data->count);
+}
+
+void s390host_put_data(struct s390host_data *data)
+{
+	int cpu;
+
+	if (atomic_dec_return(&data->count))
+		return;
+
+	for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
+		if (data->sie_io[cpu])
+			free_page((unsigned long)data->sie_io[cpu]);
+
+	if (data->sca_block)
+		free_page((unsigned long)data->sca_block);
+
+	kfree(data);
+}
+
+static void s390host_vma_close(struct vm_area_struct *vma)
+{
+	s390host_put_data(vma->vm_private_data);
+}
+
+static struct page *s390host_vma_nopage(struct vm_area_struct *vma,
+					unsigned long address, int *type)
+{
+	return NOPAGE_SIGBUS;
+}
+
+static struct vm_operations_struct s390host_vmops = {
+	.close = s390host_vma_close,
+	.nopage = s390host_vma_nopage,
+};
+
+static struct s390host_data *get_s390host_context(void)
+{
+	struct thread_info *tif;
+	struct sca_block *sca_block = NULL;
+	struct s390host_data *data = NULL;
+	struct task_struct *tsk;
+
+	/* zlh context for current thread already created? */
+	tif = current_thread_info();
+	if (tif->s390host_data)
+		return tif->s390host_data;
+
+	/* zlh context in thread group available? */
+	write_lock_irq(&tasklist_lock);
+	tsk = next_thread(current);
+	for (; tsk != current; tsk = next_thread(tsk)) {
+		data = tsk->thread_info->s390host_data;
+		if (data) {
+			s390host_get_data(data);
+			tif->s390host_data = data;
+			break;
+		}
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	if (data)
+		return data;
+
+	/* create new context */
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+	if (!data)
+		return NULL;
+
+	sca_block = (struct sca_block *)get_zeroed_page(GFP_KERNEL);
+
+	if (!sca_block) {
+		kfree(data);
+		return NULL;
+	}
+
+	data->sca_block = sca_block;
+	tif->s390host_data = data;
+	s390host_get_data(data);
+
+	return data;
+}
+
+static unsigned long
+s390host_create_io_area(unsigned long addr, unsigned long flags,
+			unsigned long io_addr, struct s390host_data *data)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long ret;
+
+	flags &= MAP_FIXED;
+	addr = get_unmapped_area(NULL, addr, 2 * PAGE_SIZE, 0, flags);
+
+	if (addr & ~PAGE_MASK)
+		return addr;
+
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+
+	if (!vma)
+		return -ENOMEM;
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + 2 * PAGE_SIZE;
+	vma->vm_flags =	VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_SHARED | VM_MAYSHARE | VM_DONTCOPY;
+
+#if 1	/* FIXME: write access until sys_s390host_sie interface is final */
+	vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
+#endif
+
+	vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
+	vma->vm_private_data = data;
+	vma->vm_ops = &s390host_vmops;
+
+	down_write(&mm->mmap_sem);
+	ret = insert_vm_struct(mm, vma);
+	if (ret) {
+		kmem_cache_free(vm_area_cachep, vma);
+		goto out;
+	}
+	s390host_get_data(data);
+	mm->total_vm += 2;
+	vm_insert_page(vma, addr, virt_to_page(io_addr));
+
+	ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
+	if (ret)
+		goto out;
+	s390host_get_data(data);
+
+	vma = find_vma(mm, addr + PAGE_SIZE);
+	vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
+	vm_insert_page(vma, addr + PAGE_SIZE,
+		       virt_to_page(io_addr + PAGE_SIZE));
+	ret = addr;
+out:
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
+			  struct sie_block __user *sie_template)
+{
+	struct sie_block *sie_block;
+	struct sie_io *sie_io;
+	struct sca_block *sca_block;
+	struct s390host_data *data = NULL;
+	unsigned long ret;
+	__u16 cpu;
+
+	if (current_thread_info()->sie_cpu != -1)
+		return -EINVAL;
+
+	if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
+		return -EFAULT;
+
+	if (cpu >= S390HOST_MAX_CPUS)
+		return -EINVAL;
+
+	mutex_lock(&s390host_init_mutex);
+
+	data = get_s390host_context();
+	if (!data) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	sca_block = data->sca_block;
+	if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {
+		ret = -EINVAL;
+		goto out_err;
+	}
+
+	if (!data->sie_io[cpu]) {
+		unsigned long tmp;
+
+		/* allocate two pages: 1st is r/o 2nd r/w area */
+		tmp = __get_free_pages(GFP_KERNEL, 1);
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto out_err;
+		}
+		split_page(virt_to_page(tmp), 1);
+		data->sie_io[cpu] = (struct sie_io *)tmp;
+	}
+
+	sie_io = data->sie_io[cpu];
+	memset(sie_io, 0, 2 * PAGE_SIZE);
+
+	sie_block = &sie_io->sie_kernel.sie_block;
+	sca_block->cpu[cpu].sda = (__u64)sie_block;
+
+	if (copy_from_user(sie_block, sie_template, sizeof(struct sie_block))) {
+		ret = -EFAULT;
+		goto out_err;
+	}
+	sie_block->icpua = cpu;
+
+	ret = s390host_create_io_area(addr, flags, (unsigned long)sie_io, data);
+
+	if (ret & ~PAGE_MASK)
+		goto out_err;
+
+	sca_block->mcn |= 1UL << (S390HOST_MAX_CPUS - 1 - cpu);
+	sie_block->scaoh = (__u32)(((__u64)sca_block) >> 32);
+	sie_block->scaol = (__u32)(__u64)sca_block;
+	current_thread_info()->sie_cpu = cpu;
+	goto out;
+out_err:
+	if (data)
+		s390host_put_data(data);
+out:
+	mutex_unlock(&s390host_init_mutex);
+	return ret;
+}
+
+int sys_s390host_remove_cpu(void)
+{
+	struct sca_block *sca_block;
+	int cpu;
+
+	cpu = current_thread_info()->sie_cpu;
+	if (cpu == -1)
+		return -EINVAL;
+
+	mutex_lock(&s390host_init_mutex);
+	sca_block = current_thread_info()->s390host_data->sca_block;
+	sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
+	current_thread_info()->sie_cpu = -1;
+	mutex_unlock(&s390host_init_mutex);
+	return 0;
+}
+
+int sys_s390host_sie(unsigned long action)
+{
+	struct sie_kernel *sie_kernel;
+	struct sie_user *sie_user;
+	struct sie_io *sie_io;
+	int cpu;
+	int ret = 0;
+
+	cpu = current_thread_info()->sie_cpu;
+	if (cpu == -1)
+		return -EINVAL;
+
+	sie_io = current_thread_info()->s390host_data->sie_io[cpu];
+
+	if (action)
+		ret = s390host_do_action(action, sie_io);
+	if (ret)
+		goto out_err;
+	sie_kernel = &sie_io->sie_kernel;
+	sie_user = &sie_io->sie_user;
+
+	save_fp_regs(&sie_kernel->host_fpregs);
+	save_access_regs(sie_kernel->host_acrs);
+	sie_user->guest_fpregs.fpc &= FPC_VALID_MASK;
+	restore_fp_regs(&sie_user->guest_fpregs);
+	restore_access_regs(sie_user->guest_acrs);
+	memcpy(&sie_kernel->sie_block.gg14, &sie_user->gprs[14], 16);
+again:
+	if (need_resched())
+		schedule();
+
+	sie_kernel->sie_block.icptcode = 0;
+	ret = sie64a(sie_kernel);
+	if (ret)
+		goto out;
+
+	if (signal_pending(current)) {
+		ret = -EINTR;
+		goto out;
+	}
+
+	switch (sie_kernel->sie_block.icptcode) {
+	case 0x00:
+	case 0x24:
+		goto again;
+	}
+out:
+	memcpy(&sie_user->gprs[14], &sie_kernel->sie_block.gg14, 16);
+	save_fp_regs(&sie_user->guest_fpregs);
+	save_access_regs(sie_user->guest_acrs);
+	restore_fp_regs(&sie_kernel->host_fpregs);
+	restore_access_regs(sie_kernel->host_acrs);
+out_err:
+	return ret;
+}
+
+static void s390host_vsmxm_local_update(struct sie_io *sie_io)
+{
+	struct sie_kernel *local_sie_kernel;
+	struct sie_user *sie_user;
+	atomic_t *cpuflags;
+	int old, new;
+
+	mutex_lock(&s390host_init_mutex);
+
+	sie_user = &sie_io->sie_user;
+	local_sie_kernel = &sie_io->sie_kernel;
+
+	cpuflags = &local_sie_kernel->sie_block.cpuflags;
+	do {
+		old = atomic_read(cpuflags);
+		new = old | sie_user->vsmxm_or_local;
+		new &= sie_user->vsmxm_and_local;
+	} while (atomic_cmpxchg(cpuflags, old, new) != old);
+
+	mutex_unlock(&s390host_init_mutex);
+	return;
+}
+
+static int s390host_vsmxm_dist_update(struct sie_io *sie_io)
+{
+	struct sie_kernel *dist_sie_kernel;
+	struct sie_user *sie_user;
+	struct sca_block *sca_block;
+	struct thread_info *tif;
+	atomic_t *cpuflags;
+	int cpu;
+	int old, new;
+	int rc = -EINVAL;
+
+	mutex_lock(&s390host_init_mutex);
+
+	sie_user = &sie_io->sie_user;
+	cpu = sie_user->vsmxm_cpuid;
+
+	if (cpu >= S390HOST_MAX_CPUS)
+		goto out;
+
+	tif = current_thread_info();
+	sca_block = tif->s390host_data->sca_block;
+	if (!(sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))))
+		goto out;
+
+	dist_sie_kernel = &((tif->s390host_data->sie_io[cpu])->sie_kernel);
+
+	cpuflags = &dist_sie_kernel->sie_block.cpuflags;
+	do {
+		old = atomic_read(cpuflags);
+		new = old | sie_user->vsmxm_or;
+		new &= sie_user->vsmxm_and;
+	} while (atomic_cmpxchg(cpuflags, old, new) != old);
+
+	rc = 0;
+out:
+	mutex_unlock(&s390host_init_mutex);
+	return rc;
+}
+
+static int s390host_do_action(unsigned long action, struct sie_io *sie_io)
+{
+	void *src;
+	void *dest;
+	int rc = 0;
+
+	if (action & SIE_BLOCK_UPDATE) {
+		src  = &(sie_io->sie_user.sie_block);
+		dest = &(sie_io->sie_kernel.sie_block);
+
+		memcpy(dest + 4, src +  4, 88);
+		memcpy(dest + 96, src + 96, 4);
+		memcpy(dest + 104, src + 104, 408);
+	}
+
+	if (action & SIE_UPDATE_PSW)
+		sie_io->sie_kernel.sie_block.psw.gpsw = sie_io->sie_user.psw;
+
+	if (action & SIE_FLUSH_TLB)
+		flush_tlb_mm(current->mm);
+
+	if (action & SIE_VSMXM_LOCAL_UPDATE)
+		s390host_vsmxm_local_update(sie_io);
+
+	if (action & SIE_VSMXM_DIST_UPDATE)
+		rc = s390host_vsmxm_dist_update(sie_io);
+	return rc;
+}
Index: linux-2.6/include/asm-s390/unistd.h
===================================================================
--- linux-2.6.orig/include/asm-s390/unistd.h
+++ linux-2.6/include/asm-s390/unistd.h
@@ -251,8 +251,11 @@
 #define __NR_getcpu		311
 #define __NR_epoll_pwait	312
 #define __NR_utimes		313
+#define __NR_s390host_add_cpu	314
+#define __NR_s390host_remove_cpu	315
+#define __NR_s390host_sie		316
 
-#define NR_syscalls 314
+#define NR_syscalls 317
 
 /* 
  * There are some system calls that are not present on 64 bit, some
Index: linux-2.6/include/asm-s390/sie64.h
===================================================================
--- /dev/null
+++ linux-2.6/include/asm-s390/sie64.h
@@ -0,0 +1,276 @@
+/*
+ *  include/asm-s390/sie64.h
+ *
+ *    Copyright IBM Corp. 2007
+ *    Author(s): Carsten Otte <cotte-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ *		 Christian Borntraeger <cborntra-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ *		 Heiko Carstens <heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
+ *
+ */
+
+#ifndef _ASM_S390_SIE64_H
+#define _ASM_S390_SIE64_H
+
+struct sie_block {
+	atomic_t cpuflags;	/* 0x0000 */
+	__u32	prefix;		/* 0x0004 */
+	__u32	:32;		/* 0x0008 */
+	__u32	:32;		/* 0x000c */
+	__u64	:64;		/* 0x0010 */
+	__u64	:64;		/* 0x0018 */
+	__u64	:64;		/* 0x0020 */
+	__u64	cputm;		/* 0x0028 */
+	__u64	ckc;		/* 0x0030 */
+	__u64	epoch;		/* 0x0038 */
+	__u8	svcnn	:1,	/* 0x0040 */
+		svc1c	:1,
+		svc2c	:1,
+		svc3c	:1,
+		:4;
+	__u8	svc1n;		/* 0x0041 */
+	__u8	svc2n;		/* 0x0042 */
+	__u8	svc3n;		/* 0x0043 */
+	__u16	lctl0	:1,	/* 0x0044 */
+		lctl1	:1,
+		lctl2	:1,
+		lctl3	:1,
+		lctl4	:1,
+		lctl5	:1,
+		lctl6	:1,
+		lctl7	:1,
+		lctl8	:1,
+		lctl9	:1,
+		lctla	:1,
+		lctlb	:1,
+		lctlc	:1,
+		lctld	:1,
+		lctle	:1,
+		lctlf	:1;
+	__s16	icpua;		/* 0x0046 */
+	__u32	icpop	:1,	/* 0x0048 */
+		icpro	:1,
+		icprg	:1,
+		:4,
+		icipte	:1,
+		:1,		/* 0x0049 */
+		iclpsw	:1,
+		icptlb	:1,
+		icssm	:1,
+		icbsa	:1,
+		icstctl	:1,
+		icstnsm	:1,
+		icstosm	:1,
+		icstck	:1,	/* 0x004a */
+		iciske	:1,
+		icsske	:1,
+		icrrbe	:1,
+		icpc	:1,
+		icpt	:1,
+		ictprot	:1,
+		iclasp	:1,
+		:1,		/* 0x004b */
+		icstpt	:1,
+		icsckc	:1,
+		:1,
+		icpr	:1,
+		icbakr	:1,
+		icpg	:1,
+		:1;
+	__u32	ecext	:1,	/* 0x004c */
+		ecint	:1,
+		ecwait	:1,
+		ecsigp	:1,
+		ecalt	:1,
+		ecio2	:1,
+		:1,
+		ecmvp	:1;
+	__u8	eca1;		/* 0x004d */
+	__u8	eca2;		/* 0x004e */
+	__u8	eca3;		/* 0x004f */
+	__u8	icptcode;	/* 0x0050 */
+	__u8	:6,		/* 0x0051 */
+		icif	:1,
+		icex	:1;
+	__u16	ihcpu;		/* 0x0052 */
+	__u16	:16;		/* 0x0054 */
+	struct {
+		union {
+			__u16	ipa;	/* 0x0056 */
+			__u16	inst;	/* 0x0056 */
+			struct {
+				union  {
+					__u8	ipa0;	/* 0x0056 */
+					__u8	viwho;	/* 0x0056 */
+				};
+				union  {
+					__u8	ipa1;	/* 0x0057 */
+					__u8	viwhen;	/* 0x0057 */
+				};
+			};
+		};
+		union {
+			__u32	ipb;	/* 0x0058 */
+			struct {
+				union  {
+					__u16	ipbh0;	/* 0x0058 */
+					__u16	viwhy;	/* 0x0058 */
+					struct {
+						__u8	ipb0;	/* 0x0058 */
+						__u8	ipb1;	/* 0x0059 */
+					};
+				};
+				union  {
+					__u16	ipbh1;	/* 0x005a */
+					struct {
+						__u8	ipb2;	/* 0x005a */
+						__u8	ipb3;	/* 0x005b */
+					};
+				};
+			};
+		};
+	} __attribute__((packed));
+	__u32	scaoh;	/* 0x005c */
+	union {
+		__u32	rcp;	/* 0x0060 */
+		struct {
+			__u8	ska	:1,	/* 0x0060 */
+				skaip	:1,
+				:6;
+			__u8	ecb	:8;	/* 0x0061 */
+			__u8	:3,		/* 0x0062 */
+				cpby	:1,
+				:4;
+			__u8	:8;		/* 0x0063 */
+		};
+	};
+	__u32	scaol;	/* 0x0064 */
+	__u32	:32;	/* 0x0068 */
+	union {
+		__u32	todpr;	/* 0x006c */
+		struct {
+			__u16	:16;	/* 0x006c */
+			__u16	todpf;	/* 0x006e */
+		};
+	} __attribute__((packed));
+	__u32	gisa;	/* 0x0070 */
+	__u32	iopct;	/* 0x0074 */
+	__u32	rsvd3;	/* 0x0078 */
+	__u32	:32;	/* 0x007c */
+	__u64	gmsor;	/* 0x0080 */
+	__u64	gmslm;	/* 0x0088 */
+	union {
+		psw_t	gpsw;	/* 0x0090 */
+		struct {
+			__u64	pswh;	/* 0x0090 */
+
+			__u64	pswl;	/* 0x0098 */
+		};
+	} psw;
+	__u64	gg14;	/* 0x00a0 */
+	__u64	gg15;	/* 0x00a8 */
+	__u64	:64;	/* 0x00b0 */
+	__u64	:16,	/* 0x00b8 */
+		xso	:24,
+		xsl	:24;
+	union {
+		__u8	uzp0[56];	/* 0x00c0 */
+		struct {
+			__u32	exmsf;	/* 0x00c0 */
+			union  {
+				__u32	iexcf;	/* 0x00c4 */
+				struct {
+					__u16	iexca;	/* 0x00c4 */
+					__u16	iexcd;	/* 0x00c6 */
+				};
+			};
+			__u16	svcil;	/* 0x00c8 */
+			__u16	svcnt;	/* 0x00ca */
+			__u16	iprcl;	/* 0x00cc */
+			__u16	iprcc;	/* 0x00ce */
+			__u32	itrad;	/* 0x00d0 */
+			__u32	imncl;	/* 0x00d4 */
+			__u64	gpera;	/* 0x00d8 */
+			__u8	excpar; /* 0x00e0 */
+			__u8	perar;	/* 0x00e1 */
+			__u8	oprid;	/* 0x00e2 */
+			__u8	:8;	/* 0x00e3 */
+			__u32	:32;	/* 0x00e4 */
+			__u64	gtrad;	/* 0x00e8 */
+			__u32	:32;	/* 0x00f0 */
+			__u32	:32;	/* 0x00f4 */
+		};
+	};
+	__u16	:16;		/* 0x00f8 */
+	__u16	ief;		/* 0x00fa */
+	__u32	apcbk;		/* 0x00fc */
+	__u64	gcr[16];	/* 0x0100 */
+	__u8	reserved[128];	/* 0x0180 */
+} __attribute__((packed));
+
+struct sie_kernel {
+	struct sie_block	sie_block;
+	s390_fp_regs	host_fpregs;
+	int		host_acrs[NUM_ACRS];
+} __attribute__((packed,aligned(4096)));
+
+#define SIE_UPDATE_PSW		(1UL << 0)
+#define SIE_FLUSH_TLB		(1UL << 1)
+#define SIE_ISKE		(1UL << 2)
+#define SIE_SSKE		(1UL << 3)
+#define SIE_BLOCK_UPDATE	(1UL << 4)
+#define SIE_VSMXM_LOCAL_UPDATE	(1UL << 5)
+#define SIE_VSMXM_DIST_UPDATE   (1UL << 6)
+
+struct sie_skey_parm {
+	unsigned long sk_reg;
+	unsigned long sk_addr;
+};
+
+struct sie_user {
+	struct sie_block	sie_block;
+	psw_t			psw;
+	unsigned long		gprs[NUM_GPRS];
+	s390_fp_regs		guest_fpregs;
+	int			guest_acrs[NUM_ACRS];
+	struct sie_skey_parm	iske_parm;
+	struct sie_skey_parm	sske_parm;
+	int			vsmxm_or_local;
+	int			vsmxm_and_local;
+	int			vsmxm_or;
+	int			vsmxm_and;
+	int			vsmxm_cpuid;
+} __attribute__((packed,aligned(4096)));
+
+struct sie_io {
+	struct sie_kernel sie_kernel;
+	struct sie_user sie_user;
+};
+
+struct sca_entry {
+	atomic_t scn;
+	__u64	reserved;
+	__u64	sda;
+	__u64	reserved2[2];
+}__attribute__((packed));
+
+struct sca_block {
+	__u64	ipte_control;
+	__u64	reserved[5];
+	__u64	mcn;
+	__u64	reserved2;
+	struct	sca_entry cpu[64];
+}__attribute__((packed));
+
+#define S390HOST_MAX_CPUS 64
+
+struct s390host_data {
+	atomic_t	 count;
+	struct sie_io	 *sie_io[S390HOST_MAX_CPUS];
+	struct sca_block *sca_block;
+};
+
+/* function definitions */
+extern int sie64a(struct sie_kernel *);
+extern void s390host_put_data(struct s390host_data *);
+
+#endif /* _ASM_S390_SIE64_H */
Index: linux-2.6/arch/s390/Makefile
===================================================================
--- linux-2.6.orig/arch/s390/Makefile
+++ linux-2.6/arch/s390/Makefile
@@ -85,7 +85,7 @@ LDFLAGS_vmlinux := -e start
 head-y		:= arch/s390/kernel/head.o arch/s390/kernel/init_task.o
 
 core-y		+= arch/s390/mm/ arch/s390/kernel/ arch/s390/crypto/ \
-		   arch/s390/appldata/ arch/s390/hypfs/
+		   arch/s390/appldata/ arch/s390/hypfs/ arch/s390/host/
 libs-y		+= arch/s390/lib/
 drivers-y	+= drivers/s390/
 drivers-$(CONFIG_MATHEMU) += arch/s390/math-emu/
Index: linux-2.6/kernel/sys_ni.c
===================================================================
--- linux-2.6.orig/kernel/sys_ni.c
+++ linux-2.6/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
+cond_syscall(sys_s390host_add_cpu);
+cond_syscall(sys_s390host_remove_cpu);
+cond_syscall(sys_s390host_sie);
 
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
Index: linux-2.6/arch/s390/kernel/process.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/process.c
+++ linux-2.6/arch/s390/kernel/process.c
@@ -274,12 +274,23 @@ int copy_thread(int nr, unsigned long cl
 #endif /* CONFIG_64BIT */
 	/* start new process with ar4 pointing to the correct address space */
 	p->thread.mm_segment = get_fs();
-        /* Don't copy debug registers */
-        memset(&p->thread.per_info,0,sizeof(p->thread.per_info));
+	/* Don't copy debug registers */
+	memset(&p->thread.per_info,0,sizeof(p->thread.per_info));
+	p->thread_info->s390host_data = NULL;
+	p->thread_info->sie_cpu = -1;
 
         return 0;
 }
 
+void free_thread_info(struct thread_info *ti)
+{
+#ifdef CONFIG_S390_HOST
+	if (ti->s390host_data)
+		s390host_put_data(ti->s390host_data);
+#endif
+	free_pages((unsigned long) (ti),THREAD_ORDER);
+}
+
 asmlinkage long sys_fork(struct pt_regs regs)
 {
 	return do_fork(SIGCHLD, regs.gprs[15], &regs, 0, NULL, NULL);
Index: linux-2.6/include/asm-s390/thread_info.h
===================================================================
--- linux-2.6.orig/include/asm-s390/thread_info.h
+++ linux-2.6/include/asm-s390/thread_info.h
@@ -38,6 +38,7 @@
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
 #include <asm/lowcore.h>
+#include <asm/sie64.h>
 
 /*
  * low level task data that entry.S needs immediate access to
@@ -52,6 +53,8 @@ struct thread_info {
 	unsigned int		cpu;		/* current CPU */
 	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	struct restart_block	restart_block;
+	struct s390host_data	*s390host_data;	/* s390host data */
+	int			sie_cpu;	/* sie cpu number */
 };
 
 /*
@@ -67,6 +70,8 @@ struct thread_info {
 	.restart_block	= {			\
 		.fn = do_no_restart_syscall,	\
 	},					\
+	.s390host_data	= NULL,			\
+	.sie_cpu	= 0,			\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
@@ -81,7 +86,8 @@ static inline struct thread_info *curren
 /* thread information allocation */
 #define alloc_thread_info(tsk) ((struct thread_info *) \
 	__get_free_pages(GFP_KERNEL,THREAD_ORDER))
-#define free_thread_info(ti) free_pages((unsigned long) (ti),THREAD_ORDER)
+
+extern void free_thread_info(struct thread_info *);
 
 #endif
 
Index: linux-2.6/arch/s390/Kconfig
===================================================================
--- linux-2.6.orig/arch/s390/Kconfig
+++ linux-2.6/arch/s390/Kconfig
@@ -153,6 +153,7 @@ config S390_SWITCH_AMODE
 
 config S390_EXEC_PROTECT
 	bool "Data execute protection"
+	depends on !S390_HOST
 	select S390_SWITCH_AMODE
 	help
 	  This option allows to enable a buffer overflow protection for user
@@ -514,6 +515,12 @@ config KEXEC
 	  current kernel, and to start another kernel.  It is like a reboot
 	  but is independent of hardware/microcode support.
 
+config S390_HOST
+	bool "s390 host support (EXPERIMENTAL)"
+	depends on 64BIT && EXPERIMENTAL
+	select S390_SWITCH_AMODE
+	help
+	  Select this option if you want to host s390 guest Linux images
 endmenu
 
 source "net/Kconfig"
Index: linux-2.6/arch/s390/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/setup.c
+++ linux-2.6/arch/s390/kernel/setup.c
@@ -394,7 +394,11 @@ static int __init early_parse_ipldelay(c
 early_param("ipldelay", early_parse_ipldelay);
 
 #ifdef CONFIG_S390_SWITCH_AMODE
+#ifdef CONFIG_S390_HOST
+unsigned int switch_amode = 1;
+#else
 unsigned int switch_amode = 0;
+#endif
 EXPORT_SYMBOL_GPL(switch_amode);
 
 static void set_amode_and_uaccess(unsigned long user_amode,




* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Anthony Liguori @ 2007-04-27 15:23 UTC (permalink / raw)
  To: Carsten Otte
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	cborntra-tA70FqPdS9bQT0dZR+AlfA,
	schwidefsky-tA70FqPdS9bQT0dZR+AlfA

Can you provide a high-level description of how this virtualization 
mechanism works?  How does MMU virtualization work?

Carsten Otte wrote:
> +
> +#ifndef _ASM_S390_SIE64_H
> +#define _ASM_S390_SIE64_H
> +
> +struct sie_block {
> +	atomic_t cpuflags;	/* 0x0000 */
> +	__u32	prefix;		/* 0x0004 */
> +	__u32	:32;		/* 0x0008 */
> +	__u32	:32;		/* 0x000c */
> +	__u64	:64;		/* 0x0010 */
> +	__u64	:64;		/* 0x0018 */
> +	__u64	:64;		/* 0x0020 */
> +	__u64	cputm;		/* 0x0028 */
> +	__u64	ckc;		/* 0x0030 */
> +	__u64	epoch;		/* 0x0038 */
> +	__u8	svcnn	:1,	/* 0x0040 */
> +		svc1c	:1,
> +		svc2c	:1,
> +		svc3c	:1,
> +		:4;
>   

I think bitfields can have weird padding issues.

Regards,

Anthony Liguori


* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Carsten Otte @ 2007-04-27 15:45 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	Christian Borntraeger, mschwid2-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8


Anthony Liguori wrote:
> Can you provide a high-level description of how this virtualization 
> mechanism works? 
Very well. First of all, userspace allocates regular memory via
malloc() or mmap()s a file. The user address space equals the virtual
machine's address space; user space can define an address offset where
the virtual machine starts and a maximum size of the virtual machine.
Both are set in each of the SIE control blocks.
Userspace uses open/read/close to read a kernel image into its proper
location.

Next, userspace launches a set of pthreads. Each pthread corresponds
to a virtual CPU, and each pthread calls the sys_s390host_add_cpu
system call.
Now the first CPU starts interpretive execution via sys_s390host_sie.
The CPU will run the virtual cpu context for a while and exit again
with an intercept code that indicates the reason for the exit.
Userspace deals with that and enters sys_s390host_sie again. Later on,
all other CPUs get started via an interprocessor signal (SIGP).
Userspace will also receive this as a SIE intercept. Now all pthreads
representing the different virtual CPUs enter the virtual machine
context simultaneously.
Interprocessor signals (need_resched and such) are handled by the
hardware as long as sender and receiver are both in virtual context.
Otherwise, userspace gets control again.

In the end, when the machine is halted (or panics), all CPUs return
with an intercept code indicating that execution has ended for that
CPU. Userspace then calls sys_s390host_remove_cpu to free things up
and finally calls exit().
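
If it helps, the launcher side could be sketched very roughly like this
(made up for illustration, not taken from our test code; the load
offset and sizes are arbitrary, and vcpu_thread() stands for the
per-cpu add_cpu/sie loop described above):

#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/mman.h>

#define GUEST_SIZE	(256UL << 20)
#define NR_VCPUS	2

extern void *vcpu_thread(void *arg);	/* sys_s390host_add_cpu + sie loop */

static int start_guest(const char *image)
{
	pthread_t tid[NR_VCPUS];
	char *guest;
	long i;
	int fd;

	/* guest real storage is simply part of our own address space */
	guest = mmap(NULL, GUEST_SIZE, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* open/read/close the kernel image into its load location */
	fd = open(image, O_RDONLY);
	read(fd, guest + 0x10000, GUEST_SIZE - 0x10000);
	close(fd);

	/* one pthread per virtual cpu */
	for (i = 0; i < NR_VCPUS; i++)
		pthread_create(&tid[i], NULL, vcpu_thread, (void *)i);
	for (i = 0; i < NR_VCPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}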

> How does MMU virtualization work?
s390 does have hardware support for memory management in virtualized
environments. The first patch of this series introduces an extension
to the page table entry that tracks dirty/reference bits for host and
virtual machine separately. For the guest, the CPU transparently
merges the real dirty and reference bits stored in the storage key and
in the page table entry extension. The virtual machine can set up its
own page tables inside the box and use dynamic address translation.
This does not require intervention by the host, except that the host
needs to do its part of the dirty and reference tracking in the
extended page table entry explicitly.

cheers,
Carsten


* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Arnd Bergmann @ 2007-04-27 16:53 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
  Cc: cborntra-tA70FqPdS9bQT0dZR+AlfA, schwidefsky-tA70FqPdS9bQT0dZR+AlfA

On Friday 27 April 2007, Carsten Otte wrote:
> Add interface which allows a process to start a virtual machine.
 
Looks pretty good to me already.

It seems a lot closer to lguest than to kvm at the moment concerning
the kernel interfaces (guest real address space == host process address
space, state is attached to the thread_info, ...).

Now for the nitpicking:

> +static DEFINE_MUTEX(s390host_init_mutex);
> +
> +static void s390host_get_data(struct s390host_data *data)
> +{
> +	atomic_inc(&data->count);
> +}
> +
> +void s390host_put_data(struct s390host_data *data)
> +{
> +	int cpu;
> +
> +	if (atomic_dec_return(&data->count))
> +		return;
> +
> +	for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
> +		if (data->sie_io[cpu])
> +			free_page((unsigned long)data->sie_io[cpu]);
> +
> +	if (data->sca_block)
> +		free_page((unsigned long)data->sca_block);
> +
> +	kfree(data);
> +}

These should probably use a struct kref instead of an atomic_t
to count the references.
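
Untested sketch of what I mean (the release callback name is made up,
and the context allocation would need a kref_init()):

#include <linux/kref.h>

struct s390host_data {
	struct kref	 kref;
	struct sie_io	 *sie_io[S390HOST_MAX_CPUS];
	struct sca_block *sca_block;
};

static void s390host_release(struct kref *kref)
{
	struct s390host_data *data =
		container_of(kref, struct s390host_data, kref);
	int cpu;

	for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
		if (data->sie_io[cpu])
			free_page((unsigned long)data->sie_io[cpu]);
	if (data->sca_block)
		free_page((unsigned long)data->sca_block);
	kfree(data);
}

static void s390host_get_data(struct s390host_data *data)
{
	kref_get(&data->kref);
}

void s390host_put_data(struct s390host_data *data)
{
	kref_put(&data->kref, s390host_release);
}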

> +static unsigned long
> +s390host_create_io_area(unsigned long addr, unsigned long flags,
> +			unsigned long io_addr, struct s390host_data *data)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +	unsigned long ret;
> +
> +	flags &= MAP_FIXED;
> +	addr = get_unmapped_area(NULL, addr, 2 * PAGE_SIZE, 0, flags);
> +
> +	if (addr & ~PAGE_MASK)
> +		return addr;
> +
> +	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
> +
> +	if (!vma)
> +		return -ENOMEM;
> +
> +	vma->vm_mm = mm;
> +	vma->vm_start = addr;
> +	vma->vm_end = addr + 2 * PAGE_SIZE;
> +	vma->vm_flags =	VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
> +	vma->vm_flags |= VM_SHARED | VM_MAYSHARE | VM_DONTCOPY;
> +
> +#if 1	/* FIXME: write access until sys_s390host_sie interface is final */
> +	vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
> +#endif
> +
> +	vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
> +	vma->vm_private_data = data;
> +	vma->vm_ops = &s390host_vmops;
> +
> +	down_write(&mm->mmap_sem);
> +	ret = insert_vm_struct(mm, vma);
> +	if (ret) {
> +		kmem_cache_free(vm_area_cachep, vma);
> +		goto out;
> +	}
> +	s390host_get_data(data);
> +	mm->total_vm += 2;
> +	vm_insert_page(vma, addr, virt_to_page(io_addr));
> +
> +	ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
> +	if (ret)
> +		goto out;
> +	s390host_get_data(data);
> +
> +	vma = find_vma(mm, addr + PAGE_SIZE);
> +	vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
> +	vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
> +	vm_insert_page(vma, addr + PAGE_SIZE,
> +		       virt_to_page(io_addr + PAGE_SIZE));
> +	ret = addr;
> +out:
> +	up_write(&mm->mmap_sem);
> +	return ret;
> +}

This open-coded mmap looks a bit scary. I can't see anything wrong in it,
but it may conflict with a user application that tries to do other strange
things with memory maps.

Maybe you can either make this a real call to mmap(), or have it automatically
mapped using the vdso mechanism.

> +long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
> +			  struct sie_block __user *sie_template)
> +{
> +	struct sie_block *sie_block;
> +	struct sie_io *sie_io;
> +	struct sca_block *sca_block;
> +	struct s390host_data *data = NULL;
> +	unsigned long ret;
> +	__u16 cpu;

__u16 is a type for user space interfaces. In the kernel, use u16.

> +	if (current_thread_info()->sie_cpu != -1)
> +		return -EINVAL;

-EBUSY?

> +	if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
> +		return -EFAULT;

get_user()?

> +	if (cpu >= S390HOST_MAX_CPUS)
> +		return -EINVAL;
> +
> +	mutex_lock(&s390host_init_mutex);
> +
> +	data = get_s390host_context();
> +	if (!data) {
> +		ret = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	sca_block = data->sca_block;
> +	if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {

__test_bit()?

> +		ret = -EINVAL;

-EBUSY?

> +int sys_s390host_remove_cpu(void)
> +{
> +	struct sca_block *sca_block;
> +	int cpu;
> +
> +	cpu = current_thread_info()->sie_cpu;
> +	if (cpu == -1)
> +		return -EINVAL;
> +
> +	mutex_lock(&s390host_init_mutex);
> +	sca_block = current_thread_info()->s390host_data->sca_block;
> +	sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
> +	current_thread_info()->sie_cpu = -1;
> +	mutex_unlock(&s390host_init_mutex);
> +	return 0;
> +}

Shouldn't this free the resources that were allocated by add_cpu?
If not, this syscall seems pretty useless and I'd simply not
do it at all -- just wait until exit() to clean up.

> +	sie_io = current_thread_info()->s390host_data->sie_io[cpu];
> +
> +	if (action)
> +		ret = s390host_do_action(action, sie_io);
> +	if (ret)
> +		goto out_err;
> +	sie_kernel = &sie_io->sie_kernel;
> +	sie_user = &sie_io->sie_user;
> +
> +	save_fp_regs(&sie_kernel->host_fpregs);
> +	save_access_regs(sie_kernel->host_acrs);
> +	sie_user->guest_fpregs.fpc &= FPC_VALID_MASK;
> +	restore_fp_regs(&sie_user->guest_fpregs);
> +	restore_access_regs(sie_user->guest_acrs);
> +	memcpy(&sie_kernel->sie_block.gg14, &sie_user->gprs[14], 16);
> +again:
> +	if (need_resched())
> +		schedule();
> +
> +	sie_kernel->sie_block.icptcode = 0;
> +	ret = sie64a(sie_kernel);
> +	if (ret)
> +		goto out;
> +
> +	if (signal_pending(current)) {
> +		ret = -EINTR;
> +		goto out;
> +	}

Can't you handle interrupted system calls? I think it would be nicer
to return -ERESTARTSYS here so you don't have to go back to your
user space after a signal.

> +struct sie_kernel {
> +	struct sie_block	sie_block;
> +	s390_fp_regs	host_fpregs;
> +	int		host_acrs[NUM_ACRS];
> +} __attribute__((packed,aligned(4096)));

Why packed?

> +#define SIE_UPDATE_PSW		(1UL << 0)
> +#define SIE_FLUSH_TLB		(1UL << 1)
> +#define SIE_ISKE		(1UL << 2)
> +#define SIE_SSKE		(1UL << 3)
> +#define SIE_BLOCK_UPDATE	(1UL << 4)
> +#define SIE_VSMXM_LOCAL_UPDATE	(1UL << 5)
> +#define SIE_VSMXM_DIST_UPDATE   (1UL << 6)

I find it much more readable to define these like

#define SIE_UPDATE_PSW		1
#define SIE_FLUSH_TLB		2
#define SIE_ISKE		4

especially when trying to interpret values in memory.

> +struct sie_user {
> +	struct sie_block	sie_block;
> +	psw_t			psw;
> +	unsigned long		gprs[NUM_GPRS];
> +	s390_fp_regs		guest_fpregs;
> +	int			guest_acrs[NUM_ACRS];
> +	struct sie_skey_parm	iske_parm;
> +	struct sie_skey_parm	sske_parm;
> +	int			vsmxm_or_local;
> +	int			vsmxm_and_local;
> +	int			vsmxm_or;
> +	int			vsmxm_and;
> +	int			vsmxm_cpuid;
> +} __attribute__((packed,aligned(4096)));

Is this data structure extensible? If it is, you probably need
some sort of versioning information to make sure that user space
doesn't rely on fields that the kernel doesn't know about.

> +struct sca_entry {
> +	atomic_t scn;
> +	__u64	reserved;
> +	__u64	sda;
> +	__u64	reserved2[2];
> +}__attribute__((packed));

Is this a hardware data structure? If not, you shouldn't pack it
because the first word is only 32 bits and consequently all following
ones are unaligned.

> +/* function definitions */
> +extern int sie64a(struct sie_kernel *);
> +extern void s390host_put_data(struct s390host_data *);

These should be inside of '#ifdef __KERNEL__', like all declarations that user
space doesn't need to see.

> @@ -52,6 +53,8 @@ struct thread_info {
>  	unsigned int		cpu;		/* current CPU */
>  	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
>  	struct restart_block	restart_block;
> +	struct s390host_data	*s390host_data;	/* s390host data */
> +	int			sie_cpu;	/* sie cpu number */
>  };

What happens to these variables on execve?

	Arnd <><


* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Heiko Carstens @ 2007-04-29  8:00 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	cborntra-tA70FqPdS9bQT0dZR+AlfA,
	schwidefsky-tA70FqPdS9bQT0dZR+AlfA

On Fri, Apr 27, 2007 at 06:53:40PM +0200, Arnd Bergmann wrote:
> On Friday 27 April 2007, Carsten Otte wrote:
> > Add interface which allows a process to start a virtual machine.
> 
> Looks pretty good to me already.
> 
> It seems a lot closer to lguest than to kvm at the moment concerning
> the kernel interfaces (guest real address == host process, state
> is attached to the thread_info, ...).

Guess I should take a look at lguest then :)

> > +static DEFINE_MUTEX(s390host_init_mutex);
> > +
> > +static void s390host_get_data(struct s390host_data *data)
> > +{
> > +	atomic_inc(&data->count);
> > +}
> > +
> > +void s390host_put_data(struct s390host_data *data)
> > +{
> > +	int cpu;
> > +
> > +	if (atomic_dec_return(&data->count))
> > +		return;
> 
> These should probably use a struct kref instead of an atomic_t
> to count the references.

That makes sense.

> 
> > +static unsigned long
> > +s390host_create_io_area(unsigned long addr, unsigned long flags,
> > +			unsigned long io_addr, struct s390host_data *data)
> > +{
[...]
> > +	down_write(&mm->mmap_sem);
> > +	ret = insert_vm_struct(mm, vma);
> > +	if (ret) {
> > +		kmem_cache_free(vm_area_cachep, vma);
> > +		goto out;
> > +	}
> > +	s390host_get_data(data);
> > +	mm->total_vm += 2;
> > +	vm_insert_page(vma, addr, virt_to_page(io_addr));
> > +
> > +	ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
> > +	if (ret)
> > +		goto out;
> 
> This open-coded mmap looks a bit scary. I can't see anything wrong in it,
> but it may conflict with a user application that tries to do other strange
> things with memory maps.

Well, there is at least already one bug in it. If the call to split_vma fails
there is no cleanup of the vm_insert_page that has already happened.

> Maybe you can either make this a real call to mmap(), or have it automatically
> mapped using the vdso mechanism.

So yes, we need to find something better, since this is the code that I
don't like at all.

> > +	__u16 cpu;
> 
> __u16 is a type for user space interfaces. In the kernel, use u16.

Yes.

> > +	if (current_thread_info()->sie_cpu != -1)
> > +		return -EINVAL;
> 
> -EBUSY?

Yes.

> > +	if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
> > +		return -EFAULT;
> 
> get_user()?

Yes.

> > +	sca_block = data->sca_block;
> > +	if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {
> 
> __test_bit()?

Yes, but I think we should probably add some function to access the mcn
bits. Each access looks a bit complicated.
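Something like this, untested; the helper names are just picked for
illustration, and the bit numbering is meant to match the open-coded
shifts we have now:

static inline int sca_cpu_present(struct sca_block *sca, unsigned int cpu)
{
	return test_bit(S390HOST_MAX_CPUS - 1 - cpu,
			(unsigned long *)&sca->mcn);
}

static inline void sca_add_cpu(struct sca_block *sca, unsigned int cpu)
{
	set_bit(S390HOST_MAX_CPUS - 1 - cpu, (unsigned long *)&sca->mcn);
}

static inline void sca_del_cpu(struct sca_block *sca, unsigned int cpu)
{
	clear_bit(S390HOST_MAX_CPUS - 1 - cpu, (unsigned long *)&sca->mcn);
}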
 
> > +		ret = -EINVAL;
> 
> -EBUSY?

Yes.

> > +int sys_s390host_remove_cpu(void)
> > +{
> > +	struct sca_block *sca_block;
> > +	int cpu;
> > +
> > +	cpu = current_thread_info()->sie_cpu;
> > +	if (cpu == -1)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&s390host_init_mutex);
> > +	sca_block = current_thread_info()->s390host_data->sca_block;
> > +	sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
> > +	current_thread_info()->sie_cpu = -1;
> > +	mutex_unlock(&s390host_init_mutex);
> > +	return 0;
> > +}
> 
> Shouldn't this free the resources that were allocated by add_cpu?

Yes, it was supposed to do that. But since it was only a prototype I was
too lazy to code it up and so I decided to free things on exit.

> If not, this syscall seems pretty useless and I'd simply not
> do it at all -- just wait until exit() to clean up.

This reveals that we are not clearing the mcn bit on exit without a
prior call to this syscall. BUG().

> > +	if (signal_pending(current)) {
> > +		ret = -EINTR;
> > +		goto out;
> > +	}
> 
> Can't you handle interrupted system calls? I think it would be nicer
> to return -ERESTARTSYS here so you don't have to go back to your
> user space after a signal.

That should work and it is better.

> > +struct sie_kernel {
> > +	struct sie_block	sie_block;
> > +	s390_fp_regs	host_fpregs;
> > +	int		host_acrs[NUM_ACRS];
> > +} __attribute__((packed,aligned(4096)));
> 
> Why packed?

Not needed.

> > +#define SIE_UPDATE_PSW		(1UL << 0)
> > +#define SIE_FLUSH_TLB		(1UL << 1)
> > +#define SIE_ISKE		(1UL << 2)
> > +#define SIE_SSKE		(1UL << 3)
> > +#define SIE_BLOCK_UPDATE	(1UL << 4)
> > +#define SIE_VSMXM_LOCAL_UPDATE	(1UL << 5)
> > +#define SIE_VSMXM_DIST_UPDATE   (1UL << 6)
> 
> I find it much more readable to define these like
> 
> #define SIE_UPDATE_PSW		1
> #define SIE_FLUSH_TLB		2
> #define SIE_ISKE		4
> 
> especially when trying to interpret values in memory.

Hmm... don't know. Both make sense, but I have no preference on this.
 
> > +struct sie_user {
> > +	struct sie_block	sie_block;
> > +	psw_t			psw;
> > +	unsigned long		gprs[NUM_GPRS];
> > +	s390_fp_regs		guest_fpregs;
> > +	int			guest_acrs[NUM_ACRS];
> > +	struct sie_skey_parm	iske_parm;
> > +	struct sie_skey_parm	sske_parm;
> > +	int			vsmxm_or_local;
> > +	int			vsmxm_and_local;
> > +	int			vsmxm_or;
> > +	int			vsmxm_and;
> > +	int			vsmxm_cpuid;
> > +} __attribute__((packed,aligned(4096)));
> 
> Is this data structure extensible? If it is, you probably need
> some sort of versioning information to make sure that user space
> doesn't rely on fields that the kernel doesn't know about.

I don't think we can put in some versioning information here. If
the kernel decides to increase the version, old userspace code
would break, wouldn't it?
We rather need some mechanism so userspace can ask the kernel
"do you support feature x?" and, depending on the answer, some
fields are used or unused.

> > +struct sca_entry {
> > +	atomic_t scn;
> > +	__u64	reserved;
> > +	__u64	sda;
> > +	__u64	reserved2[2];
> > +}__attribute__((packed));
> 
> Is this a hardware data structure? If not, you shouldn't pack it
> because the first word is only 32 bits and consequently all following
> ones are unaligned.

It's a hardware structure. Guess the unaligned u64s were generated when
this structure was extended to support 64bit.
But of course we shouldn't put an atomic_t in a hardware structure, since
its implementation might change.
 
> > +/* function definitions */
> > +extern int sie64a(struct sie_kernel *);
> > +extern void s390host_put_data(struct s390host_data *);
> 
> These should be inside of '#ifdef __KERNEL__', like all declarations that user
> space doesn't need to see.

Yes, of course.

> 
> > @@ -52,6 +53,8 @@ struct thread_info {
> >  	unsigned int		cpu;		/* current CPU */
> >  	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
> >  	struct restart_block	restart_block;
> > +	struct s390host_data	*s390host_data;	/* s390host data */
> > +	int			sie_cpu;	/* sie cpu number */
> >  };
> 
> What happens to these variables on execve?

They stay... We probably should do some cleanup here e.g. by adding some
code to init_new_context().

Thanks for reviewing!


* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Arnd Bergmann @ 2007-05-01 21:12 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	cborntra-tA70FqPdS9bQT0dZR+AlfA,
	schwidefsky-tA70FqPdS9bQT0dZR+AlfA

On Sunday 29 April 2007, Heiko Carstens wrote:

> > Is this data structure extensible? If it is, you probably need
> > some sort of versioning information to make sure that user space
> > doesn't rely on fields that the kernel doesn't know about.
> 
> I don't think we can put in some versioning information here. If
> the kernel decides to increase the version then old userspace
> code would break?
> We rather need some mechanism so userpace can ask the kernel
> "do you support feature x?" and dependent on the answer some
> fields are used or unused.

You could do it the way that ext2 handles compatible and incompatible
features in the on-disk layout:

Assign a number of bits in the read-only part of the mapping to flags
that the user application can test. A bit in the compatible range means
that a feature is available to the user application if it wants to
use it. A bit in the incompatible range means that the user space needs
to understand how to use a feature in order to run correctly.
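
As a completely made-up illustration of the layout (all names and bits
here are placeholders, not a proposal for concrete features):

/* at a fixed offset in the read-only page, filled in by the kernel */
struct sie_features {
	__u64	compat;		/* nice to have, safe to ignore */
	__u64	incompat;	/* must be understood to run at all */
};

#define SIE_COMPAT_FOO		(1UL << 0)	/* made-up feature */
#define SIE_INCOMPAT_BAR	(1UL << 0)	/* made-up feature */
#define SIE_INCOMPAT_KNOWN	SIE_INCOMPAT_BAR

static int check_features(const struct sie_features *f)
{
	/* anything unknown in the incompatible range is fatal */
	if (f->incompat & ~SIE_INCOMPAT_KNOWN)
		return -1;
	/* compatible bits are purely optional */
	return f->compat & SIE_COMPAT_FOO;	/* caller may use FOO */
}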

> > > +struct sca_entry {
> > > +   atomic_t scn;
> > > +   __u64   reserved;
> > > +   __u64   sda;
> > > +   __u64   reserved2[2];
> > > +}__attribute__((packed));
> > 
> > Is this a hardware data structure? If not, you shouldn't pack it
> > because the first word is only 32 bits and consequently all following
> > ones are unaligned.
> 
> It's a hardware structure. Guess the unaligned u64s were generated when
> this structure was extended to support 64bit.
> But of course we shouldn't put an atomic_t in a hardware structure, since
> it's implementation might change.

Not sure about that point. If you need to do atomic operations on the
first 32 bits, you shouldn't need to invent your own abstractions for
those, and it's highly unlikely that the implementation of atomic_t changes.

	Arnd <><


* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Avi Kivity @ 2007-05-02  4:46 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	cborntra-tA70FqPdS9bQT0dZR+AlfA,
	schwidefsky-tA70FqPdS9bQT0dZR+AlfA

Arnd Bergmann wrote:
> On Sunday 29 April 2007, Heiko Carstens wrote:
>
>   
>>> Is this data structure extensible? If it is, you probably need
>>> some sort of versioning information to make sure that user space
>>> doesn't rely on fields that the kernel doesn't know about.
>>>       
>> I don't think we can put in some versioning information here. If
>> the kernel decides to increase the version then old userspace
>> code would break?
>> We rather need some mechanism so userpace can ask the kernel
>> "do you support feature x?" and dependent on the answer some
>> fields are used or unused.
>>     
>
> You could do it the way that ext2 handles compatible and incompatible
> features in the on-disk layout:
>
> Assign a number of bits in the read-only part of the mapping to flags
> that the user application can test. A bit in the compatible range mean
> that a feature is available to the user application if it wants to
> use it. A bit in the incompatible range means that the user space needs
> to understand how to use a feature in order to run correctly.
>   

The current ioctl() interface has a feature test call: basically you
send down a feature number and see if it's supported or not.  Fairly
similar to a feature bit, except it isn't limited in the number of bits.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.



* Re: [PATCH/RFC 2/2] s390 virtualization interface.
From: Christoph Hellwig @ 2007-05-02  8:25 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	cborntra-tA70FqPdS9bQT0dZR+AlfA,
	schwidefsky-tA70FqPdS9bQT0dZR+AlfA

On Tue, May 01, 2007 at 11:12:11PM +0200, Arnd Bergmann wrote:
> Not sure about that point. If you need to do atomic operations on the
> first 32 bits, you shouldn't need to invent your own abstractions for
> those, and it's highly unlikely that the implementation of atomic_t changes.

I disagree.  Using an atomic_t in a hardware structure is against all
the abstractions we've built.  It's much better to have separate macros
to atomically modify a word in this hardware structure, even if they end
up exactly the same as the atomic_ macros - at least this way we clearly
document their intent.


