* [PATCH/RFC 2/2] s390 virtualization interface.
@ 2007-04-27 13:40 Carsten Otte
[not found] ` <1177681235.5770.22.camel-WIxn4w2hgUz3YA32ykw5MLlKpX0K8NHHQQ4Iyu8u01E@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Carsten Otte @ 2007-04-27 13:40 UTC (permalink / raw)
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: cborntra-tA70FqPdS9bQT0dZR+AlfA, schwidefsky-tA70FqPdS9bQT0dZR+AlfA
Add interface which allows a process to start a virtual machine.
To keep things easy each thread group is allowed to have only one
virtual machine and each thread of the thread group can only control
one virtual cpu of the virtual machine. All the information about
the virtual machines/cpus can be found via the thread_info structures
of the participating threads.
This patch adds three new s390 specific system calls:
long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
struct sie_block __user *sie_template)
Adds a new cpu to the virtual machine that belongs to the current
thread group. If no virtual machine exists it will be created. In
addition two pages will be allocated and mapped at <addr> into the
address space of the process. These two pages are used so user space
and kernel space can easily exchange/modify the state of the
corresponding virtual cpu without a ton of copy_from/to_user calls.
The sie_template is a pointer to a data structure that contains
initial information how the virtual cpu should be setup. The
resulting block will be used as a parameter to issue the sie (start
interpretive execution) instruction which starts a virtual cpu.
int sys_s390host_remove_cpu(void)
Removes a virtual cpu from a virtual machine.
int sys_s390host_sie(unsigned long action)
Starts / re-enters the virtual cpu of the virtual machine that the
thread belongs to, if any.
Please note that this patch is nothing more than a proof-of-concept
and may contain quite a few bugs.
Since we want to convert to use kvm instead, most of this will be
dropped anyway. But maybe this is of interest for others as well.
---
arch/s390/Kconfig | 7
arch/s390/Makefile | 2
arch/s390/host/Makefile | 5
arch/s390/host/s390host.c | 411 +++++++++++++++++++++++++++++++++++++++++
arch/s390/host/sie64a.S | 37 +++
arch/s390/kernel/asm-offsets.c | 2
arch/s390/kernel/process.c | 15 +
arch/s390/kernel/setup.c | 4
arch/s390/kernel/syscalls.S | 3
include/asm-s390/sie64.h | 276 +++++++++++++++++++++++++++
include/asm-s390/thread_info.h | 8
include/asm-s390/unistd.h | 5
kernel/sys_ni.c | 3
13 files changed, 773 insertions(+), 5 deletions(-)
Index: linux-2.6/arch/s390/kernel/asm-offsets.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/asm-offsets.c
+++ linux-2.6/arch/s390/kernel/asm-offsets.c
@@ -44,5 +44,7 @@ int main(void)
DEFINE(__SF_BACKCHAIN, offsetof(struct stack_frame, back_chain),);
DEFINE(__SF_GPRS, offsetof(struct stack_frame, gprs),);
DEFINE(__SF_EMPTY, offsetof(struct stack_frame, empty1),);
+ BLANK();
+ DEFINE(__SIE_USER_gprs, offsetof(struct sie_user, gprs),);
return 0;
}
Index: linux-2.6/arch/s390/kernel/syscalls.S
===================================================================
--- linux-2.6.orig/arch/s390/kernel/syscalls.S
+++ linux-2.6/arch/s390/kernel/syscalls.S
@@ -322,3 +322,6 @@ NI_SYSCALL /* 310 sys_move_pages *
SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper)
SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper)
SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper)
+SYSCALL(sys_ni_syscall,sys_s390host_add_cpu,sys_ni_syscall)
+SYSCALL(sys_ni_syscall,sys_s390host_remove_cpu,sys_ni_syscall)
+SYSCALL(sys_ni_syscall,sys_s390host_sie,sys_ni_syscall)
Index: linux-2.6/arch/s390/host/Makefile
===================================================================
--- /dev/null
+++ linux-2.6/arch/s390/host/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the s390host components.
+#
+
+obj-$(CONFIG_S390_HOST) += s390host.o sie64a.o
Index: linux-2.6/arch/s390/host/sie64a.S
===================================================================
--- /dev/null
+++ linux-2.6/arch/s390/host/sie64a.S
@@ -0,0 +1,37 @@
+/*
+ * arch/s390/host/sie64a.S
+ * low level sie call
+ *
+ * Copyright IBM Corp. 2007
+ * Author(s): Heiko Carstens <heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
+ */
+
+#include <linux/errno.h>
+#include <asm/asm-offsets.h>
+
+SP_R6 = 6 * 8 # offset into stackframe
+
+ .globl sie64a
+sie64a:
+ stmg %r6,%r15,SP_R6(%r15) # save register on entry
+ lgr %r14,%r2 # pointer to program parms
+ aghi %r2,4096
+ lmg %r0,%r13,__SIE_USER_gprs(%r2) # load guest gprs 0-13
+sie_inst:
+ sie 0(%r14)
+ aghi %r14,4096
+ stmg %r0,%r13,__SIE_USER_gprs(%r14) # save guest gprs 0-13
+ lghi %r2,0
+ lmg %r6,%r15,SP_R6(%r15)
+ br %r14
+
+sie_err:
+ aghi %r14,4096
+ stmg %r0,%r13,__SIE_USER_gprs(%r14) # save guest gprs 0-13
+ lghi %r2,-EFAULT
+ lmg %r6,%r15,SP_R6(%r15)
+ br %r14
+
+ .section __ex_table,"a"
+ .quad sie_inst,sie_err
+ .previous
Index: linux-2.6/arch/s390/host/s390host.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/s390/host/s390host.c
@@ -0,0 +1,411 @@
+/*
+ * s390host.c -- hosting zSeries Linux virtual engines
+ *
+ * Copyright IBM Corp. 2007
+ * Author(s): Carsten Otte <cotte-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ * Christian Borntraeger <cborntra-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ * Heiko Carstens <heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
+ */
+
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/semaphore.h>
+#include <asm/sie64.h>
+
+static int s390host_do_action(unsigned long, struct sie_io *);
+
+static DEFINE_MUTEX(s390host_init_mutex);
+
+static void s390host_get_data(struct s390host_data *data)
+{
+ atomic_inc(&data->count);
+}
+
+void s390host_put_data(struct s390host_data *data)
+{
+ int cpu;
+
+ if (atomic_dec_return(&data->count))
+ return;
+
+ for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
+ if (data->sie_io[cpu])
+ free_page((unsigned long)data->sie_io[cpu]);
+
+ if (data->sca_block)
+ free_page((unsigned long)data->sca_block);
+
+ kfree(data);
+}
+
+static void s390host_vma_close(struct vm_area_struct *vma)
+{
+ s390host_put_data(vma->vm_private_data);
+}
+
+static struct page *s390host_vma_nopage(struct vm_area_struct *vma,
+ unsigned long address, int *type)
+{
+ return NOPAGE_SIGBUS;
+}
+
+static struct vm_operations_struct s390host_vmops = {
+ .close = s390host_vma_close,
+ .nopage = s390host_vma_nopage,
+};
+
+static struct s390host_data *get_s390host_context(void)
+{
+ struct thread_info *tif;
+ struct sca_block *sca_block = NULL;
+ struct s390host_data *data = NULL;
+ struct task_struct *tsk;
+
+ /* zlh context for current thread already created? */
+ tif = current_thread_info();
+ if (tif->s390host_data)
+ return tif->s390host_data;
+
+ /* zlh context in thread group available? */
+ write_lock_irq(&tasklist_lock);
+ tsk = next_thread(current);
+ for (; tsk != current; tsk = next_thread(tsk)) {
+ data = tsk->thread_info->s390host_data;
+ if (data) {
+ s390host_get_data(data);
+ tif->s390host_data = data;
+ break;
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+
+ if (data)
+ return data;
+
+ /* create new context */
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+ if (!data)
+ return NULL;
+
+ sca_block = (struct sca_block *)get_zeroed_page(GFP_KERNEL);
+
+ if (!sca_block) {
+ kfree(data);
+ return NULL;
+ }
+
+ data->sca_block = sca_block;
+ tif->s390host_data = data;
+ s390host_get_data(data);
+
+ return data;
+}
+
+static unsigned long
+s390host_create_io_area(unsigned long addr, unsigned long flags,
+ unsigned long io_addr, struct s390host_data *data)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret;
+
+ flags &= MAP_FIXED;
+ addr = get_unmapped_area(NULL, addr, 2 * PAGE_SIZE, 0, flags);
+
+ if (addr & ~PAGE_MASK)
+ return addr;
+
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma->vm_mm = mm;
+ vma->vm_start = addr;
+ vma->vm_end = addr + 2 * PAGE_SIZE;
+ vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+ vma->vm_flags |= VM_SHARED | VM_MAYSHARE | VM_DONTCOPY;
+
+#if 1 /* FIXME: write access until sys_s390host_sie interface is final */
+ vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
+#endif
+
+ vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
+ vma->vm_private_data = data;
+ vma->vm_ops = &s390host_vmops;
+
+ down_write(&mm->mmap_sem);
+ ret = insert_vm_struct(mm, vma);
+ if (ret) {
+ kmem_cache_free(vm_area_cachep, vma);
+ goto out;
+ }
+ s390host_get_data(data);
+ mm->total_vm += 2;
+ vm_insert_page(vma, addr, virt_to_page(io_addr));
+
+ ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
+ if (ret)
+ goto out;
+ s390host_get_data(data);
+
+ vma = find_vma(mm, addr + PAGE_SIZE);
+ vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
+ vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
+ vm_insert_page(vma, addr + PAGE_SIZE,
+ virt_to_page(io_addr + PAGE_SIZE));
+ ret = addr;
+out:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
+ struct sie_block __user *sie_template)
+{
+ struct sie_block *sie_block;
+ struct sie_io *sie_io;
+ struct sca_block *sca_block;
+ struct s390host_data *data = NULL;
+ unsigned long ret;
+ __u16 cpu;
+
+ if (current_thread_info()->sie_cpu != -1)
+ return -EINVAL;
+
+ if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
+ return -EFAULT;
+
+ if (cpu >= S390HOST_MAX_CPUS)
+ return -EINVAL;
+
+ mutex_lock(&s390host_init_mutex);
+
+ data = get_s390host_context();
+ if (!data) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ sca_block = data->sca_block;
+ if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+
+ if (!data->sie_io[cpu]) {
+ unsigned long tmp;
+
+ /* allocate two pages: 1st is r/o 2nd r/w area */
+ tmp = __get_free_pages(GFP_KERNEL, 1);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ split_page(virt_to_page(tmp), 1);
+ data->sie_io[cpu] = (struct sie_io *)tmp;
+ }
+
+ sie_io = data->sie_io[cpu];
+ memset(sie_io, 0, 2 * PAGE_SIZE);
+
+ sie_block = &sie_io->sie_kernel.sie_block;
+ sca_block->cpu[cpu].sda = (__u64)sie_block;
+
+ if (copy_from_user(sie_block, sie_template, sizeof(struct sie_block))) {
+ ret = -EFAULT;
+ goto out_err;
+ }
+ sie_block->icpua = cpu;
+
+ ret = s390host_create_io_area(addr, flags, (unsigned long)sie_io, data);
+
+ if (ret & ~PAGE_MASK)
+ goto out_err;
+
+ sca_block->mcn |= 1UL << (S390HOST_MAX_CPUS - 1 - cpu);
+ sie_block->scaoh = (__u32)(((__u64)sca_block) >> 32);
+ sie_block->scaol = (__u32)(__u64)sca_block;
+ current_thread_info()->sie_cpu = cpu;
+ goto out;
+out_err:
+ if (data)
+ s390host_put_data(data);
+out:
+ mutex_unlock(&s390host_init_mutex);
+ return ret;
+}
+
+int sys_s390host_remove_cpu(void)
+{
+ struct sca_block *sca_block;
+ int cpu;
+
+ cpu = current_thread_info()->sie_cpu;
+ if (cpu == -1)
+ return -EINVAL;
+
+ mutex_lock(&s390host_init_mutex);
+ sca_block = current_thread_info()->s390host_data->sca_block;
+ sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
+ current_thread_info()->sie_cpu = -1;
+ mutex_unlock(&s390host_init_mutex);
+ return 0;
+}
+
+int sys_s390host_sie(unsigned long action)
+{
+ struct sie_kernel *sie_kernel;
+ struct sie_user *sie_user;
+ struct sie_io *sie_io;
+ int cpu;
+ int ret = 0;
+
+ cpu = current_thread_info()->sie_cpu;
+ if (cpu == -1)
+ return -EINVAL;
+
+ sie_io = current_thread_info()->s390host_data->sie_io[cpu];
+
+ if (action)
+ ret = s390host_do_action(action, sie_io);
+ if (ret)
+ goto out_err;
+ sie_kernel = &sie_io->sie_kernel;
+ sie_user = &sie_io->sie_user;
+
+ save_fp_regs(&sie_kernel->host_fpregs);
+ save_access_regs(sie_kernel->host_acrs);
+ sie_user->guest_fpregs.fpc &= FPC_VALID_MASK;
+ restore_fp_regs(&sie_user->guest_fpregs);
+ restore_access_regs(sie_user->guest_acrs);
+ memcpy(&sie_kernel->sie_block.gg14, &sie_user->gprs[14], 16);
+again:
+ if (need_resched())
+ schedule();
+
+ sie_kernel->sie_block.icptcode = 0;
+ ret = sie64a(sie_kernel);
+ if (ret)
+ goto out;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ switch (sie_kernel->sie_block.icptcode) {
+ case 0x00:
+ case 0x24:
+ goto again;
+ }
+out:
+ memcpy(&sie_user->gprs[14], &sie_kernel->sie_block.gg14, 16);
+ save_fp_regs(&sie_user->guest_fpregs);
+ save_access_regs(sie_user->guest_acrs);
+ restore_fp_regs(&sie_kernel->host_fpregs);
+ restore_access_regs(sie_kernel->host_acrs);
+out_err:
+ return ret;
+}
+
+static void s390host_vsmxm_local_update(struct sie_io *sie_io)
+{
+ struct sie_kernel *local_sie_kernel;
+ struct sie_user *sie_user;
+ atomic_t *cpuflags;
+ int old, new;
+
+ mutex_lock(&s390host_init_mutex);
+
+ sie_user = &sie_io->sie_user;
+ local_sie_kernel = &sie_io->sie_kernel;
+
+ cpuflags = &local_sie_kernel->sie_block.cpuflags;
+ do {
+ old = atomic_read(cpuflags);
+ new = old | sie_user->vsmxm_or_local;
+ new &= sie_user->vsmxm_and_local;
+ } while (atomic_cmpxchg(cpuflags, old, new) != old);
+
+ mutex_unlock(&s390host_init_mutex);
+ return;
+}
+
+static int s390host_vsmxm_dist_update(struct sie_io *sie_io)
+{
+ struct sie_kernel *dist_sie_kernel;
+ struct sie_user *sie_user;
+ struct sca_block *sca_block;
+ struct thread_info *tif;
+ atomic_t *cpuflags;
+ int cpu;
+ int old, new;
+ int rc = -EINVAL;
+
+ mutex_lock(&s390host_init_mutex);
+
+ sie_user = &sie_io->sie_user;
+ cpu = sie_user->vsmxm_cpuid;
+
+ if (cpu >= S390HOST_MAX_CPUS)
+ goto out;
+
+ tif = current_thread_info();
+ sca_block = tif->s390host_data->sca_block;
+ if (!(sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))))
+ goto out;
+
+ dist_sie_kernel = &((tif->s390host_data->sie_io[cpu])->sie_kernel);
+
+ cpuflags = &dist_sie_kernel->sie_block.cpuflags;
+ do {
+ old = atomic_read(cpuflags);
+ new = old | sie_user->vsmxm_or;
+ new &= sie_user->vsmxm_and;
+ } while (atomic_cmpxchg(cpuflags, old, new) != old);
+
+ rc = 0;
+out:
+ mutex_unlock(&s390host_init_mutex);
+ return rc;
+}
+
+static int s390host_do_action(unsigned long action, struct sie_io *sie_io)
+{
+ void *src;
+ void *dest;
+ int rc = 0;
+
+ if (action & SIE_BLOCK_UPDATE) {
+ src = &(sie_io->sie_user.sie_block);
+ dest = &(sie_io->sie_kernel.sie_block);
+
+ memcpy(dest + 4, src + 4, 88);
+ memcpy(dest + 96, src + 96, 4);
+ memcpy(dest + 104, src + 104, 408);
+ }
+
+ if (action & SIE_UPDATE_PSW)
+ sie_io->sie_kernel.sie_block.psw.gpsw = sie_io->sie_user.psw;
+
+ if (action & SIE_FLUSH_TLB)
+ flush_tlb_mm(current->mm);
+
+ if (action & SIE_VSMXM_LOCAL_UPDATE)
+ s390host_vsmxm_local_update(sie_io);
+
+ if (action & SIE_VSMXM_DIST_UPDATE)
+ rc = s390host_vsmxm_dist_update(sie_io);
+ return rc;
+}
Index: linux-2.6/include/asm-s390/unistd.h
===================================================================
--- linux-2.6.orig/include/asm-s390/unistd.h
+++ linux-2.6/include/asm-s390/unistd.h
@@ -251,8 +251,11 @@
#define __NR_getcpu 311
#define __NR_epoll_pwait 312
#define __NR_utimes 313
+#define __NR_s390host_add_cpu 314
+#define __NR_s390host_remove_cpu 315
+#define __NR_s390host_sie 316
-#define NR_syscalls 314
+#define NR_syscalls 317
/*
* There are some system calls that are not present on 64 bit, some
Index: linux-2.6/include/asm-s390/sie64.h
===================================================================
--- /dev/null
+++ linux-2.6/include/asm-s390/sie64.h
@@ -0,0 +1,276 @@
+/*
+ * include/asm-s390/sie64.h
+ *
+ * Copyright IBM Corp. 2007
+ * Author(s): Carsten Otte <cotte-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ * Christian Borntraeger <cborntra-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,
+ * Heiko Carstens <heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
+ *
+ */
+
+#ifndef _ASM_S390_SIE64_H
+#define _ASM_S390_SIE64_H
+
+struct sie_block {
+ atomic_t cpuflags; /* 0x0000 */
+ __u32 prefix; /* 0x0004 */
+ __u32 :32; /* 0x0008 */
+ __u32 :32; /* 0x000c */
+ __u64 :64; /* 0x0010 */
+ __u64 :64; /* 0x0018 */
+ __u64 :64; /* 0x0020 */
+ __u64 cputm; /* 0x0028 */
+ __u64 ckc; /* 0x0030 */
+ __u64 epoch; /* 0x0038 */
+ __u8 svcnn :1, /* 0x0040 */
+ svc1c :1,
+ svc2c :1,
+ svc3c :1,
+ :4;
+ __u8 svc1n; /* 0x0041 */
+ __u8 svc2n; /* 0x0042 */
+ __u8 svc3n; /* 0x0043 */
+ __u16 lctl0 :1, /* 0x0044 */
+ lctl1 :1,
+ lctl2 :1,
+ lctl3 :1,
+ lctl4 :1,
+ lctl5 :1,
+ lctl6 :1,
+ lctl7 :1,
+ lctl8 :1,
+ lctl9 :1,
+ lctla :1,
+ lctlb :1,
+ lctlc :1,
+ lctld :1,
+ lctle :1,
+ lctlf :1;
+ __s16 icpua; /* 0x0046 */
+ __u32 icpop :1, /* 0x0048 */
+ icpro :1,
+ icprg :1,
+ :4,
+ icipte :1,
+ :1, /* 0x0049 */
+ iclpsw :1,
+ icptlb :1,
+ icssm :1,
+ icbsa :1,
+ icstctl :1,
+ icstnsm :1,
+ icstosm :1,
+ icstck :1, /* 0x004a */
+ iciske :1,
+ icsske :1,
+ icrrbe :1,
+ icpc :1,
+ icpt :1,
+ ictprot :1,
+ iclasp :1,
+ :1, /* 0x004b */
+ icstpt :1,
+ icsckc :1,
+ :1,
+ icpr :1,
+ icbakr :1,
+ icpg :1,
+ :1;
+ __u32 ecext :1, /* 0x004c */
+ ecint :1,
+ ecwait :1,
+ ecsigp :1,
+ ecalt :1,
+ ecio2 :1,
+ :1,
+ ecmvp :1;
+ __u8 eca1; /* 0x004d */
+ __u8 eca2; /* 0x004e */
+ __u8 eca3; /* 0x004f */
+ __u8 icptcode; /* 0x0050 */
+ __u8 :6, /* 0x0051 */
+ icif :1,
+ icex :1;
+ __u16 ihcpu; /* 0x0052 */
+ __u16 :16; /* 0x0054 */
+ struct {
+ union {
+ __u16 ipa; /* 0x0056 */
+ __u16 inst; /* 0x0056 */
+ struct {
+ union {
+ __u8 ipa0; /* 0x0056 */
+ __u8 viwho; /* 0x0056 */
+ };
+ union {
+ __u8 ipa1; /* 0x0057 */
+ __u8 viwhen; /* 0x0057 */
+ };
+ };
+ };
+ union {
+ __u32 ipb; /* 0x0058 */
+ struct {
+ union {
+ __u16 ipbh0; /* 0x0058 */
+ __u16 viwhy; /* 0x0058 */
+ struct {
+ __u8 ipb0; /* 0x0058 */
+ __u8 ipb1; /* 0x0059 */
+ };
+ };
+ union {
+ __u16 ipbh1; /* 0x005a */
+ struct {
+ __u8 ipb2; /* 0x005a */
+ __u8 ipb3; /* 0x005b */
+ };
+ };
+ };
+ };
+ } __attribute__((packed));
+ __u32 scaoh; /* 0x005c */
+ union {
+ __u32 rcp; /* 0x0060 */
+ struct {
+ __u8 ska :1, /* 0x0060 */
+ skaip :1,
+ :6;
+ __u8 ecb :8; /* 0x0061 */
+ __u8 :3, /* 0x0062 */
+ cpby :1,
+ :4;
+ __u8 :8; /* 0x0063 */
+ };
+ };
+ __u32 scaol; /* 0x0064 */
+ __u32 :32; /* 0x0068 */
+ union {
+ __u32 todpr; /* 0x006c */
+ struct {
+ __u16 :16; /* 0x006c */
+ __u16 todpf; /* 0x006e */
+ };
+ } __attribute__((packed));
+ __u32 gisa; /* 0x0070 */
+ __u32 iopct; /* 0x0074 */
+ __u32 rsvd3; /* 0x0078 */
+ __u32 :32; /* 0x007c */
+ __u64 gmsor; /* 0x0080 */
+ __u64 gmslm; /* 0x0088 */
+ union {
+ psw_t gpsw; /* 0x0090 */
+ struct {
+ __u64 pswh; /* 0x0090 */
+
+ __u64 pswl; /* 0x0098 */
+ };
+ } psw;
+ __u64 gg14; /* 0x00a0 */
+ __u64 gg15; /* 0x00a8 */
+ __u64 :64; /* 0x00b0 */
+ __u64 :16, /* 0x00b8 */
+ xso :24,
+ xsl :24;
+ union {
+ __u8 uzp0[56]; /* 0x00c0 */
+ struct {
+ __u32 exmsf; /* 0x00c0 */
+ union {
+ __u32 iexcf; /* 0x00c4 */
+ struct {
+ __u16 iexca; /* 0x00c4 */
+ __u16 iexcd; /* 0x00c6 */
+ };
+ };
+ __u16 svcil; /* 0x00c8 */
+ __u16 svcnt; /* 0x00ca */
+ __u16 iprcl; /* 0x00cc */
+ __u16 iprcc; /* 0x00ce */
+ __u32 itrad; /* 0x00d0 */
+ __u32 imncl; /* 0x00d4 */
+ __u64 gpera; /* 0x00d8 */
+ __u8 excpar; /* 0x00e0 */
+ __u8 perar; /* 0x00e1 */
+ __u8 oprid; /* 0x00e2 */
+ __u8 :8; /* 0x00e3 */
+ __u32 :32; /* 0x00e4 */
+ __u64 gtrad; /* 0x00e8 */
+ __u32 :32; /* 0x00f0 */
+ __u32 :32; /* 0x00f4 */
+ };
+ };
+ __u16 :16; /* 0x00f8 */
+ __u16 ief; /* 0x00fa */
+ __u32 apcbk; /* 0x00fc */
+ __u64 gcr[16]; /* 0x0100 */
+ __u8 reserved[128]; /* 0x0180 */
+} __attribute__((packed));
+
+struct sie_kernel {
+ struct sie_block sie_block;
+ s390_fp_regs host_fpregs;
+ int host_acrs[NUM_ACRS];
+} __attribute__((packed,aligned(4096)));
+
+#define SIE_UPDATE_PSW (1UL << 0)
+#define SIE_FLUSH_TLB (1UL << 1)
+#define SIE_ISKE (1UL << 2)
+#define SIE_SSKE (1UL << 3)
+#define SIE_BLOCK_UPDATE (1UL << 4)
+#define SIE_VSMXM_LOCAL_UPDATE (1UL << 5)
+#define SIE_VSMXM_DIST_UPDATE (1UL << 6)
+
+struct sie_skey_parm {
+ unsigned long sk_reg;
+ unsigned long sk_addr;
+};
+
+struct sie_user {
+ struct sie_block sie_block;
+ psw_t psw;
+ unsigned long gprs[NUM_GPRS];
+ s390_fp_regs guest_fpregs;
+ int guest_acrs[NUM_ACRS];
+ struct sie_skey_parm iske_parm;
+ struct sie_skey_parm sske_parm;
+ int vsmxm_or_local;
+ int vsmxm_and_local;
+ int vsmxm_or;
+ int vsmxm_and;
+ int vsmxm_cpuid;
+} __attribute__((packed,aligned(4096)));
+
+struct sie_io {
+ struct sie_kernel sie_kernel;
+ struct sie_user sie_user;
+};
+
+struct sca_entry {
+ atomic_t scn;
+ __u64 reserved;
+ __u64 sda;
+ __u64 reserved2[2];
+}__attribute__((packed));
+
+struct sca_block {
+ __u64 ipte_control;
+ __u64 reserved[5];
+ __u64 mcn;
+ __u64 reserved2;
+ struct sca_entry cpu[64];
+}__attribute__((packed));
+
+#define S390HOST_MAX_CPUS 64
+
+struct s390host_data {
+ atomic_t count;
+ struct sie_io *sie_io[S390HOST_MAX_CPUS];
+ struct sca_block *sca_block;
+};
+
+/* function definitions */
+extern int sie64a(struct sie_kernel *);
+extern void s390host_put_data(struct s390host_data *);
+
+#endif /* _ASM_S390_SIE64_H */
Index: linux-2.6/arch/s390/Makefile
===================================================================
--- linux-2.6.orig/arch/s390/Makefile
+++ linux-2.6/arch/s390/Makefile
@@ -85,7 +85,7 @@ LDFLAGS_vmlinux := -e start
head-y := arch/s390/kernel/head.o arch/s390/kernel/init_task.o
core-y += arch/s390/mm/ arch/s390/kernel/ arch/s390/crypto/ \
- arch/s390/appldata/ arch/s390/hypfs/
+ arch/s390/appldata/ arch/s390/hypfs/ arch/s390/host/
libs-y += arch/s390/lib/
drivers-y += drivers/s390/
drivers-$(CONFIG_MATHEMU) += arch/s390/math-emu/
Index: linux-2.6/kernel/sys_ni.c
===================================================================
--- linux-2.6.orig/kernel/sys_ni.c
+++ linux-2.6/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(sys32_sysctl);
cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
+cond_syscall(sys_s390host_add_cpu);
+cond_syscall(sys_s390host_remove_cpu);
+cond_syscall(sys_s390host_sie);
/* mmu depending weak syscall entries */
cond_syscall(sys_mprotect);
Index: linux-2.6/arch/s390/kernel/process.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/process.c
+++ linux-2.6/arch/s390/kernel/process.c
@@ -274,12 +274,23 @@ int copy_thread(int nr, unsigned long cl
#endif /* CONFIG_64BIT */
/* start new process with ar4 pointing to the correct address space */
p->thread.mm_segment = get_fs();
- /* Don't copy debug registers */
- memset(&p->thread.per_info,0,sizeof(p->thread.per_info));
+ /* Don't copy debug registers */
+ memset(&p->thread.per_info,0,sizeof(p->thread.per_info));
+ p->thread_info->s390host_data = NULL;
+ p->thread_info->sie_cpu = -1;
return 0;
}
+void free_thread_info(struct thread_info *ti)
+{
+#ifdef CONFIG_S390_HOST
+ if (ti->s390host_data)
+ s390host_put_data(ti->s390host_data);
+#endif
+ free_pages((unsigned long) (ti),THREAD_ORDER);
+}
+
asmlinkage long sys_fork(struct pt_regs regs)
{
return do_fork(SIGCHLD, regs.gprs[15], &regs, 0, NULL, NULL);
Index: linux-2.6/include/asm-s390/thread_info.h
===================================================================
--- linux-2.6.orig/include/asm-s390/thread_info.h
+++ linux-2.6/include/asm-s390/thread_info.h
@@ -38,6 +38,7 @@
#ifndef __ASSEMBLY__
#include <asm/processor.h>
#include <asm/lowcore.h>
+#include <asm/sie64.h>
/*
* low level task data that entry.S needs immediate access to
@@ -52,6 +53,8 @@ struct thread_info {
unsigned int cpu; /* current CPU */
int preempt_count; /* 0 => preemptable, <0 => BUG */
struct restart_block restart_block;
+ struct s390host_data *s390host_data; /* s390host data */
+ int sie_cpu; /* sie cpu number */
};
/*
@@ -67,6 +70,8 @@ struct thread_info {
.restart_block = { \
.fn = do_no_restart_syscall, \
}, \
+ .s390host_data = NULL, \
+ .sie_cpu = 0, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -81,7 +86,8 @@ static inline struct thread_info *curren
/* thread information allocation */
#define alloc_thread_info(tsk) ((struct thread_info *) \
__get_free_pages(GFP_KERNEL,THREAD_ORDER))
-#define free_thread_info(ti) free_pages((unsigned long) (ti),THREAD_ORDER)
+
+extern void free_thread_info(struct thread_info *);
#endif
Index: linux-2.6/arch/s390/Kconfig
===================================================================
--- linux-2.6.orig/arch/s390/Kconfig
+++ linux-2.6/arch/s390/Kconfig
@@ -153,6 +153,7 @@ config S390_SWITCH_AMODE
config S390_EXEC_PROTECT
bool "Data execute protection"
+ depends on !S390_HOST
select S390_SWITCH_AMODE
help
This option allows to enable a buffer overflow protection for user
@@ -514,6 +515,12 @@ config KEXEC
current kernel, and to start another kernel. It is like a reboot
but is independent of hardware/microcode support.
+config S390_HOST
+ bool "s390 host support (EXPERIMENTAL)"
+ depends on 64BIT && EXPERIMENTAL
+ select S390_SWITCH_AMODE
+ help
+ Select this option if you want to host s390 guest Linux images
endmenu
source "net/Kconfig"
Index: linux-2.6/arch/s390/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/setup.c
+++ linux-2.6/arch/s390/kernel/setup.c
@@ -394,7 +394,11 @@ static int __init early_parse_ipldelay(c
early_param("ipldelay", early_parse_ipldelay);
#ifdef CONFIG_S390_SWITCH_AMODE
+#ifdef CONFIG_S390_HOST
+unsigned int switch_amode = 1;
+#else
unsigned int switch_amode = 0;
+#endif
EXPORT_SYMBOL_GPL(switch_amode);
static void set_amode_and_uaccess(unsigned long user_amode,
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <1177681235.5770.22.camel-WIxn4w2hgUz3YA32ykw5MLlKpX0K8NHHQQ4Iyu8u01E@public.gmane.org>
@ 2007-04-27 15:23 ` Anthony Liguori
[not found] ` <4632155C.8030503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-04-27 16:53 ` Arnd Bergmann
1 sibling, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2007-04-27 15:23 UTC (permalink / raw)
To: Carsten Otte
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
cborntra-tA70FqPdS9bQT0dZR+AlfA,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA
Can you provide a high-level description of how this virtualization
mechanism works? How does MMU virtualization work?
Carsten Otte wrote:
> +
> +#ifndef _ASM_S390_SIE64_H
> +#define _ASM_S390_SIE64_H
> +
> +struct sie_block {
> + atomic_t cpuflags; /* 0x0000 */
> + __u32 prefix; /* 0x0004 */
> + __u32 :32; /* 0x0008 */
> + __u32 :32; /* 0x000c */
> + __u64 :64; /* 0x0010 */
> + __u64 :64; /* 0x0018 */
> + __u64 :64; /* 0x0020 */
> + __u64 cputm; /* 0x0028 */
> + __u64 ckc; /* 0x0030 */
> + __u64 epoch; /* 0x0038 */
> + __u8 svcnn :1, /* 0x0040 */
> + svc1c :1,
> + svc2c :1,
> + svc3c :1,
> + :4;
>
I think bitfields can have weird padding issues.
Regards,
Anthony Liguori
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <4632155C.8030503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2007-04-27 15:45 ` Carsten Otte
0 siblings, 0 replies; 8+ messages in thread
From: Carsten Otte @ 2007-04-27 15:45 UTC (permalink / raw)
To: Anthony Liguori
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
Christian Borntraeger, mschwid2-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
Anthony Liguori wrote:
> Can you provide a high-level description of how this virtualization
> mechanism works?
Very well. First of all, userspace allocates regular memory via
malloc() or mmap()s a file. The user address space will equal the
virtual machine's address space, user space can define an address
offset where the virtual machine starts and can define a maximum size
of the virtual machine. Both are set in each of the SIE control blocks.
Userspace uses open/read/close to read a kernel image into its proper
location.
Next, userspace launches a set of pthreads. Each pthread corresponds
with a virtual CPU, and each pthread calls the sys_s390host_add_cpu
system call.
Now the first CPU enters sys_sie to start interpretive execution via
sys_s390host_sie. The CPU will run the virtual cpu context for a
while and exit again with an intercept code that indicates the reason
for the exit. Now userspace deals with that, and enters
sys_s390host_sie again. Later on, all other CPUs get started via an
interprocessor signal SIGP. Userspace will also receive this as SIE
intercept. Now all pthreads representing the different virtual CPUs
will enter the virtual machine context simultaneously.
Interprocessor signals (need_resched and such) are handled by the
hardware as long as the sender and receiver are both in virtual
context. Otherwise, userspace will get control again.
In the end, when the machine is halted (or panics), all CPUs will
return with an intercept code indicating that execution has ended for
this CPU. Userspace will now call sys_s390host_remove_cpu to free
things up, then userspace will exit().
> How does MMU virtualization work?
s390 does have hardware support for memory management for virtualized
environments. The first patch of this series introduces an extension
to the page table entry, that tracks dirty/reference bits for host and
virtual machine separately. For the guest, our CPU does transparently
merge the real dirty and reference bits stored in a storage key and in
the page table entry extension. The virtual machine can set up its
own page tables inside the box and use dynamic address translation.
This does not require intervention by the host, except that it needs
to do its part of dirty and reference tracking in the extended page
table entry explicitly.
cheers,
Carsten
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <1177681235.5770.22.camel-WIxn4w2hgUz3YA32ykw5MLlKpX0K8NHHQQ4Iyu8u01E@public.gmane.org>
2007-04-27 15:23 ` Anthony Liguori
@ 2007-04-27 16:53 ` Arnd Bergmann
[not found] ` <200704271853.41523.arnd-r2nGTMty4D4@public.gmane.org>
1 sibling, 1 reply; 8+ messages in thread
From: Arnd Bergmann @ 2007-04-27 16:53 UTC (permalink / raw)
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: cborntra-tA70FqPdS9bQT0dZR+AlfA, schwidefsky-tA70FqPdS9bQT0dZR+AlfA
On Friday 27 April 2007, Carsten Otte wrote:
> Add interface which allows a process to start a virtual machine.
Looks pretty good to me already.
It seems a lot closer to lguest than to kvm at the moment concerning
the kernel interfaces (guest real address == host process, state
is attached to the thread_info, ...).
Now for the nitpicking:
> +static DEFINE_MUTEX(s390host_init_mutex);
> +
> +static void s390host_get_data(struct s390host_data *data)
> +{
> + atomic_inc(&data->count);
> +}
> +
> +void s390host_put_data(struct s390host_data *data)
> +{
> + int cpu;
> +
> + if (atomic_dec_return(&data->count))
> + return;
> +
> + for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
> + if (data->sie_io[cpu])
> + free_page((unsigned long)data->sie_io[cpu]);
> +
> + if (data->sca_block)
> + free_page((unsigned long)data->sca_block);
> +
> + kfree(data);
> +}
These should probably use a struct kref instead of an atomic_t
to count the references.
> +static unsigned long
> +s390host_create_io_area(unsigned long addr, unsigned long flags,
> + unsigned long io_addr, struct s390host_data *data)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *vma;
> + unsigned long ret;
> +
> + flags &= MAP_FIXED;
> + addr = get_unmapped_area(NULL, addr, 2 * PAGE_SIZE, 0, flags);
> +
> + if (addr & ~PAGE_MASK)
> + return addr;
> +
> + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
> +
> + if (!vma)
> + return -ENOMEM;
> +
> + vma->vm_mm = mm;
> + vma->vm_start = addr;
> + vma->vm_end = addr + 2 * PAGE_SIZE;
> + vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
> + vma->vm_flags |= VM_SHARED | VM_MAYSHARE | VM_DONTCOPY;
> +
> +#if 1 /* FIXME: write access until sys_s390host_sie interface is final */
> + vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
> +#endif
> +
> + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
> + vma->vm_private_data = data;
> + vma->vm_ops = &s390host_vmops;
> +
> + down_write(&mm->mmap_sem);
> + ret = insert_vm_struct(mm, vma);
> + if (ret) {
> + kmem_cache_free(vm_area_cachep, vma);
> + goto out;
> + }
> + s390host_get_data(data);
> + mm->total_vm += 2;
> + vm_insert_page(vma, addr, virt_to_page(io_addr));
> +
> + ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
> + if (ret)
> + goto out;
> + s390host_get_data(data);
> +
> + vma = find_vma(mm, addr + PAGE_SIZE);
> + vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
> + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
> + vm_insert_page(vma, addr + PAGE_SIZE,
> + virt_to_page(io_addr + PAGE_SIZE));
> + ret = addr;
> +out:
> + up_write(&mm->mmap_sem);
> + return ret;
> +}
This open-coded mmap looks a bit scary. I can't see anything wrong in it,
but it may conflict with a user application that tries to do other strange
things with memory maps.
Maybe you can either make this a real call to mmap(), or have it automatically
mapped using the vdso mechanism.
> +long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
> + struct sie_block __user *sie_template)
> +{
> + struct sie_block *sie_block;
> + struct sie_io *sie_io;
> + struct sca_block *sca_block;
> + struct s390host_data *data = NULL;
> + unsigned long ret;
> + __u16 cpu;
__u16 is a type for user space interfaces. In the kernel, use u16.
> + if (current_thread_info()->sie_cpu != -1)
> + return -EINVAL;
-EBUSY?
> + if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
> + return -EFAULT;
get_user()?
> + if (cpu >= S390HOST_MAX_CPUS)
> + return -EINVAL;
> +
> + mutex_lock(&s390host_init_mutex);
> +
> + data = get_s390host_context();
> + if (!data) {
> + ret = -ENOMEM;
> + goto out_err;
> + }
> +
> + sca_block = data->sca_block;
> + if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {
__test_bit()?
> + ret = -EINVAL;
-EBUSY?
> +int sys_s390host_remove_cpu(void)
> +{
> + struct sca_block *sca_block;
> + int cpu;
> +
> + cpu = current_thread_info()->sie_cpu;
> + if (cpu == -1)
> + return -EINVAL;
> +
> + mutex_lock(&s390host_init_mutex);
> + sca_block = current_thread_info()->s390host_data->sca_block;
> + sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
> + current_thread_info()->sie_cpu = -1;
> + mutex_unlock(&s390host_init_mutex);
> + return 0;
> +}
Shouldn't this free the resources that were allocated by add_cpu?
If not, this syscall seems pretty useless and I'd simply not
do it at all -- just wait until exit() to clean up.
> + sie_io = current_thread_info()->s390host_data->sie_io[cpu];
> +
> + if (action)
> + ret = s390host_do_action(action, sie_io);
> + if (ret)
> + goto out_err;
> + sie_kernel = &sie_io->sie_kernel;
> + sie_user = &sie_io->sie_user;
> +
> + save_fp_regs(&sie_kernel->host_fpregs);
> + save_access_regs(sie_kernel->host_acrs);
> + sie_user->guest_fpregs.fpc &= FPC_VALID_MASK;
> + restore_fp_regs(&sie_user->guest_fpregs);
> + restore_access_regs(sie_user->guest_acrs);
> + memcpy(&sie_kernel->sie_block.gg14, &sie_user->gprs[14], 16);
> +again:
> + if (need_resched())
> + schedule();
> +
> + sie_kernel->sie_block.icptcode = 0;
> + ret = sie64a(sie_kernel);
> + if (ret)
> + goto out;
> +
> + if (signal_pending(current)) {
> + ret = -EINTR;
> + goto out;
> + }
Can't you handle interrupted system calls? I think it would be nicer
to return -ERESTARTSYS here so you don't have to go back to your
user space after a signal.
> +struct sie_kernel {
> + struct sie_block sie_block;
> + s390_fp_regs host_fpregs;
> + int host_acrs[NUM_ACRS];
> +} __attribute__((packed,aligned(4096)));
Why packed?
> +#define SIE_UPDATE_PSW (1UL << 0)
> +#define SIE_FLUSH_TLB (1UL << 1)
> +#define SIE_ISKE (1UL << 2)
> +#define SIE_SSKE (1UL << 3)
> +#define SIE_BLOCK_UPDATE (1UL << 4)
> +#define SIE_VSMXM_LOCAL_UPDATE (1UL << 5)
> +#define SIE_VSMXM_DIST_UPDATE (1UL << 6)
I find it much more readable to define these like
#define SIE_UPDATE_PSW 1
#define SIE_FLUSH_TLB 2
#define SIE_ISKE 4
especially when trying to interpret values in memory.
> +struct sie_user {
> + struct sie_block sie_block;
> + psw_t psw;
> + unsigned long gprs[NUM_GPRS];
> + s390_fp_regs guest_fpregs;
> + int guest_acrs[NUM_ACRS];
> + struct sie_skey_parm iske_parm;
> + struct sie_skey_parm sske_parm;
> + int vsmxm_or_local;
> + int vsmxm_and_local;
> + int vsmxm_or;
> + int vsmxm_and;
> + int vsmxm_cpuid;
> +} __attribute__((packed,aligned(4096)));
Is this data structure extensible? If it is, you probably need
some sort of versioning information to make sure that user space
doesn't rely on fields that the kernel doesn't know about.
> +struct sca_entry {
> + atomic_t scn;
> + __u64 reserved;
> + __u64 sda;
> + __u64 reserved2[2];
> +}__attribute__((packed));
Is this a hardware data structure? If not, you shouldn't pack it
because the first word is only 32 bits and consequently all following
ones are unaligned.
> +/* function definitions */
> +extern int sie64a(struct sie_kernel *);
> +extern void s390host_put_data(struct s390host_data *);
These should be inside of '#ifdef __KERNEL__', like all declarations that user
space doesn't need to see.
> @@ -52,6 +53,8 @@ struct thread_info {
> unsigned int cpu; /* current CPU */
> int preempt_count; /* 0 => preemptable, <0 => BUG */
> struct restart_block restart_block;
> + struct s390host_data *s390host_data; /* s390host data */
> + int sie_cpu; /* sie cpu number */
> };
What happens to these variables on execve?
Arnd <><
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <200704271853.41523.arnd-r2nGTMty4D4@public.gmane.org>
@ 2007-04-29 8:00 ` Heiko Carstens
[not found] ` <20070429080039.GA8332-5VkHqLvV2o3MbYB6QlFGEg@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Heiko Carstens @ 2007-04-29 8:00 UTC (permalink / raw)
To: Arnd Bergmann
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
cborntra-tA70FqPdS9bQT0dZR+AlfA,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA
On Fri, Apr 27, 2007 at 06:53:40PM +0200, Arnd Bergmann wrote:
> On Friday 27 April 2007, Carsten Otte wrote:
> > Add interface which allows a process to start a virtual machine.
>
> Looks pretty good to me already.
>
> It seems a lot closer to lguest than to kvm at the moment concerning
> the kernel interfaces (guest real address == host process, state
> is attached to the thread_info, ...).
Guess I should take a look at lguest then :)
> > +static DEFINE_MUTEX(s390host_init_mutex);
> > +
> > +static void s390host_get_data(struct s390host_data *data)
> > +{
> > + atomic_inc(&data->count);
> > +}
> > +
> > +void s390host_put_data(struct s390host_data *data)
> > +{
> > + int cpu;
> > +
> > + if (atomic_dec_return(&data->count))
> > + return;
>
> These should probably use a struct kref instead of an atomic_t
> to count the references.
That makes sense.
>
> > +static unsigned long
> > +s390host_create_io_area(unsigned long addr, unsigned long flags,
> > + unsigned long io_addr, struct s390host_data *data)
> > +{
[...]
> > + down_write(&mm->mmap_sem);
> > + ret = insert_vm_struct(mm, vma);
> > + if (ret) {
> > + kmem_cache_free(vm_area_cachep, vma);
> > + goto out;
> > + }
> > + s390host_get_data(data);
> > + mm->total_vm += 2;
> > + vm_insert_page(vma, addr, virt_to_page(io_addr));
> > +
> > + ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
> > + if (ret)
> > + goto out;
>
> This open-coded mmap looks a bit scary. I can't see anything wrong in it,
> but it may conflict with a user application that tries to do other strange
> things with memory maps.
Well, there is at least already one bug in it. If the call to split_vma fails
there is no cleanup of the vm_insert_page that has already happened.
> Maybe you can either make this a real call to mmap(), or have it automatically
> mapped using the vdso mechanism.
So, yes we need to find something better, since this is the code that I
don't like at all.
> > + __u16 cpu;
>
> __u16 is a type for user space interfaces. In the kernel, use u16.
Yes.
> > + if (current_thread_info()->sie_cpu != -1)
> > + return -EINVAL;
>
> -EBUSY?
Yes.
> > + if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
> > + return -EFAULT;
>
> get_user()?
Yes.
> > + sca_block = data->sca_block;
> > + if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {
>
> __test_bit()?
Yes, but I think we should probably add some function to access the mcn
bits. Each access looks a bit complicated.
> > + ret = -EINVAL;
>
> -EBUSY?
Yes.
> > +int sys_s390host_remove_cpu(void)
> > +{
> > + struct sca_block *sca_block;
> > + int cpu;
> > +
> > + cpu = current_thread_info()->sie_cpu;
> > + if (cpu == -1)
> > + return -EINVAL;
> > +
> > + mutex_lock(&s390host_init_mutex);
> > + sca_block = current_thread_info()->s390host_data->sca_block;
> > + sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
> > + current_thread_info()->sie_cpu = -1;
> > + mutex_unlock(&s390host_init_mutex);
> > + return 0;
> > +}
>
> Shouldn't this free the resources that were allocated by add_cpu?
Yes, it was supposed to do that. But since it was only a prototype I was
too lazy to code it up and so I decided to free things on exit.
> If not, this syscall seems pretty useless and I'd simply not
> do it at all -- just wait until exit() to clean up.
This reveals that we are not clearing the mcn bit on exit without
prior call to this syscall. BUG().
> > + if (signal_pending(current)) {
> > + ret = -EINTR;
> > + goto out;
> > + }
>
> Can't you handle interrupted system calls? I think it would be nicer
> to return -ERESTARTSYS here so you don't have to go back to your
> user space after a signal.
That should work and it is better.
> > +struct sie_kernel {
> > + struct sie_block sie_block;
> > + s390_fp_regs host_fpregs;
> > + int host_acrs[NUM_ACRS];
> > +} __attribute__((packed,aligned(4096)));
>
> Why packed?
Not needed.
> > +#define SIE_UPDATE_PSW (1UL << 0)
> > +#define SIE_FLUSH_TLB (1UL << 1)
> > +#define SIE_ISKE (1UL << 2)
> > +#define SIE_SSKE (1UL << 3)
> > +#define SIE_BLOCK_UPDATE (1UL << 4)
> > +#define SIE_VSMXM_LOCAL_UPDATE (1UL << 5)
> > +#define SIE_VSMXM_DIST_UPDATE (1UL << 6)
>
> I find it much more readable to define these like
>
> #define SIE_UPDATE_PSW 1
> #define SIE_FLUSH_TLB 2
> #define SIE_ISKE 4
>
> especially when trying to interpret values in memory.
Hmm... don't know. Both make sense, but I have no preference on this.
> > +struct sie_user {
> > + struct sie_block sie_block;
> > + psw_t psw;
> > + unsigned long gprs[NUM_GPRS];
> > + s390_fp_regs guest_fpregs;
> > + int guest_acrs[NUM_ACRS];
> > + struct sie_skey_parm iske_parm;
> > + struct sie_skey_parm sske_parm;
> > + int vsmxm_or_local;
> > + int vsmxm_and_local;
> > + int vsmxm_or;
> > + int vsmxm_and;
> > + int vsmxm_cpuid;
> > +} __attribute__((packed,aligned(4096)));
>
> Is this data structure extensible? If it is, you probably need
> some sort of versioning information to make sure that user space
> doesn't rely on fields that the kernel doesn't know about.
I don't think we can put in some versioning information here. If
the kernel decides to increase the version then old userspace
code would break?
We rather need some mechanism so userspace can ask the kernel
"do you support feature x?" and dependent on the answer some
fields are used or unused.
> > +struct sca_entry {
> > + atomic_t scn;
> > + __u64 reserved;
> > + __u64 sda;
> > + __u64 reserved2[2];
> > +}__attribute__((packed));
>
> Is this a hardware data structure? If not, you shouldn't pack it
> because the first word is only 32 bits and consequently all following
> ones are unaligned.
It's a hardware structure. Guess the unaligned u64s were generated when
this structure was extended to support 64bit.
But of course we shouldn't put an atomic_t in a hardware structure, since
its implementation might change.
> > +/* function definitions */
> > +extern int sie64a(struct sie_kernel *);
> > +extern void s390host_put_data(struct s390host_data *);
>
> These should be inside of '#ifdef __KERNEL__', like all declarations that user
> space doesn't need to see.
Yes, of course.
>
> > @@ -52,6 +53,8 @@ struct thread_info {
> > unsigned int cpu; /* current CPU */
> > int preempt_count; /* 0 => preemptable, <0 => BUG */
> > struct restart_block restart_block;
> > + struct s390host_data *s390host_data; /* s390host data */
> > + int sie_cpu; /* sie cpu number */
> > };
>
> What happens to these variables on execve?
They stay... We probably should do some cleanup here e.g. by adding some
code to init_new_context().
Thanks for reviewing!
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <20070429080039.GA8332-5VkHqLvV2o3MbYB6QlFGEg@public.gmane.org>
@ 2007-05-01 21:12 ` Arnd Bergmann
[not found] ` <200705012312.11692.arnd-r2nGTMty4D4@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Arnd Bergmann @ 2007-05-01 21:12 UTC (permalink / raw)
To: Heiko Carstens
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
cborntra-tA70FqPdS9bQT0dZR+AlfA,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA
On Sunday 29 April 2007, Heiko Carstens wrote:
> > Is this data structure extensible? If it is, you probably need
> > some sort of versioning information to make sure that user space
> > doesn't rely on fields that the kernel doesn't know about.
>
> I don't think we can put in some versioning information here. If
> the kernel decides to increase the version then old userspace
> code would break?
> We rather need some mechanism so userpace can ask the kernel
> "do you support feature x?" and dependent on the answer some
> fields are used or unused.
You could do it the way that ext2 handles compatible and incompatible
features in the on-disk layout:
Assign a number of bits in the read-only part of the mapping to flags
that the user application can test. A bit in the compatible range means
that a feature is available to the user application if it wants to
use it. A bit in the incompatible range means that the user space needs
to understand how to use a feature in order to run correctly.
> > > +struct sca_entry {
> > > + atomic_t scn;
> > > + __u64 reserved;
> > > + __u64 sda;
> > > + __u64 reserved2[2];
> > > +}__attribute__((packed));
> >
> > Is this a hardware data structure? If not, you shouldn't pack it
> > because the first word is only 32 bits and consequently all following
> > ones are unaligned.
>
> It's a hardware structure. Guess the unaligned u64s were generated when
> this structure was extended to support 64bit.
> But of course we shouldn't put an atomic_t in a hardware structure, since
> it's implementation might change.
Not sure about that point. If you need to do atomic operations on the
first 32 bits, you shouldn't need to invent your own abstractions for
those, and it's highly unlikely that the implementation of atomic_t changes.
Arnd <><
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <200705012312.11692.arnd-r2nGTMty4D4@public.gmane.org>
@ 2007-05-02 4:46 ` Avi Kivity
2007-05-02 8:25 ` Christoph Hellwig
1 sibling, 0 replies; 8+ messages in thread
From: Avi Kivity @ 2007-05-02 4:46 UTC (permalink / raw)
To: Arnd Bergmann
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
cborntra-tA70FqPdS9bQT0dZR+AlfA,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA
Arnd Bergmann wrote:
> On Sunday 29 April 2007, Heiko Carstens wrote:
>
>
>>> Is this data structure extensible? If it is, you probably need
>>> some sort of versioning information to make sure that user space
>>> doesn't rely on fields that the kernel doesn't know about.
>>>
>> I don't think we can put in some versioning information here. If
>> the kernel decides to increase the version then old userspace
>> code would break?
>> We rather need some mechanism so userpace can ask the kernel
>> "do you support feature x?" and dependent on the answer some
>> fields are used or unused.
>>
>
> You could do it the way that ext2 handles compatible and incompatible
> features in the on-disk layout:
>
> Assign a number of bits in the read-only part of the mapping to flags
> that the user application can test. A bit in the compatible range mean
> that a feature is available to the user application if it wants to
> use it. A bit in the incompatible range means that the user space needs
> to understand how to use a feature in order to run correctly.
>
The current ioctl() interface has a feature test call, basically you
send down a feature number and see if it's supported or not. Fairly
similar to a feature bit except it isn't limited in the number of bits.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH/RFC 2/2] s390 virtualization interface.
[not found] ` <200705012312.11692.arnd-r2nGTMty4D4@public.gmane.org>
2007-05-02 4:46 ` Avi Kivity
@ 2007-05-02 8:25 ` Christoph Hellwig
1 sibling, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2007-05-02 8:25 UTC (permalink / raw)
To: Arnd Bergmann
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
cborntra-tA70FqPdS9bQT0dZR+AlfA,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA
On Tue, May 01, 2007 at 11:12:11PM +0200, Arnd Bergmann wrote:
> Not sure about that point. If you need to do atomic operations on the
> first 32 bits, you shouldn't need to invent your own abstractions for
> those, and it's highly unlikely that the implementation of atomic_t changes.
I disagree. Using an atomic_t in a hardware structure is against all
the abstractions we've built. It's much better to have separate macros
to atomically modify a word in this hardware structure, even if they end
up exactly the same as the atomic_ macros - at least this way we clearly
document their intent.
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2007-05-02 8:25 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-27 13:40 [PATCH/RFC 2/2] s390 virtualization interface Carsten Otte
[not found] ` <1177681235.5770.22.camel-WIxn4w2hgUz3YA32ykw5MLlKpX0K8NHHQQ4Iyu8u01E@public.gmane.org>
2007-04-27 15:23 ` Anthony Liguori
[not found] ` <4632155C.8030503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-04-27 15:45 ` Carsten Otte
2007-04-27 16:53 ` Arnd Bergmann
[not found] ` <200704271853.41523.arnd-r2nGTMty4D4@public.gmane.org>
2007-04-29 8:00 ` Heiko Carstens
[not found] ` <20070429080039.GA8332-5VkHqLvV2o3MbYB6QlFGEg@public.gmane.org>
2007-05-01 21:12 ` Arnd Bergmann
[not found] ` <200705012312.11692.arnd-r2nGTMty4D4@public.gmane.org>
2007-05-02 4:46 ` Avi Kivity
2007-05-02 8:25 ` Christoph Hellwig
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.