All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi
@ 2009-01-11  9:14 Shin-ichiro KAWASAKI
  2009-01-11  9:16 ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Shin-ichiro KAWASAKI
  2009-01-11 14:22 ` [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi Edgar E. Iglesias
  0 siblings, 2 replies; 9+ messages in thread
From: Shin-ichiro KAWASAKI @ 2009-01-11  9:14 UTC (permalink / raw)
  To: qemu-devel

Current sh4's "movca.l" instruction is implemented in a same way
as "mov.l". Then, write to memory cannot be canceled by "ocbi"
cache control instrunction.  This makes text area broken on
cache flush by linux kernel.

This patch delays "movca.l" execution and provide the chance to 
cancel it for "ocbi".
# Thank you Edgar, for your advice!

This patch does,
- on executing "movca.l", just records what and where movca 
  should store data.
- on executing "ocbi", find the corresponding record by movca,
  and delete it to cancel.
- lets TCG produce "delayed_movca" instruction at the first
  instruction which is neither "movca.l" nor "ocbi".
- on executing "delayed_movca", does the data store task,
  according to the record. 


Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>

Index: trunk/target-sh4/helper.h
===================================================================
--- trunk/target-sh4/helper.h	(revision 6133)
+++ trunk/target-sh4/helper.h	(working copy)
@@ -45,4 +45,8 @@
 DEF_HELPER_1(ftrc_FT, i32, i32)
 DEF_HELPER_1(ftrc_DT, i32, i64)
 
+DEF_HELPER_2(movca, void, i32, i32)
+DEF_HELPER_1(ocbi, void, i32)
+DEF_HELPER_0(delayed_movca, void)
+
 #include "def-helper.h"
Index: trunk/target-sh4/cpu.h
===================================================================
--- trunk/target-sh4/cpu.h	(revision 6133)
+++ trunk/target-sh4/cpu.h	(working copy)
@@ -70,6 +70,7 @@
  * It is unclear if it is permitted to modify the SR_T flag in a delay slot.
  * The use of DELAY_SLOT_TRUE flag makes us accept such SR_T modification.
  */
+#define MOVCA_DELAYED          (1 << 4)  /* used in translation context */
 
 /* XXXXX The structure could be made more compact */
 typedef struct tlb_t {
@@ -91,6 +92,13 @@
 #define UTLB_SIZE 64
 #define ITLB_SIZE 4
 
+typedef struct delayed_movca_t {
+    uint32_t valid;
+    uint32_t value;
+    uint32_t addr;
+    struct delayed_movca_t * next;
+} delayed_movca_t;
+
 #define NB_MMU_MODES 2
 
 enum sh_features {
@@ -143,6 +151,7 @@
     tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
     void *intc_handle;
     int intr_at_halt;		/* SR_BL ignored during sleep */
+    delayed_movca_t movca_list;
 } CPUSH4State;
 
 CPUSH4State *cpu_sh4_init(const char *cpu_model);
Index: trunk/target-sh4/op_helper.c
===================================================================
--- trunk/target-sh4/op_helper.c	(revision 6133)
+++ trunk/target-sh4/op_helper.c	(working copy)
@@ -20,6 +20,7 @@
 #include <assert.h>
 #include "exec.h"
 #include "helper.h"
+void *qemu_mallocz(size_t size);
 
 #ifndef CONFIG_USER_ONLY
 
@@ -604,3 +605,57 @@
     d.ll = t0;
     return float64_to_int32_round_to_zero(d.d, &env->fp_status);
 }
+
+void helper_movca(uint32_t val, uint32_t addr)
+{
+    delayed_movca_t *cur = &env->movca_list;
+    delayed_movca_t *prev = NULL;
+    while (cur) {
+	if (!cur->valid) {
+	    cur->valid = 1;
+	    cur->value = val;
+	    cur->addr = addr;
+	    return;
+	}
+	prev = cur;
+	cur = cur->next;
+    }
+
+    /* movca entry shortage. allocate it. */
+    prev->next = cur = qemu_mallocz(sizeof(delayed_movca_t));
+    if (cur == NULL) {
+	printf("out of memory for delayed movca. @%08x\n", addr);
+	return;
+    }
+    cur->valid = 1;
+    cur->value = val;
+    cur->addr = addr;
+}
+
+void helper_ocbi(uint32_t addr)
+{
+    delayed_movca_t *cur = &env->movca_list;
+
+    while (cur) {
+	if (cur->valid && cur->addr == addr) { /* found! */
+	    cur->valid = 0;
+	    return;
+	}
+	cur = cur->next;
+    }
+
+    printf("invalid ocbi for address @%08x  pc=%08x\n", addr, env->pc);
+}
+
+void helper_delayed_movca(void)
+{
+    delayed_movca_t *cur = &env->movca_list;
+
+    /* Execute delayed movca tasks. */
+    while (cur) {
+	if (cur->valid)
+	    __stl_mmu(cur->addr, cur->value, (env->sr & SR_MD) ? 1 : 0);
+	cur->valid = 0;
+	cur = cur->next;
+    }
+}
Index: trunk/target-sh4/translate.c
===================================================================
--- trunk/target-sh4/translate.c	(revision 6133)
+++ trunk/target-sh4/translate.c	(working copy)
@@ -1533,7 +1533,17 @@
 	}
 	return;
     case 0x00c3:		/* movca.l R0,@Rm */
-	tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
+	/*
+	 * movca.l is used in two ways.
+	 *  [1] inhibit memory fetch for faster block transfer.
+	 *  [2] flush cache line, being used with ocbi.
+	 * For case [1], movca.l work is same as mov.l.
+	 * For case [2], movca.l work is same as nop.
+	 * Then we need to delay the execution like mov.l to provide
+	 * the chance for ocbi to cancel it.
+	 */
+	ctx->flags |= MOVCA_DELAYED;
+	gen_helper_movca(REG(0), REG(B11_8));
 	return;
     case 0x40a9:
 	/* MOVUA.L @Rm,R0 (Rm) -> R0
@@ -1550,11 +1560,7 @@
 	tcg_gen_andi_i32(REG(B11_8), cpu_sr, SR_T);
 	return;
     case 0x0093:		/* ocbi @Rn */
-	{
-	    TCGv dummy = tcg_temp_new();
-	    tcg_gen_qemu_ld32s(dummy, REG(B11_8), ctx->memidx);
-	    tcg_temp_free(dummy);
-	}
+	gen_helper_ocbi(REG(B11_8));
 	return;
     case 0x00a3:		/* ocbp @Rn */
 	{
@@ -1781,8 +1787,29 @@
 {
     uint32_t old_flags = ctx->flags;
 
+    /* pre generation */
+    if (ctx->flags & MOVCA_DELAYED) {
+	switch (ctx->opcode & 0xf0ff) {
+	case 0x00c3: /* movca.l */
+	case 0x0093: /* ocbi */
+	    /*
+	     * Do not generate delayed movca for these instrunctions to
+	     * provide ocbi to cancel the execution of delayed movca.l.
+	     * This implementation assumes that movca.l and ocbi executed
+	     * consequently to flush cache.
+	     */
+	    break;
+	default:
+	    /* generate delayed movca. */
+	    ctx->flags &= ~MOVCA_DELAYED;
+	    gen_helper_delayed_movca();
+	    break;
+	}
+    }
+
     _decode_opc(ctx);
 
+    /* post generation */
     if (old_flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) {
         if (ctx->flags & DELAY_SLOT_CLEARME) {
             gen_store_flags(0);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Qemu-devel] [PATCH 3/3] sh: SCI improvements
@ 2009-01-11  9:16 ` Shin-ichiro KAWASAKI
  2009-01-11 15:30   ` Jean-Christophe PLAGNIOL-VILLARD
                     ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Shin-ichiro KAWASAKI @ 2009-01-11  9:16 UTC (permalink / raw)
  To: qemu-devel

SE7750 uses SCI := Serial Communication Interface for one of consoles.
This patch completes the SCI implementation, and makes SCI available
for a console.

# Tabs and spaces are mixed in "hw/sh_serial.c" so much.
# Some clean up might be useful.


Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>

Index: trunk/hw/sh_serial.c
===================================================================
--- trunk/hw/sh_serial.c	(revision 6133)
+++ trunk/hw/sh_serial.c	(working copy)
@@ -96,7 +96,7 @@
         s->scr = val & ((s->feat & SH_SERIAL_FEAT_SCIF) ? 0xfa : 0xff);
         if (!(val & (1 << 5)))
             s->flags |= SH_SERIAL_FLAG_TEND;
-        if ((s->feat & SH_SERIAL_FEAT_SCIF) && s->txi) {
+        if (s->txi) {
 	    qemu_set_irq(s->txi, val & (1 << 7));
         }
         if (!(val & (1 << 6))) {
@@ -109,13 +109,15 @@
             qemu_chr_write(s->chr, &ch, 1);
 	}
 	s->dr = val;
-	s->flags &= ~SH_SERIAL_FLAG_TDE;
+        if (s->feat & SH_SERIAL_FEAT_SCIF)
+            s->flags &= ~SH_SERIAL_FLAG_TDE;
+        else
+            s->flags |= SH_SERIAL_FLAG_TDE;
         return;
-#if 0
     case 0x14: /* FRDR / RDR */
-        ret = 0;
+        /* do nothing */
+        return;
         break;
-#endif
     }
     if (s->feat & SH_SERIAL_FEAT_SCIF) {
         switch(offs) {
@@ -165,17 +167,33 @@
         case 0x24: /* LSR */
             return;
         }
-    }
-    else {
+    } else { /* SCI */
         switch(offs) {
-#if 0
-        case 0x0c:
-            ret = s->dr;
-            break;
-        case 0x10:
-            ret = 0;
-            break;
-#endif
+        case 0x0c: /* TDR */
+            if (s->chr) {
+                ch = val;
+                qemu_chr_write(s->chr, &ch, 1);
+            }
+            s->dr = val;
+            s->flags |= SH_SERIAL_FLAG_TDE | SH_SERIAL_FLAG_TEND;
+            if (s->scr & (1 << 7) && s->txi) {
+                qemu_set_irq(s->txi, 1);
+            }
+            return;
+        case 0x10: /* SSR */
+            /*
+             * Ignore TDRE (1 << 7) bit because TDR is always
+             * writable in this SCI emulation.
+             */
+            if (!(val & (1 << 6))) {
+                s->flags &= ~SH_SERIAL_FLAG_RDF;
+            }
+            if (!(val & (1 << 6))) {
+                if (s->rxi) {
+                    qemu_set_irq(s->rxi, 0);
+                }
+            }
+            return;
         case 0x1c:
             s->sptr = val & 0x8f;
             return;
@@ -191,30 +209,19 @@
     sh_serial_state *s = opaque;
     uint32_t ret = ~0;
 
-#if 0
     switch(offs) {
-    case 0x00:
+    case 0x00: /* SMR */
         ret = s->smr;
         break;
-    case 0x04:
+    case 0x04: /* BRR */
         ret = s->brr;
 	break;
-    case 0x08:
+    case 0x08: /* SCR */
         ret = s->scr;
         break;
-    case 0x14:
-        ret = 0;
-        break;
     }
-#endif
     if (s->feat & SH_SERIAL_FEAT_SCIF) {
         switch(offs) {
-        case 0x00: /* SMR */
-            ret = s->smr;
-            break;
-        case 0x08: /* SCR */
-            ret = s->scr;
-            break;
         case 0x10: /* FSR */
             ret = 0;
             if (s->flags & SH_SERIAL_FLAG_TEND)
@@ -242,11 +249,9 @@
                     s->flags &= ~SH_SERIAL_FLAG_RDF;
             }
             break;
-#if 0
         case 0x18:
             ret = s->fcr;
             break;
-#endif
         case 0x1c:
             ret = s->rx_cnt;
             break;
@@ -257,21 +262,26 @@
             ret = 0;
             break;
         }
-    }
-    else {
+    } else {
         switch(offs) {
-#if 0
-        case 0x0c:
+        case 0x0c: /* TDR */
             ret = s->dr;
             break;
-        case 0x10:
+        case 0x10: /* SSR */
             ret = 0;
+            if (s->flags & SH_SERIAL_FLAG_TDE)
+                ret |= (1 << 7);
+            if (s->flags & SH_SERIAL_FLAG_RDF)
+                ret |= (1 << 6);
+            if (s->flags & SH_SERIAL_FLAG_TEND)
+                ret |= (1 << 2);
+            /* TODO : handle bit MPBT */
             break;
-        case 0x14:
+        case 0x14: /* RDR */
             ret = s->rx_fifo[0];
+            s->flags &= ~SH_SERIAL_FLAG_RDF;
             break;
-#endif
-        case 0x1c:
+        case 0x1c: /* SPTR */
             ret = s->sptr;
             break;
         }
@@ -311,6 +321,10 @@
         }
     } else {
         s->rx_fifo[0] = ch;
+        s->flags |= SH_SERIAL_FLAG_RDF;
+        if (s->scr & (1 << 6) && s->rxi) {
+            qemu_set_irq(s->rxi, 1);
+        }
     }
 }
 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi
  2009-01-11  9:14 [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi Shin-ichiro KAWASAKI
  2009-01-11  9:16 ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Shin-ichiro KAWASAKI
@ 2009-01-11 14:22 ` Edgar E. Iglesias
  2009-01-12  5:13   ` Shin-ichiro KAWASAKI
  1 sibling, 1 reply; 9+ messages in thread
From: Edgar E. Iglesias @ 2009-01-11 14:22 UTC (permalink / raw)
  To: Shin-ichiro KAWASAKI; +Cc: qemu-devel

On Sun, Jan 11, 2009 at 06:14:40PM +0900, Shin-ichiro KAWASAKI wrote:
> Current sh4's "movca.l" instruction is implemented in a same way
> as "mov.l". Then, write to memory cannot be canceled by "ocbi"
> cache control instrunction.  This makes text area broken on
> cache flush by linux kernel.
> 
> This patch delays "movca.l" execution and provide the chance to 
> cancel it for "ocbi".
> # Thank you Edgar, for your advice!
> 
> This patch does,
> - on executing "movca.l", just records what and where movca 
>   should store data.
> - on executing "ocbi", find the corresponding record by movca,
>   and delete it to cancel.
> - lets TCG produce "delayed_movca" instruction at the first
>   instruction which is neither "movca.l" nor "ocbi".
> - on executing "delayed_movca", does the data store task,
>   according to the record. 
> 

Hello,

I think your patch will catch the linux dflush sequence but I've got a
few comments.

There is a complication with the delayed stores and potential mmu
exceptions that they might generate. I think with this approach
you'll take those faults at the next instruction which might not
even be a load/store. Not sure if that can cause problems for sh.
Maybe you could avoid that by making movca a load + store and the
ocbi a store with the original value loaded at movca. But you'll
still get into trouble with exceptions that might break the
movca/ocbi sequence.

I don't it's worth having a dynamically sized movca delay buffer.
The movca insn has a restricted addressing mode making it a bit hard
to use back-to-back without alu insns in between. IIU your code
correctly you flush the buffer before every non movca/ocbi insn so I
guess that for most realistic use-cases you'll only need very few
(maybe only 1) entries in the delay buffer.

> Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>
>
> Index: trunk/target-sh4/op_helper.c
> ===================================================================
> --- trunk/target-sh4/op_helper.c	(revision 6133)
> +++ trunk/target-sh4/op_helper.c	(working copy)
> @@ -20,6 +20,7 @@
>  #include <assert.h>
>  #include "exec.h"
>  #include "helper.h"
> +void *qemu_mallocz(size_t size);
>  
>  #ifndef CONFIG_USER_ONLY
>  
> @@ -604,3 +605,57 @@
>      d.ll = t0;
>      return float64_to_int32_round_to_zero(d.d, &env->fp_status);
>  }
> +
> +void helper_movca(uint32_t val, uint32_t addr)
> +{
> +    delayed_movca_t *cur = &env->movca_list;
> +    delayed_movca_t *prev = NULL;
> +    while (cur) {
> +	if (!cur->valid) {

Might be overkill but you can merge here, i.e:

	if (!cur->valid || cur->addr == addr) {


> +	    cur->valid = 1;
> +	    cur->value = val;
> +	    cur->addr = addr;
> +	    return;
> +	}
> +	prev = cur;
> +	cur = cur->next;
> +    }
> +
> +    /* movca entry shortage. allocate it. */
> +    prev->next = cur = qemu_mallocz(sizeof(delayed_movca_t));
> +    if (cur == NULL) {
> +	printf("out of memory for delayed movca. @%08x\n", addr);
> +	return;
> +    }
> +    cur->valid = 1;
> +    cur->value = val;
> +    cur->addr = addr;
> +}
> +
> +void helper_ocbi(uint32_t addr)
> +{
> +    delayed_movca_t *cur = &env->movca_list;
> +
> +    while (cur) {
> +	if (cur->valid && cur->addr == addr) { /* found! */
> +	    cur->valid = 0;
> +	    return;
> +	}
> +	cur = cur->next;
> +    }

I think you can quite easy catch a few more movca use cases if you
dont compare line offsets, i.e for sh4 you'd mask the addr & ~(32 - 1).
Also, you should not return until you've scanned the entire movca delay
buffer as there might be multiple stores to the same line that need to be
ignored.

example:
    addr &= ~31;
    while (cur) {
       if (cur->valid && (cur->addr & ~31) == addr)
           cur->valid = 0;
       cur = cur->next;
    }

Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] sh: SCI improvements
  2009-01-11  9:16 ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Shin-ichiro KAWASAKI
@ 2009-01-11 15:30   ` Jean-Christophe PLAGNIOL-VILLARD
  2009-01-11 17:02   ` [Qemu-devel] [PATCH] sh/serial: allow cpu regs definition Jean-Christophe PLAGNIOL-VILLARD
  2009-01-12  6:42   ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Jean-Christophe PLAGNIOL-VILLARD
  2 siblings, 0 replies; 9+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2009-01-11 15:30 UTC (permalink / raw)
  To: Shin-ichiro KAWASAKI; +Cc: qemu-devel

On 18:16 Sun 11 Jan     , Shin-ichiro KAWASAKI wrote:
> SE7750 uses SCI := Serial Communication Interface for one of consoles.
> This patch completes the SCI implementation, and makes SCI available
> for a console.
> 
> # Tabs and spaces are mixed in "hw/sh_serial.c" so much.
> # Some clean up might be useful.
I've test it also under u-boot the ouput work but the input steel not working

Best Regards,
J.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Qemu-devel] [PATCH] sh/serial: allow cpu regs definition
  2009-01-11  9:16 ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Shin-ichiro KAWASAKI
  2009-01-11 15:30   ` Jean-Christophe PLAGNIOL-VILLARD
@ 2009-01-11 17:02   ` Jean-Christophe PLAGNIOL-VILLARD
  2009-01-12  6:42   ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Jean-Christophe PLAGNIOL-VILLARD
  2 siblings, 0 replies; 9+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2009-01-11 17:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: Jean-Christophe PLAGNIOL-VILLARD

sh_serial_regs will allow to define the SCI/SCIF cpu regs
need to add SH cpu other than sh7750

Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
---
apply after  sh: SCI improvements patch from Kawasaki-san

Best Regards,
J.
 hw/sh.h        |    2 +
 hw/sh7750.c    |   18 +++++++++++-
 hw/sh_serial.c |   86 ++++++++++++++++++++++++++-----------------------------
 hw/sh_serial.h |   41 ++++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 46 deletions(-)
 create mode 100644 hw/sh_serial.h

diff --git a/hw/sh.h b/hw/sh.h
index 5e3c22b..c92fa8c 100644
--- a/hw/sh.h
+++ b/hw/sh.h
@@ -3,6 +3,7 @@
 /* Definitions for SH board emulation.  */
 
 #include "sh_intc.h"
+#include "sh_serial.h"
 
 #define A7ADDR(x) ((x) & 0x1fffffff)
 #define P4ADDR(x) ((x) | 0xe0000000)
@@ -38,6 +39,7 @@ void tmu012_init(target_phys_addr_t base, int feat, uint32_t freq,
 /* sh_serial.c */
 #define SH_SERIAL_FEAT_SCIF (1 << 0)
 void sh_serial_init (target_phys_addr_t base, int feat,
+		     sh_serial_regs *regs,
 		     uint32_t freq, CharDriverState *chr,
 		     qemu_irq eri_source,
 		     qemu_irq rxi_source,
diff --git a/hw/sh7750.c b/hw/sh7750.c
index c8e3070..7b09ae8 100644
--- a/hw/sh7750.c
+++ b/hw/sh7750.c
@@ -30,6 +30,7 @@
 #include "sh7750_regs.h"
 #include "sh7750_regnames.h"
 #include "sh_intc.h"
+#include "sh_serial.h"
 #include "exec-all.h"
 #include "cpu.h"
 
@@ -714,6 +715,19 @@ static CPUWriteMemoryFunc *sh7750_mmct_write[] = {
     sh7750_mmct_writel
 };
 
+sh_serial_regs sh7750_sci = {
+    .scsmr = 0x0,
+    .scbrr = 0x4,
+    .scscr = 0x8,
+    .scfcr = 0x18,
+    .scfdr = 0x1c,
+    .scftdr = 0xc,
+    .scfsr = 0x10,
+    .scfrdr = 0x14,
+    .scsptr = 0x20,
+    .sclsr = 0x24,
+};
+
 SH7750State *sh7750_init(CPUSH4State * cpu)
 {
     SH7750State *s;
@@ -755,13 +769,15 @@ SH7750State *sh7750_init(CPUSH4State * cpu)
 
     cpu->intc_handle = &s->intc;
 
-    sh_serial_init(0x1fe00000, 0, s->periph_freq, serial_hds[0],
+    sh_serial_init(0x1fe00000, 0, &sh7750_sci,
+		   s->periph_freq, serial_hds[0],
 		   s->intc.irqs[SCI1_ERI],
 		   s->intc.irqs[SCI1_RXI],
 		   s->intc.irqs[SCI1_TXI],
 		   s->intc.irqs[SCI1_TEI],
 		   NULL);
     sh_serial_init(0x1fe80000, SH_SERIAL_FEAT_SCIF,
+		   &sh7750_sci,
 		   s->periph_freq, serial_hds[1],
 		   s->intc.irqs[SCIF_ERI],
 		   s->intc.irqs[SCIF_RXI],
diff --git a/hw/sh_serial.c b/hw/sh_serial.c
index 3db7e6c..3d5870d 100644
--- a/hw/sh_serial.c
+++ b/hw/sh_serial.c
@@ -26,6 +26,7 @@
  */
 #include "hw.h"
 #include "sh.h"
+#include "sh_serial.h"
 #include "qemu-char.h"
 #include <assert.h>
 
@@ -65,6 +66,8 @@ typedef struct {
     qemu_irq txi;
     qemu_irq tei;
     qemu_irq bri;
+
+    sh_serial_regs *regs;
 } sh_serial_state;
 
 static void sh_serial_clear_fifo(sh_serial_state * s)
@@ -78,20 +81,22 @@ static void sh_serial_clear_fifo(sh_serial_state * s)
 static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
 {
     sh_serial_state *s = opaque;
+    sh_serial_regs *regs = s->regs;
     unsigned char ch;
 
 #ifdef DEBUG_SERIAL
     printf("sh_serial: write offs=0x%02x val=0x%02x\n",
 	   offs, val);
 #endif
-    switch(offs) {
-    case 0x00: /* SMR */
+    if (offs == regs->scsmr) { /* SMR */
         s->smr = val & ((s->feat & SH_SERIAL_FEAT_SCIF) ? 0x7b : 0xff);
         return;
-    case 0x04: /* BRR */
+    }
+    if (offs == regs->scbrr) { /* BRR */
         s->brr = val;
 	return;
-    case 0x08: /* SCR */
+    }
+    if (offs == regs->scscr) { /* SCR */
         /* TODO : For SH7751, SCIF mask should be 0xfb. */
         s->scr = val & ((s->feat & SH_SERIAL_FEAT_SCIF) ? 0xfa : 0xff);
         if (!(val & (1 << 5)))
@@ -103,7 +108,8 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
 	    qemu_set_irq(s->rxi, 0);
         }
         return;
-    case 0x0c: /* FTDR / TDR */
+    }
+    if (offs == regs->scftdr) { /* FTDR / TDR */
         if (s->chr) {
             ch = val;
             qemu_chr_write(s->chr, &ch, 1);
@@ -114,14 +120,13 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
         else
             s->flags |= SH_SERIAL_FLAG_TDE;
         return;
-    case 0x14: /* FRDR / RDR */
+    }
+    if (offs == regs->scfrdr) { /* FRDR / RDR */
         /* do nothing */
         return;
-        break;
     }
     if (s->feat & SH_SERIAL_FEAT_SCIF) {
-        switch(offs) {
-        case 0x10: /* FSR */
+        if (offs == regs->scfsr) { /* FSR */
             if (!(val & (1 << 6)))
                 s->flags &= ~SH_SERIAL_FLAG_TEND;
             if (!(val & (1 << 5)))
@@ -139,7 +144,8 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
                 }
             }
             return;
-        case 0x18: /* FCR */
+	}
+        if (offs == regs->scfcr) { /* FCR */
             s->fcr = val;
             switch ((val >> 6) & 3) {
             case 0:
@@ -161,15 +167,16 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
             }
 
             return;
-        case 0x20: /* SPTR */
+	}
+        if (offs == regs->scsptr) { /* SPTR */
             s->sptr = val & 0xf3;
             return;
-        case 0x24: /* LSR */
+	}
+        if (offs == regs->sclsr) { /* LSR */
             return;
         }
     } else { /* SCI */
-        switch(offs) {
-        case 0x0c: /* TDR */
+        if (offs == regs->scftdr) { /* TDR */
             if (s->chr) {
                 ch = val;
                 qemu_chr_write(s->chr, &ch, 1);
@@ -180,7 +187,8 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
                 qemu_set_irq(s->txi, 1);
             }
             return;
-        case 0x10: /* SSR */
+	}
+        if (offs == regs->scfsr) { /* SSR */
             /*
              * Ignore TDRE (1 << 7) bit because TDR is always
              * writable in this SCI emulation.
@@ -194,7 +202,8 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
                 }
             }
             return;
-        case 0x1c:
+	}
+        if (offs == regs->scfdr) {
             s->sptr = val & 0x8f;
             return;
         }
@@ -207,22 +216,18 @@ static void sh_serial_ioport_write(void *opaque, uint32_t offs, uint32_t val)
 static uint32_t sh_serial_ioport_read(void *opaque, uint32_t offs)
 {
     sh_serial_state *s = opaque;
+    sh_serial_regs *regs = s->regs;
     uint32_t ret = ~0;
 
-    switch(offs) {
-    case 0x00: /* SMR */
+    if (offs == regs->scsmr) { /* SMR */
         ret = s->smr;
-        break;
-    case 0x04: /* BRR */
+    } else if (offs == regs->scbrr) { /* BRR */
         ret = s->brr;
-	break;
-    case 0x08: /* SCR */
+    } else if (offs == regs->scscr) { /* SCR */
         ret = s->scr;
-        break;
     }
     if (s->feat & SH_SERIAL_FEAT_SCIF) {
-        switch(offs) {
-        case 0x10: /* FSR */
+        if (offs == regs->scfsr) { /* FSR */
             ret = 0;
             if (s->flags & SH_SERIAL_FLAG_TEND)
                 ret |= (1 << 6);
@@ -238,8 +243,7 @@ static uint32_t sh_serial_ioport_read(void *opaque, uint32_t offs)
             if (s->scr & (1 << 5))
                 s->flags |= SH_SERIAL_FLAG_TDE | SH_SERIAL_FLAG_TEND;
 
-            break;
-        case 0x14:
+        } else if (offs == regs->scfrdr) {
             if (s->rx_cnt > 0) {
                 ret = s->rx_fifo[s->rx_tail++];
                 s->rx_cnt--;
@@ -248,26 +252,19 @@ static uint32_t sh_serial_ioport_read(void *opaque, uint32_t offs)
                 if (s->rx_cnt < s->rtrg)
                     s->flags &= ~SH_SERIAL_FLAG_RDF;
             }
-            break;
-        case 0x18:
+        } else if (offs == regs->scfcr) {
             ret = s->fcr;
-            break;
-        case 0x1c:
+        } else if (offs == regs->scfdr) {
             ret = s->rx_cnt;
-            break;
-        case 0x20:
+        } else if (offs == regs->scsptr) {
             ret = s->sptr;
-            break;
-        case 0x24:
+        } else if (offs == regs->sclsr) {
             ret = 0;
-            break;
         }
     } else {
-        switch(offs) {
-        case 0x0c: /* TDR */
+        if (offs == regs->scftdr) { /* TDR */
             ret = s->dr;
-            break;
-        case 0x10: /* SSR */
+        } else if (offs == regs->scfsr) { /* SSR */
             ret = 0;
             if (s->flags & SH_SERIAL_FLAG_TDE)
                 ret |= (1 << 7);
@@ -276,14 +273,11 @@ static uint32_t sh_serial_ioport_read(void *opaque, uint32_t offs)
             if (s->flags & SH_SERIAL_FLAG_TEND)
                 ret |= (1 << 2);
             /* TODO : handle bit MPBT */
-            break;
-        case 0x14: /* RDR */
+        } else if (offs == regs->scfrdr) { /* RDR */
             ret = s->rx_fifo[0];
             s->flags &= ~SH_SERIAL_FLAG_RDF;
-            break;
-        case 0x1c: /* SPTR */
+        } else if (offs == regs->scsptr) { /* SPTR */
             ret = s->sptr;
-            break;
         }
     }
 #ifdef DEBUG_SERIAL
@@ -379,6 +373,7 @@ static CPUWriteMemoryFunc *sh_serial_writefn[] = {
 };
 
 void sh_serial_init (target_phys_addr_t base, int feat,
+		     sh_serial_regs *regs,
 		     uint32_t freq, CharDriverState *chr,
 		     qemu_irq eri_source,
 		     qemu_irq rxi_source,
@@ -394,6 +389,7 @@ void sh_serial_init (target_phys_addr_t base, int feat,
         return;
 
     s->feat = feat;
+    s->regs = regs;
     s->flags = SH_SERIAL_FLAG_TEND | SH_SERIAL_FLAG_TDE;
     s->rtrg = 1;
 
diff --git a/hw/sh_serial.h b/hw/sh_serial.h
new file mode 100644
index 0000000..69345ac
--- /dev/null
+++ b/hw/sh_serial.h
@@ -0,0 +1,41 @@
+/*
+ * QEMU SH serial port emulation
+ *
+ * Copyright (c) 2009 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __SH_SERIAL_H__
+#define __SH_SERIAL_H__
+
+typedef struct sh_serial_regs {
+    uint16_t scsmr;
+    uint16_t scbrr;
+    uint16_t scscr;
+    uint16_t scfcr;
+    uint16_t scfdr;
+    uint16_t scftdr;
+    uint16_t scfsr;
+    uint16_t scfrdr;
+    uint16_t scsptr;
+    uint16_t sclsr;
+} sh_serial_regs;
+
+#endif /* __SH_SERIAL_H__ */
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi
  2009-01-11 14:22 ` [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi Edgar E. Iglesias
@ 2009-01-12  5:13   ` Shin-ichiro KAWASAKI
  2009-01-13  0:21     ` Shin-ichiro KAWASAKI
  0 siblings, 1 reply; 9+ messages in thread
From: Shin-ichiro KAWASAKI @ 2009-01-12  5:13 UTC (permalink / raw)
  To: Edgar E. Iglesias; +Cc: qemu-devel

Thank you for your commmnts!

Edgar E. Iglesias wrote:
> On Sun, Jan 11, 2009 at 06:14:40PM +0900, Shin-ichiro KAWASAKI wrote:
>> Current sh4's "movca.l" instruction is implemented in a same way
>> as "mov.l". Then, write to memory cannot be canceled by "ocbi"
>> cache control instrunction.  This makes text area broken on
>> cache flush by linux kernel.
>>
>> This patch delays "movca.l" execution and provide the chance to 
>> cancel it for "ocbi".
>> # Thank you Edgar, for your advice!
>>
>> This patch does,
>> - on executing "movca.l", just records what and where movca 
>>   should store data.
>> - on executing "ocbi", find the corresponding record by movca,
>>   and delete it to cancel.
>> - lets TCG produce "delayed_movca" instruction at the first
>>   instruction which is neither "movca.l" nor "ocbi".
>> - on executing "delayed_movca", does the data store task,
>>   according to the record. 
>>
> 
> Hello,
> 
> I think your patch will catch the linux dflush sequence but I've got a
> few comments.
> 
> There is a complication with the delayed stores and potential mmu
> exceptions that they might generate. I think with this approach
> you'll take those faults at the next instruction which might not
> even be a load/store. Not sure if that can cause problems for sh.
> Maybe you could avoid that by making movca a load + store and the
> ocbi a store with the original value loaded at movca. But you'll
> still get into trouble with exceptions that might break the
> movca/ocbi sequence.
That's an idea. Current implementation is "delay and do-or-cancel" way.
But your idea is "do and commit-or-rollback" way.
I'll try to implement it to avoid mmu fault by movca.
The mmu fault by movca will cause invoke exception handler, and break
movca/ocbi sequence.  It will be a quite rare case, however, I'll consider
it for next version.

> I don't it's worth having a dynamically sized movca delay buffer.
> The movca insn has a restricted addressing mode making it a bit hard
> to use back-to-back without alu insns in between. IIU your code
> correctly you flush the buffer before every non movca/ocbi insn so I
> guess that for most realistic use-cases you'll only need very few
> (maybe only 1) entries in the delay buffer.
That is a point I wavered.
I investigated linux kernel and found following lines in
"__flush_dcache_segment_4way()" in "arch/sh/mm/cache-sh4.c".

	a0 = base_addr;
	a1 = a0 + way_incr;
	a2 = a1 + way_incr;
	a3 = a2 + way_incr;
	a0e = base_addr + extent_per_way;
	do {
		asm volatile("ldc %0, sr" : : "r" (sr_with_bl));
		asm volatile("movca.l r0, @%0\n\t"
			     "movca.l r0, @%1\n\t"
			     "movca.l r0, @%2\n\t"
			     "movca.l r0, @%3\n\t"
			     "ocbi @%0\n\t"
			     "ocbi @%1\n\t"
			     "ocbi @%2\n\t"
			     "ocbi @%3\n\t" : :
			     "r" (a0), "r" (a1), "r" (a2), "r" (a3));

Multiple movca.l insns executed consequently, and they access different lines.
At least, 4 entries are needed for it.  This code shows kernel implementation
decides how many number of movcas are invoked consequently.
That's the reason why I introudcued dynamic size change.  


>> Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>
>>
>> Index: trunk/target-sh4/op_helper.c
>> ===================================================================
>> --- trunk/target-sh4/op_helper.c	(revision 6133)
>> +++ trunk/target-sh4/op_helper.c	(working copy)
>> @@ -20,6 +20,7 @@
>>  #include <assert.h>
>>  #include "exec.h"
>>  #include "helper.h"
>> +void *qemu_mallocz(size_t size);
>>  
>>  #ifndef CONFIG_USER_ONLY
>>  
>> @@ -604,3 +605,57 @@
>>      d.ll = t0;
>>      return float64_to_int32_round_to_zero(d.d, &env->fp_status);
>>  }
>> +
>> +void helper_movca(uint32_t val, uint32_t addr)
>> +{
>> +    delayed_movca_t *cur = &env->movca_list;
>> +    delayed_movca_t *prev = NULL;
>> +    while (cur) {
>> +	if (!cur->valid) {
> 
> Might be overkill but you can merge here, i.e:
> 
> 	if (!cur->valid || cur->addr == addr) {
> 
> 
>> +	    cur->valid = 1;
>> +	    cur->value = val;
>> +	    cur->addr = addr;
>> +	    return;
>> +	}
>> +	prev = cur;
>> +	cur = cur->next;
>> +    }
>> +
>> +    /* movca entry shortage. allocate it. */
>> +    prev->next = cur = qemu_mallocz(sizeof(delayed_movca_t));
>> +    if (cur == NULL) {
>> +	printf("out of memory for delayed movca. @%08x\n", addr);
>> +	return;
>> +    }
>> +    cur->valid = 1;
>> +    cur->value = val;
>> +    cur->addr = addr;
>> +}
>> +
>> +void helper_ocbi(uint32_t addr)
>> +{
>> +    delayed_movca_t *cur = &env->movca_list;
>> +
>> +    while (cur) {
>> +	if (cur->valid && cur->addr == addr) { /* found! */
>> +	    cur->valid = 0;
>> +	    return;
>> +	}
>> +	cur = cur->next;
>> +    }
> 
> I think you can quite easy catch a few more movca use cases if you
> dont compare line offsets, i.e for sh4 you'd mask the addr & ~(32 - 1).
> Also, you should not return until you've scanned the entire movca delay
> buffer as there might be multiple stores to the same line that need to be
> ignored.
> 
> example:
>     addr &= ~31;
>     while (cur) {
>        if (cur->valid && (cur->addr & ~31) == addr)
>            cur->valid = 0;
>        cur = cur->next;
>     }

That's a good advice.  I'll try to implement it for next version.

Thanks again for your review!

Regards,
Shin-ichiro KAWASAKI

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] sh: SCI improvements
  2009-01-11  9:16 ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Shin-ichiro KAWASAKI
  2009-01-11 15:30   ` Jean-Christophe PLAGNIOL-VILLARD
  2009-01-11 17:02   ` [Qemu-devel] [PATCH] sh/serial: allow cpu regs definition Jean-Christophe PLAGNIOL-VILLARD
@ 2009-01-12  6:42   ` Jean-Christophe PLAGNIOL-VILLARD
  2 siblings, 0 replies; 9+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2009-01-12  6:42 UTC (permalink / raw)
  To: Shin-ichiro KAWASAKI; +Cc: qemu-devel

On 18:16 Sun 11 Jan     , Shin-ichiro KAWASAKI wrote:
> SE7750 uses SCI := Serial Communication Interface for one of consoles.
> This patch completes the SCI implementation, and makes SCI available
> for a console.
> 
> # Tabs and spaces are mixed in "hw/sh_serial.c" so much.
> # Some clean up might be useful.
> 
> 
> Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>
> 
(I'm working on it and have result to nearly the same patch)
I've test your code against current U-Boot
and I've found that the input stell no working

after some debug I've that qemu call twice the function sh_serial_can_receive1
which return false twice. Just after U-Boot enable the receive (writing RE and
TE in SCR) but qemu will not re-call sh_serial_can_receive1.

As test If we force to always return 1 to sh_serial_can_receive1 the input
work

I've start to search how to notify qemu that U-Boot active/desativate the
receive

Best Regards,
J.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi
  2009-01-12  5:13   ` Shin-ichiro KAWASAKI
@ 2009-01-13  0:21     ` Shin-ichiro KAWASAKI
  2009-01-14 18:46       ` Edgar E. Iglesias
  0 siblings, 1 reply; 9+ messages in thread
From: Shin-ichiro KAWASAKI @ 2009-01-13  0:21 UTC (permalink / raw)
  To: Shin-ichiro KAWASAKI; +Cc: qemu-devel, Edgar E. Iglesias

I've completely passed over Vladimir-san's nice patch on movca-ocbi,
even though it is already in qemu-sh staging repository.

http://lists.gnu.org/archive/html/qemu-devel/2008-12/msg00605.html

But now, I can review the patch.  I'll do it later.

Regards,
Shin-ichiro KAWASAKI


Shin-ichiro KAWASAKI wrote:
> Thank you for your commmnts!
> 
> Edgar E. Iglesias wrote:
>> On Sun, Jan 11, 2009 at 06:14:40PM +0900, Shin-ichiro KAWASAKI wrote:
>>> Current sh4's "movca.l" instruction is implemented in a same way
>>> as "mov.l". Then, write to memory cannot be canceled by "ocbi"
>>> cache control instrunction.  This makes text area broken on
>>> cache flush by linux kernel.
>>>
>>> This patch delays "movca.l" execution and provide the chance to 
>>> cancel it for "ocbi".
>>> # Thank you Edgar, for your advice!
>>>
>>> This patch does,
>>> - on executing "movca.l", just records what and where movca   should 
>>> store data.
>>> - on executing "ocbi", find the corresponding record by movca,
>>>   and delete it to cancel.
>>> - lets TCG produce "delayed_movca" instruction at the first
>>>   instruction which is neither "movca.l" nor "ocbi".
>>> - on executing "delayed_movca", does the data store task,
>>>   according to the record.
>>
>> Hello,
>>
>> I think your patch will catch the linux dflush sequence but I've got a
>> few comments.
>>
>> There is a complication with the delayed stores and potential mmu
>> exceptions that they might generate. I think with this approach
>> you'll take those faults at the next instruction which might not
>> even be a load/store. Not sure if that can cause problems for sh.
>> Maybe you could avoid that by making movca a load + store and the
>> ocbi a store with the original value loaded at movca. But you'll
>> still get into trouble with exceptions that might break the
>> movca/ocbi sequence.
> That's an idea. Current implementation is "delay and do-or-cancel" way.
> But your idea is "do and commit-or-rollback" way.
> I'll try to implement it to avoid mmu fault by movca.
> The mmu fault by movca will cause invoke exception handler, and break
> movca/ocbi sequence.  It will be a quite rare case, however, I'll consider
> it for next version.
> 
>> I don't it's worth having a dynamically sized movca delay buffer.
>> The movca insn has a restricted addressing mode making it a bit hard
>> to use back-to-back without alu insns in between. IIU your code
>> correctly you flush the buffer before every non movca/ocbi insn so I
>> guess that for most realistic use-cases you'll only need very few
>> (maybe only 1) entries in the delay buffer.
> That is a point I wavered.
> I investigated linux kernel and found following lines in
> "__flush_dcache_segment_4way()" in "arch/sh/mm/cache-sh4.c".
> 
>     a0 = base_addr;
>     a1 = a0 + way_incr;
>     a2 = a1 + way_incr;
>     a3 = a2 + way_incr;
>     a0e = base_addr + extent_per_way;
>     do {
>         asm volatile("ldc %0, sr" : : "r" (sr_with_bl));
>         asm volatile("movca.l r0, @%0\n\t"
>                  "movca.l r0, @%1\n\t"
>                  "movca.l r0, @%2\n\t"
>                  "movca.l r0, @%3\n\t"
>                  "ocbi @%0\n\t"
>                  "ocbi @%1\n\t"
>                  "ocbi @%2\n\t"
>                  "ocbi @%3\n\t" : :
>                  "r" (a0), "r" (a1), "r" (a2), "r" (a3));
> 
> Multiple movca.l insns executed consequently, and they access different 
> lines.
> At least, 4 entries are needed for it.  This code shows kernel 
> implementation
> decides how many number of movcas are invoked consequently.
> That's the reason why I introudcued dynamic size change. 
> 
>>> Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>
>>>
>>> Index: trunk/target-sh4/op_helper.c
>>> ===================================================================
>>> --- trunk/target-sh4/op_helper.c    (revision 6133)
>>> +++ trunk/target-sh4/op_helper.c    (working copy)
>>> @@ -20,6 +20,7 @@
>>>  #include <assert.h>
>>>  #include "exec.h"
>>>  #include "helper.h"
>>> +void *qemu_mallocz(size_t size);
>>>  
>>>  #ifndef CONFIG_USER_ONLY
>>>  
>>> @@ -604,3 +605,57 @@
>>>      d.ll = t0;
>>>      return float64_to_int32_round_to_zero(d.d, &env->fp_status);
>>>  }
>>> +
>>> +void helper_movca(uint32_t val, uint32_t addr)
>>> +{
>>> +    delayed_movca_t *cur = &env->movca_list;
>>> +    delayed_movca_t *prev = NULL;
>>> +    while (cur) {
>>> +    if (!cur->valid) {
>>
>> Might be overkill but you can merge here, i.e:
>>
>>     if (!cur->valid || cur->addr == addr) {
>>
>>
>>> +        cur->valid = 1;
>>> +        cur->value = val;
>>> +        cur->addr = addr;
>>> +        return;
>>> +    }
>>> +    prev = cur;
>>> +    cur = cur->next;
>>> +    }
>>> +
>>> +    /* movca entry shortage. allocate it. */
>>> +    prev->next = cur = qemu_mallocz(sizeof(delayed_movca_t));
>>> +    if (cur == NULL) {
>>> +    printf("out of memory for delayed movca. @%08x\n", addr);
>>> +    return;
>>> +    }
>>> +    cur->valid = 1;
>>> +    cur->value = val;
>>> +    cur->addr = addr;
>>> +}
>>> +
>>> +void helper_ocbi(uint32_t addr)
>>> +{
>>> +    delayed_movca_t *cur = &env->movca_list;
>>> +
>>> +    while (cur) {
>>> +    if (cur->valid && cur->addr == addr) { /* found! */
>>> +        cur->valid = 0;
>>> +        return;
>>> +    }
>>> +    cur = cur->next;
>>> +    }
>>
>> I think you can quite easy catch a few more movca use cases if you
>> dont compare line offsets, i.e for sh4 you'd mask the addr & ~(32 - 1).
>> Also, you should not return until you've scanned the entire movca delay
>> buffer as there might be multiple stores to the same line that need to be
>> ignored.
>>
>> example:
>>     addr &= ~31;
>>     while (cur) {
>>        if (cur->valid && (cur->addr & ~31) == addr)
>>            cur->valid = 0;
>>        cur = cur->next;
>>     }
> 
> That's a good advice.  I'll try to implement it for next version.
> 
> Thanks again for your review!
> 
> Regards,
> Shin-ichiro KAWASAKI
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi
  2009-01-13  0:21     ` Shin-ichiro KAWASAKI
@ 2009-01-14 18:46       ` Edgar E. Iglesias
  0 siblings, 0 replies; 9+ messages in thread
From: Edgar E. Iglesias @ 2009-01-14 18:46 UTC (permalink / raw)
  To: Shin-ichiro KAWASAKI; +Cc: Edgar E. Iglesias, qemu-devel

On Tue, Jan 13, 2009 at 09:21:18AM +0900, Shin-ichiro KAWASAKI wrote:
> I've completely passed over Vladimir-san's nice patch on movca-ocbi,
> even though it is already in qemu-sh staging repository.
>
> http://lists.gnu.org/archive/html/qemu-devel/2008-12/msg00605.html

Hah funny, that patch looks very much like yours :)
I'll try to review it too.

Cheers

>
> But now, I can review the patch.  I'll do it later.
>
> Regards,
> Shin-ichiro KAWASAKI
>
>
> Shin-ichiro KAWASAKI wrote:
>> Thank you for your commmnts!
>> Edgar E. Iglesias wrote:
>>> On Sun, Jan 11, 2009 at 06:14:40PM +0900, Shin-ichiro KAWASAKI wrote:
>>>> Current sh4's "movca.l" instruction is implemented in a same way
>>>> as "mov.l". Then, write to memory cannot be canceled by "ocbi"
>>>> cache control instrunction.  This makes text area broken on
>>>> cache flush by linux kernel.
>>>>
>>>> This patch delays "movca.l" execution and provide the chance to cancel 
>>>> it for "ocbi".
>>>> # Thank you Edgar, for your advice!
>>>>
>>>> This patch does,
>>>> - on executing "movca.l", just records what and where movca   should 
>>>> store data.
>>>> - on executing "ocbi", find the corresponding record by movca,
>>>>   and delete it to cancel.
>>>> - lets TCG produce "delayed_movca" instruction at the first
>>>>   instruction which is neither "movca.l" nor "ocbi".
>>>> - on executing "delayed_movca", does the data store task,
>>>>   according to the record.
>>>
>>> Hello,
>>>
>>> I think your patch will catch the linux dflush sequence but I've got a
>>> few comments.
>>>
>>> There is a complication with the delayed stores and potential mmu
>>> exceptions that they might generate. I think with this approach
>>> you'll take those faults at the next instruction which might not
>>> even be a load/store. Not sure if that can cause problems for sh.
>>> Maybe you could avoid that by making movca a load + store and the
>>> ocbi a store with the original value loaded at movca. But you'll
>>> still get into trouble with exceptions that might break the
>>> movca/ocbi sequence.
>> That's an idea. Current implementation is "delay and do-or-cancel" way.
>> But your idea is "do and commit-or-rollback" way.
>> I'll try to implement it to avoid mmu fault by movca.
>> The mmu fault by movca will cause invoke exception handler, and break
>> movca/ocbi sequence.  It will be a quite rare case, however, I'll consider
>> it for next version.
>>> I don't it's worth having a dynamically sized movca delay buffer.
>>> The movca insn has a restricted addressing mode making it a bit hard
>>> to use back-to-back without alu insns in between. IIU your code
>>> correctly you flush the buffer before every non movca/ocbi insn so I
>>> guess that for most realistic use-cases you'll only need very few
>>> (maybe only 1) entries in the delay buffer.
>> That is a point I wavered.
>> I investigated linux kernel and found following lines in
>> "__flush_dcache_segment_4way()" in "arch/sh/mm/cache-sh4.c".
>>     a0 = base_addr;
>>     a1 = a0 + way_incr;
>>     a2 = a1 + way_incr;
>>     a3 = a2 + way_incr;
>>     a0e = base_addr + extent_per_way;
>>     do {
>>         asm volatile("ldc %0, sr" : : "r" (sr_with_bl));
>>         asm volatile("movca.l r0, @%0\n\t"
>>                  "movca.l r0, @%1\n\t"
>>                  "movca.l r0, @%2\n\t"
>>                  "movca.l r0, @%3\n\t"
>>                  "ocbi @%0\n\t"
>>                  "ocbi @%1\n\t"
>>                  "ocbi @%2\n\t"
>>                  "ocbi @%3\n\t" : :
>>                  "r" (a0), "r" (a1), "r" (a2), "r" (a3));
>> Multiple movca.l insns executed consequently, and they access different 
>> lines.
>> At least, 4 entries are needed for it.  This code shows kernel 
>> implementation
>> decides how many number of movcas are invoked consequently.
>> That's the reason why I introudcued dynamic size change. 
>>>> Signed-off-by: Shin-ichiro KAWASAKI <kawasaki@juno.dti.ne.jp>
>>>>
>>>> Index: trunk/target-sh4/op_helper.c
>>>> ===================================================================
>>>> --- trunk/target-sh4/op_helper.c    (revision 6133)
>>>> +++ trunk/target-sh4/op_helper.c    (working copy)
>>>> @@ -20,6 +20,7 @@
>>>>  #include <assert.h>
>>>>  #include "exec.h"
>>>>  #include "helper.h"
>>>> +void *qemu_mallocz(size_t size);
>>>>   #ifndef CONFIG_USER_ONLY
>>>>  @@ -604,3 +605,57 @@
>>>>      d.ll = t0;
>>>>      return float64_to_int32_round_to_zero(d.d, &env->fp_status);
>>>>  }
>>>> +
>>>> +void helper_movca(uint32_t val, uint32_t addr)
>>>> +{
>>>> +    delayed_movca_t *cur = &env->movca_list;
>>>> +    delayed_movca_t *prev = NULL;
>>>> +    while (cur) {
>>>> +    if (!cur->valid) {
>>>
>>> Might be overkill but you can merge here, i.e:
>>>
>>>     if (!cur->valid || cur->addr == addr) {
>>>
>>>
>>>> +        cur->valid = 1;
>>>> +        cur->value = val;
>>>> +        cur->addr = addr;
>>>> +        return;
>>>> +    }
>>>> +    prev = cur;
>>>> +    cur = cur->next;
>>>> +    }
>>>> +
>>>> +    /* movca entry shortage. allocate it. */
>>>> +    prev->next = cur = qemu_mallocz(sizeof(delayed_movca_t));
>>>> +    if (cur == NULL) {
>>>> +    printf("out of memory for delayed movca. @%08x\n", addr);
>>>> +    return;
>>>> +    }
>>>> +    cur->valid = 1;
>>>> +    cur->value = val;
>>>> +    cur->addr = addr;
>>>> +}
>>>> +
>>>> +void helper_ocbi(uint32_t addr)
>>>> +{
>>>> +    delayed_movca_t *cur = &env->movca_list;
>>>> +
>>>> +    while (cur) {
>>>> +    if (cur->valid && cur->addr == addr) { /* found! */
>>>> +        cur->valid = 0;
>>>> +        return;
>>>> +    }
>>>> +    cur = cur->next;
>>>> +    }
>>>
>>> I think you can quite easy catch a few more movca use cases if you
>>> dont compare line offsets, i.e for sh4 you'd mask the addr & ~(32 - 1).
>>> Also, you should not return until you've scanned the entire movca delay
>>> buffer as there might be multiple stores to the same line that need to be
>>> ignored.
>>>
>>> example:
>>>     addr &= ~31;
>>>     while (cur) {
>>>        if (cur->valid && (cur->addr & ~31) == addr)
>>>            cur->valid = 0;
>>>        cur = cur->next;
>>>     }
>> That's a good advice.  I'll try to implement it for next version.
>> Thanks again for your review!
>> Regards,
>> Shin-ichiro KAWASAKI
>
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2009-01-14 18:46 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-11  9:14 [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi Shin-ichiro KAWASAKI
2009-01-11  9:16 ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Shin-ichiro KAWASAKI
2009-01-11 15:30   ` Jean-Christophe PLAGNIOL-VILLARD
2009-01-11 17:02   ` [Qemu-devel] [PATCH] sh/serial: allow cpu regs definition Jean-Christophe PLAGNIOL-VILLARD
2009-01-12  6:42   ` [Qemu-devel] [PATCH 3/3] sh: SCI improvements Jean-Christophe PLAGNIOL-VILLARD
2009-01-11 14:22 ` [Qemu-devel] [PATCH 2/3] sh: movca.l cancel by ocbi Edgar E. Iglesias
2009-01-12  5:13   ` Shin-ichiro KAWASAKI
2009-01-13  0:21     ` Shin-ichiro KAWASAKI
2009-01-14 18:46       ` Edgar E. Iglesias

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.