All of lore.kernel.org
 help / color / mirror / Atom feed
* shared memory problem on ARM v5TE using threads
@ 2009-12-04 11:23 Heiko Schocher
  2009-12-04 12:26 ` Joerg Wagner
                   ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 11:23 UTC (permalink / raw)
  To: linux-arm-kernel

Hello,

I have the following shared memory problem on an ARM v5TE
processor using Linux version 2.6.28,

-bash-3.2# cat /proc/cpuinfo
Processor       : Feroceon 88FR131 rev 1 (v5l)
BogoMIPS        : 799.53
Features        : swp half thumb fastmult edsp
CPU implementer : 0x56
CPU architecture: 5TE
CPU variant     : 0x2
CPU part        : 0x131
CPU revision    : 1

The test script [1] starts 2 processes. One is a write
process with one thread, which writes to a shared memory segment.

The second process starts 4 threads, which all read from
this shared memory. This doesn't work on this processor [4].
The same demo program [2] works fine on ppc, i386 or on an
ARMv6 based board [3][6] ...

If I start 4 read processes, each of which starts only
one read thread [5], the demo program works fine!

Also, if I start one read process, which attaches
the shared memory only once with shmat(), and then starts 4
read threads, and all 4 of these read threads use
the same shared memory address returned from shmat(),
this works as expected.

Any ideas, hints ... ?

TIA

bye,
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany


[1] Testscript

#!/bin/sh
echo "Compile"
gcc -o shmtest2 shmtest2.c -lpthread

echo "Run shmtest2"
./shmtest2 write 1 &
./shmtest2 read 4 &


[2] demoprogramm, shmtest2.c

#include <pthread.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

extern void      exit();

/*
 * Record placed at the start of the shared memory segment; the thread
 * functions in this program cast the attached address to this type.
 * The field order and types define the layout shared between processes.
 */
struct Entry
{
 char          ident_name[1000];  /* text label; the write thread copies a string here */
 unsigned int  tipc_nr;           /* set to 1000 by the write thread */
 unsigned int  pid;               /* set by the write thread to its thread index */
 unsigned int  in_msg;            /* counter the read threads increase by 1000 per second */
 unsigned int  out_msg;           /* counter the write thread increases by 1000 per second */
 unsigned int  rxQueueLength;     /* set to 20000 by the write thread */
};

/*
 * Attach the System V shared memory segment `shmid` to the calling process.
 *
 * shmat() is called with a NULL address and no flags, so the kernel picks
 * the mapping address; repeated calls can return different addresses for
 * the same segment.
 *
 * Returns the mapped address, or NULL when the attach fails.
 */
void* attachSharedMem(int shmid)
{
  void* addr = shmat(shmid, NULL, 0);

  /*
   * shmat() signals failure with (void *)-1.  Comparing against that value
   * directly is correct for every pointer width, unlike a comparison with
   * 0xffffffff after truncating the pointer to unsigned int.
   */
  if (addr == (void *)-1 || addr == NULL)
  {
    printf("shmat failed\n");
    return NULL;
  }

  /* "%x" with a pointer argument is undefined; print it as unsigned long. */
  printf("attach shared mem:%lx\n", (unsigned long)addr);
  return addr;
}

/*
 * Look up the test's shared memory segment, creating it if it does not
 * exist yet.
 *
 * The segment is identified by the fixed key 1000, is requested with a size
 * of 60000 bytes and is created with mode 0666, so every process that calls
 * this function with the segment present receives the same id.
 *
 * Returns the segment id, or -1 when shmget() fails.
 */
int createSharedMem()
{
  const key_t key    = 1000;               /* key to be passed to shmget() */
  const int   size   = 60000;              /* size to be passed to shmget() */
  const int   shmflg = IPC_CREAT | 0666;   /* shmflg to be passed to shmget() */
  int         shmid  = shmget(key, size, shmflg);

  if (shmid == -1)
  {
    /*
     * Leave the id at -1.  0 is a valid segment id, so reporting failure
     * as 0 could make the caller attach to an unrelated segment.
     */
    printf("shmget failed\n");
  }

  printf("Shared memory Id:%d\n",shmid);

  return shmid;
}


/*
 * Obtain the id of the test segment via createSharedMem() and map it into
 * the calling process via attachSharedMem().
 *
 * Returns whatever attachSharedMem() returns for that id.
 */
void* setupSharedMem()
{
  return attachSharedMem(createSharedMem());
}

/*
 * Entry point of a read thread.
 *
 * `t` carries the shared memory id, cast to a pointer by the creator.  The
 * thread makes its own attachment of the segment, zeroes in_msg and out_msg,
 * and then for 60 iterations adds 1000 to in_msg, sleeps one second and
 * prints both counters together with its mapping address.
 *
 * No locking or atomic operations are used on the shared counters, so
 * concurrent updaters can overwrite each other's values.
 *
 * Does not return; the thread ends through pthread_exit(NULL).
 */
void *readThread(void *t)
{
  /*
   * Go through long: converting a pointer straight to int is not portable
   * where pointers are wider than int.
   */
  int shmid = (int)(long)t;
  struct Entry *entry = attachSharedMem(shmid);

  if (entry != NULL)
  {
    /* Pointers are printed as unsigned long; "%x" with a pointer is undefined. */
    printf("Start Read Thread addr:%lx\n", (unsigned long)entry);
    entry->in_msg  = 0;
    entry->out_msg = 0;

    for (int i = 0; i < 60; i++)
    {
      entry->in_msg += 1000;
      sleep(1);
      /* The counters are unsigned, so they are printed with "%u". */
      printf("Read from entry in_msg=%u, out_msg=%u, addr=%lx\n",
             entry->in_msg, entry->out_msg, (unsigned long)entry);
    }

    /* Drop this thread's private mapping of the segment. */
    shmdt(entry);
  }

  pthread_exit(NULL);
}

/*
 * Entry point of a write thread.
 *
 * `t` carries the thread index, cast to a pointer by the creator.  The
 * thread obtains and attaches the shared segment through setupSharedMem(),
 * fills in the Entry at its start, and then for 60 iterations adds 1000 to
 * out_msg and sleeps one second.
 *
 * No locking or atomic operations are used on the shared counters.
 *
 * Does not return; the thread ends through pthread_exit(NULL).
 */
void *writeThread(void *t)
{
  /*
   * Go through unsigned long: converting a pointer straight to unsigned int
   * is not portable where pointers are wider than int.
   */
  unsigned int threadId = (unsigned int)(unsigned long)t;
  struct Entry *entry   = setupSharedMem();

  if (entry != NULL)
  {
    /* "%u" matches the unsigned id; the pointer is printed as unsigned long. */
    printf("Start Write Thread %u, addr:%lx\n", threadId, (unsigned long)entry);
    strcpy(entry->ident_name,"this is a test entry");
    entry->in_msg  = 0;
    entry->out_msg = 0;
    entry->rxQueueLength = 20000;
    entry->pid     = threadId;
    entry->tipc_nr = 1000;

    for (int i = 0; i < 60; i++)
    {
      entry->out_msg += 1000;
      sleep(1);
    }

    /* Drop this thread's mapping of the segment. */
    shmdt(entry);
  }

  pthread_exit(NULL);
}

/*
 * Usage: shmtest2 [read|write] [number of threads]
 *
 * Starts the requested number of threads and waits for all of them.
 *   write: each thread runs writeThread() and receives its thread index.
 *   read:  each thread runs readThread() and receives the shared memory id
 *          returned by createSharedMem().
 *
 * Returns 0 on success, 1 on bad arguments and -1 when a thread cannot be
 * created.
 */
int main(int argc, char* argv[])
{
    enum { MODE_WRITE = 1, MODE_READ = 2, MAX_THREADS = 1024 };

    //check the arguments
    if (argc != 3)
    {
       printf("Arguments are [read|write] [number of threads]\n");
       return 1;
    }

    unsigned int mode = 0;

    if (strcmp(argv[1],"write") == 0)
    {
      printf("Write to in_msg\n");
      mode = MODE_WRITE;
    }
    else if (strcmp(argv[1],"read") == 0)
    {
      printf("Read from in_msg\n");
      mode = MODE_READ;
    }
    else
    {
      /* An unknown mode used to fall through and silently run as "read". */
      printf("Arguments are [read|write] [number of threads]\n");
      return 1;
    }

    /*
     * Parse and bound the thread count: it sizes the array below, and a
     * zero-length or huge variable-length array is not safe.
     */
    unsigned int nbrOfThreads = 0;
    if (sscanf(argv[2], "%u", &nbrOfThreads) != 1 ||
        nbrOfThreads == 0 || nbrOfThreads > (unsigned int)MAX_THREADS)
    {
       printf("Number of threads must be between 1 and %d\n", (int)MAX_THREADS);
       return 1;
    }

    pthread_t threads[nbrOfThreads];
    pthread_attr_t attr;

    /* Create the threads joinable so that they can be waited for below */
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

    unsigned int t;
    int rc;
    for(t=0; t<nbrOfThreads; t++)
    {
      printf("Creating thread %u\n", t);
      if (mode == MODE_WRITE)
      {
        /* Widen to pointer size before converting the index to void *. */
        rc = pthread_create(&threads[t], &attr, writeThread, (void *)(unsigned long)t);
      }
      else
      {
        int shmid = createSharedMem();
        rc = pthread_create(&threads[t], &attr, readThread, (void *)(long)shmid);
      }

      if (rc)
      {
        printf("ERROR; return code from pthread_create() is %d\n", rc);
        return -1;
      }
    }

    void* status;
    pthread_attr_destroy(&attr);
    for(t=0; t<nbrOfThreads; t++)
    {
      pthread_join(threads[t], &status);
    }

    printf("All %s threads finished, exit\n",mode == MODE_WRITE ? "write":"read");

    return 0;
}

[3] ARMv6 processor cpuinfo
-bash-3.2# cat /proc/version
Linux version 2.6.32-rc6 (dzu@pollux.denx.de) (gcc version 4.2.2) #4 Thu Dec 3 12:31:13 CET 2009
-bash-3.2# cat /proc/cpuinfo
Processor       : ARMv6-compatible processor rev 4 (v6l)
BogoMIPS        : 398.13
Features        : swp half thumb fastmult vfp edsp java
CPU implementer : 0x41
CPU architecture: 6TEJ
CPU variant     : 0x0
CPU part        : 0xb36
CPU revision    : 4

[4] Log from Demoprogramm, not working

-bash-3.2# ./shtest2.sh
Run shmtest2
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Write to in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Write Thread 0, addr:40961000
348: write new mesg: 0
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Creating thread 1
Shared memory Id:0
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
attach shared mem:41170000
Start Read Thread addr:41170000
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
-bash-3.2# Read from entry in_msg=1000, out_msg=0, addr=40961000
Read from entry in_msg=0, out_msg=0, addr=40961000
Read from entry in_msg=2000, out_msg=1000, addr=40961000
Read from entry in_msg=1000, out_msg=0, addr=41170000
Read from entry in_msg=3000, out_msg=1000, addr=40961000
Read from entry in_msg=4000, out_msg=1000, addr=40961000
Read from entry in_msg=2000, out_msg=0, addr=40961000
Read from entry in_msg=5000, out_msg=2000, addr=40961000
Read from entry in_msg=3000, out_msg=0, addr=41170000
Read from entry in_msg=6000, out_msg=2000, addr=40961000
Read from entry in_msg=7000, out_msg=2000, addr=40961000
Read from entry in_msg=4000, out_msg=0, addr=40961000
Read from entry in_msg=8000, out_msg=3000, addr=40961000
Read from entry in_msg=5000, out_msg=0, addr=41170000
Read from entry in_msg=9000, out_msg=3000, addr=40961000
Read from entry in_msg=10000, out_msg=3000, addr=40961000
[...]

[5] Testscript which starts 4 read processes each with
    one read thread

#!/bin/sh
echo "Compile"
gcc -o shmtest2 shmtest2.c -lpthread

echo "Run shmtest2"
./shmtest2 write 1 &
./shmtest2 read 1 &
./shmtest2 read 1 &
./shmtest2 read 1 &
./shmtest2 read 1 &

[6] working log on ARMv6

-bash-3.2# ./shtest2.sh
Run shmtest2
Write to in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40960000
Start Write Thread 0, addr:40960000
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40960000
Start Read Thread addr:40960000
Creating thread 1
Shared memory Id:0
Creating thread 2
Shared memory Id:0
Creating thread 3
Shared memory Id:0
attach shared mem:4216f000
Start Read Thread addr:4216f000
attach shared mem:4217e000
Start Read Thread addr:4217e000
attach shared mem:4218d000
Start Read Thread addr:4218d000
-bash-3.2# Read from entry in_msg=1000, out_msg=1000, addr=40960000
Read from entry in_msg=2000, out_msg=1000, addr=4216f000
Read from entry in_msg=3000, out_msg=1000, addr=4217e000
Read from entry in_msg=4000, out_msg=1000, addr=4218d000
Read from entry in_msg=5000, out_msg=2000, addr=40960000
Read from entry in_msg=6000, out_msg=2000, addr=4216f000
Read from entry in_msg=7000, out_msg=2000, addr=4217e000
Read from entry in_msg=8000, out_msg=2000, addr=4218d000
Read from entry in_msg=9000, out_msg=3000, addr=40960000
Read from entry in_msg=10000, out_msg=3000, addr=4216f000
Read from entry in_msg=11000, out_msg=3000, addr=4217e000
Read from entry in_msg=12000, out_msg=3000, addr=4218d000
Read from entry in_msg=13000, out_msg=4000, addr=40960000
Read from entry in_msg=14000, out_msg=4000, addr=4216f000
Read from entry in_msg=15000, out_msg=4000, addr=4217e000
Read from entry in_msg=16000, out_msg=4000, addr=4218d000
[...]

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 11:23 shared memory problem on ARM v5TE using threads Heiko Schocher
@ 2009-12-04 12:26 ` Joerg Wagner
  2009-12-04 13:13 ` Russell King - ARM Linux
  2009-12-04 17:25 ` Nicolas Pitre
  2 siblings, 0 replies; 71+ messages in thread
From: Joerg Wagner @ 2009-12-04 12:26 UTC (permalink / raw)
  To: linux-arm-kernel

Could you elaborate what you are expecting to see?
You are "read-modify-write"'ing a shared location without using atomic instructions, so it seems reasonable that the updates of the entry->in_msg are not behaving like you want them to behave.

Joerg

> -----Original Message-----
> From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-
> kernel-bounces at lists.infradead.org] On Behalf Of Heiko Schocher
> Sent: Freitag, 4. Dezember 2009 12:24
> To: linux-arm-kernel at lists.infradead.org
> Subject: shared memory problem on ARM v5TE using threads
> 
> Hello,
> 
> I have the following shared mem problem on a ARM v5TE
> processor using Linux version 2.6.28,
> 
> -bash-3.2# cat /proc/cpuinfo
> Processor       : Feroceon 88FR131 rev 1 (v5l)
> BogoMIPS        : 799.53
> Features        : swp half thumb fastmult edsp
> CPU implementer : 0x56
> CPU architecture: 5TE
> CPU variant     : 0x2
> CPU part        : 0x131
> CPU revision    : 1
> 
> The testscript [1] starts 2 processes. One write
> process with one thread, which writes in a shared memory.
> 
> The second process starts 4 threads, which all read from
> this shared memory. This don;t work on this processor [4].
> The same demoprogramm works fine on ppc, i386 or on a
> ARMv6 based board [3][6] ...
> 
> If I start 4 read processes, which themselves starts only
> one readthread [5], the demoprogramm works fine!
> 
> Also, if I start one read process, which only attaches
> the shared memory once with shmat(), and then starts 4
> read threads, and all this 4 read threads using
> the same shared memory addr, returned from shmat(),
> this works as expected.
> 
> Any ideas, hints ... ?
> 
> TIA
> 
> bye,
> Heiko
> --
> DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
> HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
> 
> 
> [1] Testscript
> 
> #!/bin/sh
> echo "Compile"
> gcc -o shmtest2 shmtest2.c -lpthread
> 
> echo "Run shmtest2"
> ./shmtest2 write 1 &
> ./shmtest2 read 4 &
> 
> 
> [2] demoprogramm, shmtest2.c
> 
> #include <pthread.h>
> #include <sys/types.h>
> #include <sys/ipc.h>
> #include <sys/shm.h>
> #include <unistd.h>
> #include <stdio.h>
> #include <string.h>
> 
> extern void      exit();
> 
> struct Entry
> {
>  char          ident_name[1000];
>  unsigned int  tipc_nr;
>  unsigned int  pid;
>  unsigned int  in_msg;
>  unsigned int  out_msg;
>  unsigned int  rxQueueLength;
> };
> 
> void* attachSharedMem(int shmid)
> {
>   void* addr = shmat(shmid, NULL, 0);
>   if ((addr != 0) && (0xffffffff != (unsigned int)addr))
>   {
>     printf("attach shared mem:%x\n",addr);
>   }
>   else
>   {
>     printf("shmat failed");
>     addr = 0;
>   }
>   return addr;
> }
> 
> int createSharedMem()
> {
>   key_t   key     = 1000;          /* key to be passed to shmget() */
>   int     shmflg;                  /* shmflg to be passed to shmget()
> */
>   int     shmid;                   /* return value from shmget() */
>   int     size;                    /* size to be passed to shmget() */
> 
>   size   = 60000;
>   shmflg = IPC_CREAT | 0666;
>   if ((shmid = shmget (key, size, shmflg)) == -1)
>   {
>     printf("shmget failed");
>     shmid = 0;
>   }
> 
>   printf("Shared memory Id:%d\n",shmid);
> 
>   return shmid;
> }
> 
> 
> void* setupSharedMem()
> {
>   int shmid = createSharedMem();
>   void* addrShm = attachSharedMem(shmid);
>   return addrShm;
> }
> 
> void *readThread(void *t)
> {
>   struct Entry* entry   = 0;
> 
>   int shmid = (int)t;
>   void* addrShm = attachSharedMem(shmid);
> 
>   if (addrShm != 0)
>   {
>     printf("Start Read Thread addr:%x\n",addrShm);
>     entry = (struct Entry*)addrShm;
>     entry->in_msg  = 0;
>     entry->out_msg = 0;
> 
>     int i=0;
>     while(i < 60)
>     {
>       entry->in_msg += 1000;
>       sleep(1);
>       printf("Read from entry in_msg=%d, out_msg=%d, addr=%x\n",entry-
> >in_msg,entry->out_msg, addrShm);
>       i++;
>     }
>   }
> 
>   pthread_exit(NULL);
> }
> 
> void *writeThread(void *t)
> {
>   struct Entry* entry   = 0;
>   unsigned int threadId = (unsigned int)t;
>   void* addrShm         = setupSharedMem();
> 
>   if (addrShm != 0)
>   {
>     printf("Start Write Thread %d, addr:%x\n",threadId,addrShm);
>     entry = (struct Entry*)addrShm;
>     strcpy(entry->ident_name,"this is a test entry");
>     entry->in_msg  = 0;
>     entry->out_msg = 0;
>     entry->rxQueueLength = 20000;
>     entry->pid     = threadId;
>     entry->tipc_nr = 1000;
> 
>     int i=0;
>     while(i < 60)
>     {
>       entry->out_msg += 1000;
>       sleep(1);
>       i++;
>     }
>   }
> 
>   pthread_exit(NULL);
> }
> 
> main(int argc, char* argv[])
> {
> 
>     //check the arguments
>     if (argc != 3)
>     {
>        printf("Arguments are [read|write] [number of threads]\n");
>        exit(1);
>     }
> 
>     unsigned int mode         = 0;
>     unsigned int nbrOfThreads = 0;
> 
>     if (strcmp(argv[1],"write") == 0)
>     {
>       printf("Write to in_msg\n");
>       mode = 1;
>     }
> 
>     if (strcmp(argv[1],"read") == 0)
>     {
>       printf("Read from in_msg\n");
>       mode = 2;
>     }
> 
>     nbrOfThreads = atoi(argv[2]);
> 
>     pthread_t threads[nbrOfThreads];
>     pthread_attr_t attr;
> 
>     /* Initialize and set thread detached attribute */
>     pthread_attr_init(&attr);
>     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
> 
>     unsigned int t;
>     int rc;
>     for(t=0; t<nbrOfThreads; t++)
>     {
>       printf("Creating thread %ld\n", t);
>       if (mode == 1)
>       {
>         rc = pthread_create(&threads[t], &attr, writeThread, (void
> *)t);
>       }
>       else
>       {
>         int shmid = createSharedMem(t);
>         rc = pthread_create(&threads[t], &attr, readThread, (void
> *)shmid);
>       }
> 
>       if (rc)
>       {
>         printf("ERROR; return code from pthread_create() is %d\n", rc);
>         exit(-1);
>       }
>     }
> 
>     void* status;
>     pthread_attr_destroy(&attr);
>     for(t=0; t<nbrOfThreads; t++)
>     {
>       pthread_join(threads[t], &status);
>     }
> 
>     printf("All %s threads finished, exit\n",mode == 1 ?
> "write":"read");
> 
>     exit(0);
> }
> 
> [3] ARMv6 processor cpuinfo
> -bash-3.2# cat /proc/version
> Linux version 2.6.32-rc6 (dzu at pollux.denx.de) (gcc version 4.2.2) #4
> Thu Dec 3 12:31:13 CET 2009
> -bash-3.2# cat /proc/cpuinfo
> Processor       : ARMv6-compatible processor rev 4 (v6l)
> BogoMIPS        : 398.13
> Features        : swp half thumb fastmult vfp edsp java
> CPU implementer : 0x41
> CPU architecture: 6TEJ
> CPU variant     : 0x0
> CPU part        : 0xb36
> CPU revision    : 4
> 
> [4] Log from Demoprogramm, not working
> 
> -bash-3.2# ./shtest2.sh
> Run shmtest2
> Read from in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40961000
> Start Read Thread addr:40961000
> Write to in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40961000
> Start Write Thread 0, addr:40961000
> 348: write new mesg: 0
> Read from in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40961000
> Start Read Thread addr:40961000
> Creating thread 1
> Shared memory Id:0
> Read from in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40961000
> Start Read Thread addr:40961000
> attach shared mem:41170000
> Start Read Thread addr:41170000
> Read from in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40961000
> Start Read Thread addr:40961000
> -bash-3.2# Read from entry in_msg=1000, out_msg=0, addr=40961000
> Read from entry in_msg=0, out_msg=0, addr=40961000
> Read from entry in_msg=2000, out_msg=1000, addr=40961000
> Read from entry in_msg=1000, out_msg=0, addr=41170000
> Read from entry in_msg=3000, out_msg=1000, addr=40961000
> Read from entry in_msg=4000, out_msg=1000, addr=40961000
> Read from entry in_msg=2000, out_msg=0, addr=40961000
> Read from entry in_msg=5000, out_msg=2000, addr=40961000
> Read from entry in_msg=3000, out_msg=0, addr=41170000
> Read from entry in_msg=6000, out_msg=2000, addr=40961000
> Read from entry in_msg=7000, out_msg=2000, addr=40961000
> Read from entry in_msg=4000, out_msg=0, addr=40961000
> Read from entry in_msg=8000, out_msg=3000, addr=40961000
> Read from entry in_msg=5000, out_msg=0, addr=41170000
> Read from entry in_msg=9000, out_msg=3000, addr=40961000
> Read from entry in_msg=10000, out_msg=3000, addr=40961000
> [...]
> 
> [5] Testscript which starts 4 read processes each with
>     one read thread
> 
> #!/bin/sh
> echo "Compile"
> gcc -o shmtest2 shmtest2.c -lpthread
> 
> echo "Run shmtest2"
> ./shmtest2 write 1 &
> ./shmtest2 read 1 &
> ./shmtest2 read 1 &
> ./shmtest2 read 1 &
> ./shmtest2 read 1 &
> 
> [6] working log on ARMv6
> 
> -bash-3.2# ./shtest2.sh
> Run shmtest2
> Write to in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40960000
> Start Write Thread 0, addr:40960000
> Read from in_msg
> Creating thread 0
> Shared memory Id:0
> attach shared mem:40960000
> Start Read Thread addr:40960000
> Creating thread 1
> Shared memory Id:0
> Creating thread 2
> Shared memory Id:0
> Creating thread 3
> Shared memory Id:0
> attach shared mem:4216f000
> Start Read Thread addr:4216f000
> attach shared mem:4217e000
> Start Read Thread addr:4217e000
> attach shared mem:4218d000
> Start Read Thread addr:4218d000
> -bash-3.2# Read from entry in_msg=1000, out_msg=1000, addr=40960000
> Read from entry in_msg=2000, out_msg=1000, addr=4216f000
> Read from entry in_msg=3000, out_msg=1000, addr=4217e000
> Read from entry in_msg=4000, out_msg=1000, addr=4218d000
> Read from entry in_msg=5000, out_msg=2000, addr=40960000
> Read from entry in_msg=6000, out_msg=2000, addr=4216f000
> Read from entry in_msg=7000, out_msg=2000, addr=4217e000
> Read from entry in_msg=8000, out_msg=2000, addr=4218d000
> Read from entry in_msg=9000, out_msg=3000, addr=40960000
> Read from entry in_msg=10000, out_msg=3000, addr=4216f000
> Read from entry in_msg=11000, out_msg=3000, addr=4217e000
> Read from entry in_msg=12000, out_msg=3000, addr=4218d000
> Read from entry in_msg=13000, out_msg=4000, addr=40960000
> Read from entry in_msg=14000, out_msg=4000, addr=4216f000
> Read from entry in_msg=15000, out_msg=4000, addr=4217e000
> Read from entry in_msg=16000, out_msg=4000, addr=4218d000
> [...]
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 11:23 shared memory problem on ARM v5TE using threads Heiko Schocher
  2009-12-04 12:26 ` Joerg Wagner
@ 2009-12-04 13:13 ` Russell King - ARM Linux
  2009-12-04 13:42   ` Heiko Schocher
  2009-12-04 17:25 ` Nicolas Pitre
  2 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-04 13:13 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
> [4] Log from Demoprogramm, not working

I think this is messed up - this is not from your first script but your
second script which starts four independent read processes.

I determined this because:
(1) the read thread addresses are mostly the same
(2) there are four "Read from in_msg" strings, which you only print
once at the start of the program.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 13:13 ` Russell King - ARM Linux
@ 2009-12-04 13:42   ` Heiko Schocher
  2009-12-04 15:42     ` Russell King - ARM Linux
  0 siblings, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 13:42 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Russell King,

Russell King - ARM Linux wrote:
> On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
>> [4] Log from Demoprogramm, not working
> 
> I think this is messed up - this is not from your first script but your
> second script which starts four independent read processes.
> 
> I determined this because:
> (1) the read thread addresses are mostly the same
> (2) there are four "Read form in_msg" strings, which you only print
> once at the start of the program.

Oops, sorry for the confusion; here are 2 logs with the right values:

-bash-3.2# cat shtest2.sh
#!/bin/sh
echo "Run shmtest2"
./shmtest2 write 1 &
./shmtest2 read 4 &

-bash-3.2# cat shmtest2.c
#include <pthread.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

extern void      exit();

/*
 * Record placed at the start of the shared memory segment; the thread
 * functions in this program cast the attached address to this type.
 * The field order and types define the layout shared between processes.
 */
struct Entry
{
 char          ident_name[1000];  /* text label; the write thread copies a string here */
 unsigned int  tipc_nr;           /* set to 1000 by the write thread */
 unsigned int  pid;               /* set by the write thread to its thread index */
 unsigned int  in_msg;            /* counter the read threads increase by 1000 per second */
 unsigned int  out_msg;           /* counter the write thread increases by 1000 per second */
 unsigned int  rxQueueLength;     /* set to 20000 by the write thread */
};

/*
 * Attach the System V shared memory segment `shmid` to the calling process.
 *
 * shmat() is called with a NULL address and no flags, so the kernel picks
 * the mapping address; repeated calls can return different addresses for
 * the same segment.
 *
 * Returns the mapped address, or NULL when the attach fails.
 */
void* attachSharedMem(int shmid)
{
  void* addr = shmat(shmid, NULL, 0);

  /*
   * shmat() signals failure with (void *)-1.  Comparing against that value
   * directly is correct for every pointer width, unlike a comparison with
   * 0xffffffff after truncating the pointer to unsigned int.
   */
  if (addr == (void *)-1 || addr == NULL)
  {
    printf("shmat failed\n");
    return NULL;
  }

  /* "%x" with a pointer argument is undefined; print it as unsigned long. */
  printf("attach shared mem:%lx\n", (unsigned long)addr);
  return addr;
}

/*
 * Look up the test's shared memory segment, creating it if it does not
 * exist yet.
 *
 * The segment is identified by the fixed key 1000, is requested with a size
 * of 60000 bytes and is created with mode 0666, so every process that calls
 * this function with the segment present receives the same id.
 *
 * Returns the segment id, or -1 when shmget() fails.
 */
int createSharedMem()
{
  const key_t key    = 1000;               /* key to be passed to shmget() */
  const int   size   = 60000;              /* size to be passed to shmget() */
  const int   shmflg = IPC_CREAT | 0666;   /* shmflg to be passed to shmget() */
  int         shmid  = shmget(key, size, shmflg);

  if (shmid == -1)
  {
    /*
     * Leave the id at -1.  0 is a valid segment id, so reporting failure
     * as 0 could make the caller attach to an unrelated segment.
     */
    printf("shmget failed\n");
  }

  printf("Shared memory Id:%d\n",shmid);

  return shmid;
}


/*
 * Obtain the id of the test segment via createSharedMem() and map it into
 * the calling process via attachSharedMem().
 *
 * Returns whatever attachSharedMem() returns for that id.
 */
void* setupSharedMem()
{
  return attachSharedMem(createSharedMem());
}

/*
 * Entry point of a read thread.
 *
 * `t` carries the shared memory id, cast to a pointer by the creator.  The
 * thread makes its own attachment of the segment, zeroes in_msg and out_msg,
 * and then for 60 iterations adds 1000 to in_msg, sleeps one second and
 * prints the process id, both counters and its mapping address.
 *
 * No locking or atomic operations are used on the shared counters, so
 * concurrent updaters can overwrite each other's values.
 *
 * Does not return; the thread ends through pthread_exit(NULL).
 */
void *readThread(void *t)
{
  /*
   * Go through long: converting a pointer straight to int is not portable
   * where pointers are wider than int.
   */
  int shmid = (int)(long)t;
  struct Entry *entry = attachSharedMem(shmid);

  if (entry != NULL)
  {
    /* Pointers are printed as unsigned long; "%x" with a pointer is undefined. */
    printf("Start Read Thread addr:%lx\n", (unsigned long)entry);
    entry->in_msg  = 0;
    entry->out_msg = 0;

    for (int i = 0; i < 60; i++)
    {
      entry->in_msg += 1000;
      sleep(1);
      /* The counters are unsigned ("%u"); pid_t is cast to match "%d". */
      printf("%d Read from entry in_msg=%u, out_msg=%u, addr=%lx\n",
             (int)getpid(), entry->in_msg, entry->out_msg, (unsigned long)entry);
    }

    /* Drop this thread's private mapping of the segment. */
    shmdt(entry);
  }

  pthread_exit(NULL);
}

/*
 * Entry point of a write thread.
 *
 * `t` carries the thread index, cast to a pointer by the creator.  The
 * thread obtains and attaches the shared segment through setupSharedMem(),
 * fills in the Entry at its start, and then for 60 iterations prints the
 * process id and the current out_msg, adds 1000 to out_msg and sleeps one
 * second.
 *
 * No locking or atomic operations are used on the shared counters.
 *
 * Does not return; the thread ends through pthread_exit(NULL).
 */
void *writeThread(void *t)
{
  /*
   * Go through unsigned long: converting a pointer straight to unsigned int
   * is not portable where pointers are wider than int.
   */
  unsigned int threadId = (unsigned int)(unsigned long)t;
  struct Entry *entry   = setupSharedMem();

  if (entry != NULL)
  {
    /* "%u" matches the unsigned id; the pointer is printed as unsigned long. */
    printf("Start Write Thread %u, addr:%lx\n", threadId, (unsigned long)entry);
    strcpy(entry->ident_name,"this is a test entry");
    entry->in_msg  = 0;
    entry->out_msg = 0;
    entry->rxQueueLength = 20000;
    entry->pid     = threadId;
    entry->tipc_nr = 1000;

    for (int i = 0; i < 60; i++)
    {
      /* out_msg is unsigned ("%u"); pid_t is cast to match "%d". */
      printf("%d: write new mesg: %u\n", (int)getpid(), entry->out_msg);
      entry->out_msg += 1000;
      sleep(1);
    }

    /* Drop this thread's mapping of the segment. */
    shmdt(entry);
  }

  pthread_exit(NULL);
}

/*
 * Usage: shmtest2 [read|write] [number of threads]
 *
 * Starts the requested number of threads and waits for all of them.
 *   write: each thread runs writeThread() and receives its thread index.
 *   read:  each thread runs readThread() and receives the shared memory id
 *          returned by createSharedMem().
 *
 * Returns 0 on success, 1 on bad arguments and -1 when a thread cannot be
 * created.
 */
int main(int argc, char* argv[])
{
    enum { MODE_WRITE = 1, MODE_READ = 2, MAX_THREADS = 1024 };

    //check the arguments
    if (argc != 3)
    {
       printf("Arguments are [read|write] [number of threads]\n");
       return 1;
    }

    unsigned int mode = 0;

    if (strcmp(argv[1],"write") == 0)
    {
      printf("Write to in_msg\n");
      mode = MODE_WRITE;
    }
    else if (strcmp(argv[1],"read") == 0)
    {
      printf("Read from in_msg\n");
      mode = MODE_READ;
    }
    else
    {
      /* An unknown mode used to fall through and silently run as "read". */
      printf("Arguments are [read|write] [number of threads]\n");
      return 1;
    }

    /*
     * Parse and bound the thread count: it sizes the array below, and a
     * zero-length or huge variable-length array is not safe.
     */
    unsigned int nbrOfThreads = 0;
    if (sscanf(argv[2], "%u", &nbrOfThreads) != 1 ||
        nbrOfThreads == 0 || nbrOfThreads > (unsigned int)MAX_THREADS)
    {
       printf("Number of threads must be between 1 and %d\n", (int)MAX_THREADS);
       return 1;
    }

    pthread_t threads[nbrOfThreads];
    pthread_attr_t attr;

    /* Create the threads joinable so that they can be waited for below */
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

    unsigned int t;
    int rc;
    for(t=0; t<nbrOfThreads; t++)
    {
      printf("Creating thread %u\n", t);
      if (mode == MODE_WRITE)
      {
        /* Widen to pointer size before converting the index to void *. */
        rc = pthread_create(&threads[t], &attr, writeThread, (void *)(unsigned long)t);
      }
      else
      {
        int shmid = createSharedMem();
        rc = pthread_create(&threads[t], &attr, readThread, (void *)(long)shmid);
      }

      if (rc)
      {
        printf("ERROR; return code from pthread_create() is %d\n", rc);
        return -1;
      }
    }

    void* status;
    pthread_attr_destroy(&attr);
    for(t=0; t<nbrOfThreads; t++)
    {
      pthread_join(threads[t], &status);
    }

    printf("All %s threads finished, exit\n",mode == MODE_WRITE ? "write":"read");

    return 0;
}

-bash-3.2# ./shtest2.sh
Run shmtest2
Write to in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Write Thread 0, addr:40961000
411: write new mesg: 0
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Creating thread 1
Shared memory Id:0
attach shared mem:41170000
Start Read Thread addr:41170000
Creating thread 2
Shared memory Id:0
attach shared mem:4197f000
Start Read Thread addr:4197f000
Creating thread 3
Shared memory Id:0
attach shared mem:4218e000
Start Read Thread addr:4218e000
-bash-3.2# 411: write new mesg: 0
413 Read from entry in_msg=1000, out_msg=0, addr=40961000
413 Read from entry in_msg=2000, out_msg=0, addr=41170000
413 Read from entry in_msg=3000, out_msg=0, addr=4197f000
413 Read from entry in_msg=4000, out_msg=0, addr=4218e000
411: write new mesg: 1000
413 Read from entry in_msg=1000, out_msg=1000, addr=40961000
413 Read from entry in_msg=2000, out_msg=1000, addr=41170000
413 Read from entry in_msg=3000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=4000, out_msg=1000, addr=4218e000
411: write new mesg: 2000
413 Read from entry in_msg=5000, out_msg=1000, addr=40961000
413 Read from entry in_msg=6000, out_msg=1000, addr=41170000
413 Read from entry in_msg=7000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=8000, out_msg=1000, addr=4218e000
411: write new mesg: 3000
413 Read from entry in_msg=9000, out_msg=1000, addr=40961000
413 Read from entry in_msg=10000, out_msg=1000, addr=41170000
413 Read from entry in_msg=11000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=12000, out_msg=1000, addr=4218e000
411: write new mesg: 4000
413 Read from entry in_msg=13000, out_msg=1000, addr=40961000
413 Read from entry in_msg=14000, out_msg=1000, addr=41170000
413 Read from entry in_msg=15000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=16000, out_msg=1000, addr=4218e000
411: write new mesg: 5000
413 Read from entry in_msg=17000, out_msg=1000, addr=40961000
413 Read from entry in_msg=18000, out_msg=1000, addr=41170000
413 Read from entry in_msg=19000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=20000, out_msg=1000, addr=4218e000
411: write new mesg: 6000
413 Read from entry in_msg=21000, out_msg=1000, addr=40961000
413 Read from entry in_msg=22000, out_msg=1000, addr=41170000
413 Read from entry in_msg=23000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=24000, out_msg=1000, addr=4218e000
411: write new mesg: 7000
413 Read from entry in_msg=25000, out_msg=1000, addr=40961000
413 Read from entry in_msg=26000, out_msg=1000, addr=41170000
413 Read from entry in_msg=27000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=28000, out_msg=1000, addr=4218e000
411: write new mesg: 8000
413 Read from entry in_msg=29000, out_msg=1000, addr=40961000
413 Read from entry in_msg=30000, out_msg=1000, addr=41170000
413 Read from entry in_msg=31000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=32000, out_msg=1000, addr=4218e000
411: write new mesg: 9000
413 Read from entry in_msg=33000, out_msg=1000, addr=40961000
413 Read from entry in_msg=34000, out_msg=1000, addr=41170000
413 Read from entry in_msg=35000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=36000, out_msg=1000, addr=4218e000
411: write new mesg: 10000
413 Read from entry in_msg=37000, out_msg=1000, addr=40961000
413 Read from entry in_msg=38000, out_msg=1000, addr=41170000
413 Read from entry in_msg=39000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=40000, out_msg=1000, addr=4218e000
411: write new mesg: 11000
413 Read from entry in_msg=41000, out_msg=1000, addr=40961000
413 Read from entry in_msg=42000, out_msg=1000, addr=41170000
413 Read from entry in_msg=43000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=44000, out_msg=1000, addr=4218e000
411: write new mesg: 12000
413 Read from entry in_msg=45000, out_msg=1000, addr=40961000
413 Read from entry in_msg=46000, out_msg=1000, addr=41170000
413 Read from entry in_msg=47000, out_msg=1000, addr=4197f000
413 Read from entry in_msg=48000, out_msg=1000, addr=4218e000
411: write new mesg: 13000

-bash-3.2# cat shtest2.sh
#!/bin/sh
echo "Run shmtest2"
./shmtest2 write 1 &
./shmtest2 read 1 &
./shmtest2 read 1 &
./shmtest2 read 1 &
./shmtest2 read 1 &

-bash-3.2# ./shtest2.sh
Run shmtest2
Write to in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Write Thread 0, addr:40961000
423: write new mesg: 0
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
-bash-3.2# 423: write new mesg: 0
425 Read from entry in_msg=1000, out_msg=1000, addr=40961000
427 Read from entry in_msg=2000, out_msg=1000, addr=40961000
429 Read from entry in_msg=3000, out_msg=1000, addr=40961000
431 Read from entry in_msg=4000, out_msg=1000, addr=40961000
423: write new mesg: 1000
425 Read from entry in_msg=5000, out_msg=2000, addr=40961000
427 Read from entry in_msg=6000, out_msg=2000, addr=40961000
429 Read from entry in_msg=7000, out_msg=2000, addr=40961000
431 Read from entry in_msg=8000, out_msg=2000, addr=40961000
423: write new mesg: 2000
425 Read from entry in_msg=9000, out_msg=3000, addr=40961000
427 Read from entry in_msg=10000, out_msg=3000, addr=40961000
429 Read from entry in_msg=11000, out_msg=3000, addr=40961000
431 Read from entry in_msg=12000, out_msg=3000, addr=40961000
423: write new mesg: 3000
425 Read from entry in_msg=13000, out_msg=4000, addr=40961000
427 Read from entry in_msg=14000, out_msg=4000, addr=40961000
429 Read from entry in_msg=15000, out_msg=4000, addr=40961000
431 Read from entry in_msg=16000, out_msg=4000, addr=40961000
423: write new mesg: 4000
425 Read from entry in_msg=17000, out_msg=5000, addr=40961000
427 Read from entry in_msg=18000, out_msg=5000, addr=40961000
429 Read from entry in_msg=19000, out_msg=5000, addr=40961000
431 Read from entry in_msg=20000, out_msg=5000, addr=40961000
423: write new mesg: 5000
425 Read from entry in_msg=21000, out_msg=6000, addr=40961000
427 Read from entry in_msg=22000, out_msg=6000, addr=40961000
429 Read from entry in_msg=23000, out_msg=6000, addr=40961000
431 Read from entry in_msg=24000, out_msg=6000, addr=40961000
423: write new mesg: 6000
425 Read from entry in_msg=25000, out_msg=7000, addr=40961000
427 Read from entry in_msg=26000, out_msg=7000, addr=40961000
429 Read from entry in_msg=27000, out_msg=7000, addr=40961000
431 Read from entry in_msg=28000, out_msg=7000, addr=40961000
423: write new mesg: 7000
425 Read from entry in_msg=29000, out_msg=8000, addr=40961000
427 Read from entry in_msg=30000, out_msg=8000, addr=40961000
429 Read from entry in_msg=31000, out_msg=8000, addr=40961000
431 Read from entry in_msg=32000, out_msg=8000, addr=40961000
423: write new mesg: 8000
425 Read from entry in_msg=33000, out_msg=9000, addr=40961000
427 Read from entry in_msg=34000, out_msg=9000, addr=40961000
429 Read from entry in_msg=35000, out_msg=9000, addr=40961000
431 Read from entry in_msg=36000, out_msg=9000, addr=40961000

Thanks!

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 13:42   ` Heiko Schocher
@ 2009-12-04 15:42     ` Russell King - ARM Linux
  2009-12-04 15:58       ` Heiko Schocher
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-04 15:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 04, 2009 at 02:42:12PM +0100, Heiko Schocher wrote:
> Hello Russell King,
> 
> Russell King - ARM Linux wrote:
> > On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
> >> [4] Log from Demoprogramm, not working
> > 
> > I think this is messed up - this is not from your first script but your
> > second script which starts four independent read processes.
> > 
> > I determined this because:
> > (1) the read thread addresses are mostly the same
> > (2) there are four "Read form in_msg" strings, which you only print
> > once at the start of the program.
> 
> Ups, sorry for the confusion, here 2 logs with the right values:

I don't remember whether 2.6.28 has highmem.  Can you check whether you're
using highmem please?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 15:42     ` Russell King - ARM Linux
@ 2009-12-04 15:58       ` Heiko Schocher
  2009-12-04 16:38         ` Russell King - ARM Linux
  0 siblings, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 15:58 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Russell King,

Russell King - ARM Linux wrote:
> On Fri, Dec 04, 2009 at 02:42:12PM +0100, Heiko Schocher wrote:
>> Hello Russell King,
>>
>> Russell King - ARM Linux wrote:
>>> On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
>>>> [4] Log from Demoprogramm, not working
>>> I think this is messed up - this is not from your first script but your
>>> second script which starts four independent read processes.
>>>
>>> I determined this because:
>>> (1) the read thread addresses are mostly the same
>>> (2) there are four "Read form in_msg" strings, which you only print
>>> once at the start of the program.
>> Ups, sorry for the confusion, here 2 logs with the right values:
> 
> I don't remember whether 2.6.28 has highmem.  Can you check whether you're
> using highmem please?

No, HIGHMEM is not set in the defconfig, if that is the config option you mean.

You can find the complete defconfig here:

http://git.denx.de/?p=linux-2.6-denx.git;a=blob;f=arch/arm/configs/suen3_defconfig;h=905f9bc6a36c03ab41b97a292fca5b3a0011a2a0;hb=3dc3e4dc957f216ec5cdcd2a770f5c4b0cefca4a

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 15:58       ` Heiko Schocher
@ 2009-12-04 16:38         ` Russell King - ARM Linux
  2009-12-04 16:59           ` Russell King - ARM Linux
  2009-12-04 17:53           ` Heiko Schocher
  0 siblings, 2 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-04 16:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 04, 2009 at 04:58:43PM +0100, Heiko Schocher wrote:
> Hello Russell King,
> 
> Russell King - ARM Linux wrote:
> > On Fri, Dec 04, 2009 at 02:42:12PM +0100, Heiko Schocher wrote:
> >> Hello Russell King,
> >>
> >> Russell King - ARM Linux wrote:
> >>> On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
> >>>> [4] Log from Demoprogramm, not working
> >>> I think this is messed up - this is not from your first script but your
> >>> second script which starts four independent read processes.
> >>>
> >>> I determined this because:
> >>> (1) the read thread addresses are mostly the same
> >>> (2) there are four "Read form in_msg" strings, which you only print
> >>> once at the start of the program.
> >> Ups, sorry for the confusion, here 2 logs with the right values:
> > 
> > I don't remember whether 2.6.28 has highmem.  Can you check whether you're
> > using highmem please?
> 
> No, there is no HIGHMEM in the defconfig, if you mean this config option.
> 
> You can find the complete defconfig here:
> 
> http://git.denx.de/?p=linux-2.6-denx.git;a=blob;f=arch/arm/configs/suen3_defconfig;h=905f9bc6a36c03ab41b97a292fca5b3a0011a2a0;hb=3dc3e4dc957f216ec5cdcd2a770f5c4b0cefca4a

Please apply this patch and re-run your program.  Expect a fair bit of
output - the last 50 or so lines from the kernel messages should cover
everything that's required.  Also useful would be the output from the
program giving the addresses of the shm regions.

(This patch is a little messy because it contains a bit of restructuring
for 2.6.32 bug fixing...)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 7296022..c457920 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -27,6 +27,8 @@
 
 static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
 
+#define is_debug() (strcmp(current->comm, "shmtest2") == 0)
+
 /*
  * We take the easy way out of this problem - we make the
  * PTE uncacheable.  However, we leave the write buffer on.
@@ -36,33 +38,18 @@ static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
  * Therefore those configurations which might call adjust_pte (those
  * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
-static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
+static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
+	pte_t *ptep)
 {
-	pgd_t *pgd;
-	pmd_t *pmd;
-	pte_t *pte, entry;
+	pte_t entry = *ptep;
 	int ret;
 
-	pgd = pgd_offset(vma->vm_mm, address);
-	if (pgd_none(*pgd))
-		goto no_pgd;
-	if (pgd_bad(*pgd))
-		goto bad_pgd;
-
-	pmd = pmd_offset(pgd, address);
-	if (pmd_none(*pmd))
-		goto no_pmd;
-	if (pmd_bad(*pmd))
-		goto bad_pmd;
-
-	pte = pte_offset_map(pmd, address);
-	entry = *pte;
-
 	/*
 	 * If this page is present, it's actually being shared.
 	 */
 	ret = pte_present(entry);
-
+if (is_debug()) printk("%s:%d: vma %p addr %lx pte %08lx\n",
+  current->comm, current->pid, vma, address, pte_val(entry));
 	/*
 	 * If this page isn't present, or is already setup to
 	 * fault (ie, is old), we can safely ignore any issues.
@@ -74,23 +61,36 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 				  (pfn << PAGE_SHIFT) + PAGE_SIZE);
 		pte_val(entry) &= ~L_PTE_MT_MASK;
 		pte_val(entry) |= shared_pte_mask;
-		set_pte_at(vma->vm_mm, address, pte, entry);
+		set_pte_at(vma->vm_mm, address, ptep, entry);
 		flush_tlb_page(vma, address);
+if (is_debug()) printk("%s:%d: modified %08lx\n", current->comm, current->pid, pte_val(entry));
 	}
-	pte_unmap(pte);
 	return ret;
+}
+
+static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	int ret;
 
-bad_pgd:
-	pgd_ERROR(*pgd);
-	pgd_clear(pgd);
-no_pgd:
-	return 0;
-
-bad_pmd:
-	pmd_ERROR(*pmd);
-	pmd_clear(pmd);
-no_pmd:
-	return 0;
+	pgd = pgd_offset(vma->vm_mm, address);
+	if (pgd_none_or_clear_bad(pgd))
+		return 0;
+
+	pmd = pmd_offset(pgd, address);
+	if (pmd_none_or_clear_bad(pmd))
+		return 0;
+
+	/*
+	 * This may be called while another page table is mapped.
+	 */
+	pte = pte_offset_map_nested(pmd, address);
+	ret = do_adjust_pte(vma, address, pte);
+	pte_unmap_nested(pte);
+
+	return ret;
 }
 
 static void
@@ -105,6 +105,9 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, unsigne
 
 	pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
 
+if (is_debug()) printk("%s:%d: %p %p %lx %lx %lx\n",
+ current->comm, current->pid, mapping, vma, pgoff, addr, pfn);
+
 	/*
 	 * If we have any shared mappings that are in the same mm
 	 * space, then we need to handle them specially to maintain
@@ -125,6 +128,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, unsigne
 		aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
 	}
 	flush_dcache_mmap_unlock(mapping);
+if (is_debug()) printk("%s:%d: aliases %d\n", current->comm, current->pid, aliases);
 	if (aliases)
 		adjust_pte(vma, addr);
 	else

^ permalink raw reply related	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 16:38         ` Russell King - ARM Linux
@ 2009-12-04 16:59           ` Russell King - ARM Linux
  2009-12-04 17:53           ` Heiko Schocher
  1 sibling, 0 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-04 16:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 04, 2009 at 04:38:50PM +0000, Russell King - ARM Linux wrote:
> On Fri, Dec 04, 2009 at 04:58:43PM +0100, Heiko Schocher wrote:
> > Hello Russell King,
> > 
> > Russell King - ARM Linux wrote:
> > > On Fri, Dec 04, 2009 at 02:42:12PM +0100, Heiko Schocher wrote:
> > >> Hello Russell King,
> > >>
> > >> Russell King - ARM Linux wrote:
> > >>> On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
> > >>>> [4] Log from Demoprogramm, not working
> > >>> I think this is messed up - this is not from your first script but your
> > >>> second script which starts four independent read processes.
> > >>>
> > >>> I determined this because:
> > >>> (1) the read thread addresses are mostly the same
> > >>> (2) there are four "Read form in_msg" strings, which you only print
> > >>> once at the start of the program.
> > >> Ups, sorry for the confusion, here 2 logs with the right values:
> > > 
> > > I don't remember whether 2.6.28 has highmem.  Can you check whether you're
> > > using highmem please?
> > 
> > No, there is no HIGHMEM in the defconfig, if you mean this config option.
> > 
> > You can find the complete defconfig here:
> > 
> > http://git.denx.de/?p=linux-2.6-denx.git;a=blob;f=arch/arm/configs/suen3_defconfig;h=905f9bc6a36c03ab41b97a292fca5b3a0011a2a0;hb=3dc3e4dc957f216ec5cdcd2a770f5c4b0cefca4a
> 
> Please apply this patch and re-run your program.  Expect a fair bit of
> output - the last 50 or so lines from the kernel messages should cover
> everything that's required.  Also useful would be the output from the
> program giving the addresses of the shm regions.

BTW, so that other people know: the use of highmem on VIVT caches _will_
result in buggy shared mmap[*] behaviour similar to what you're seeing; the
ARM fixup in update_mmu_cache() is rendered inoperative for the existing
mappings.

Luckily, however, when the conditions for applying the fixup are triggered,
you _will_ get a kernel oops.

Since the fix for this involves touching every other architecture (because
we need the page table pointer passed into update_mmu_cache), I will not
be forwarding this fix for 2.6.32.X - the patch is just far too big to
consider.

That all said, Heiko, since you're not using highmem, this will not be
affecting you.


* - this occurs when the same physical pages are mapped multiple times
into the same MM space.  Our work-around for this is to mark the pages
non-cacheable bufferable, or if the write buffer fails the coherency test,
non-cacheable non-bufferable.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 11:23 shared memory problem on ARM v5TE using threads Heiko Schocher
  2009-12-04 12:26 ` Joerg Wagner
  2009-12-04 13:13 ` Russell King - ARM Linux
@ 2009-12-04 17:25 ` Nicolas Pitre
  2009-12-04 17:31   ` Russell King - ARM Linux
  2009-12-04 17:47   ` Heiko Schocher
  2 siblings, 2 replies; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-04 17:25 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 4 Dec 2009, Heiko Schocher wrote:

> Hello,
> 
> I have the following shared mem problem on a ARM v5TE
> processor using Linux version 2.6.28,

Could you apply this patch and test again:

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=08e445bd6a


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 17:25 ` Nicolas Pitre
@ 2009-12-04 17:31   ` Russell King - ARM Linux
  2009-12-04 17:47   ` Heiko Schocher
  1 sibling, 0 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-04 17:31 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 04, 2009 at 12:25:55PM -0500, Nicolas Pitre wrote:
> On Fri, 4 Dec 2009, Heiko Schocher wrote:
> 
> > Hello,
> > 
> > I have the following shared mem problem on a ARM v5TE
> > processor using Linux version 2.6.28,
> 
> Could you apply this patch and test again:
> 
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=08e445bd6a

That's almost certainly the problem, thanks Nicolas.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 17:25 ` Nicolas Pitre
  2009-12-04 17:31   ` Russell King - ARM Linux
@ 2009-12-04 17:47   ` Heiko Schocher
  2009-12-04 17:56     ` Nicolas Pitre
  1 sibling, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 17:47 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Nicolas Pitre,

Nicolas Pitre wrote:
> On Fri, 4 Dec 2009, Heiko Schocher wrote:
> 
>> Hello,
>>
>> I have the following shared mem problem on a ARM v5TE
>> processor using Linux version 2.6.28,
> 
> Could you apply this patch and test again:
> 
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=08e445bd6a

Thanks for your reply. I applied this patch, but it didn't help.
Same output as without it:

-bash-3.2# cat shtest2.sh
#!/bin/sh
echo "Run shmtest2"
./shmtest2 write 1 &
./shmtest2 read 4 &

-bash-3.2# ./shtest2.sh
Run shmtest2
Write to in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Write Thread 0, addr:40961000
358: write new mesg: 0
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Creating thread 1
Shared memory Id:0
attach shared mem:41170000
Start Read Thread addr:41170000
Creating thread 2
Shared memory Id:0
attach shared mem:4197f000
Start Read Thread addr:4197f000
Creating thread 3
Shared memory Id:0
attach shared mem:4218e000
Start Read Thread addr:4218e000
-bash-3.2# 358: write new mesg: 0
360 Read from entry in_msg=1000, out_msg=0, addr=40961000
360 Read from entry in_msg=2000, out_msg=0, addr=41170000
360 Read from entry in_msg=3000, out_msg=0, addr=4197f000
360 Read from entry in_msg=4000, out_msg=0, addr=4218e000
358: write new mesg: 1000
360 Read from entry in_msg=5000, out_msg=0, addr=40961000
360 Read from entry in_msg=6000, out_msg=0, addr=41170000
360 Read from entry in_msg=7000, out_msg=0, addr=4197f000
360 Read from entry in_msg=8000, out_msg=0, addr=4218e000
358: write new mesg: 2000
360 Read from entry in_msg=9000, out_msg=0, addr=40961000
360 Read from entry in_msg=10000, out_msg=0, addr=41170000
360 Read from entry in_msg=11000, out_msg=0, addr=4197f000
360 Read from entry in_msg=12000, out_msg=0, addr=4218e000
358: write new mesg: 3000
360 Read from entry in_msg=13000, out_msg=0, addr=40961000
360 Read from entry in_msg=14000, out_msg=0, addr=41170000
360 Read from entry in_msg=15000, out_msg=0, addr=4197f000
360 Read from entry in_msg=16000, out_msg=0, addr=4218e000
358: write new mesg: 4000
360 Read from entry in_msg=17000, out_msg=0, addr=40961000
360 Read from entry in_msg=18000, out_msg=0, addr=41170000
360 Read from entry in_msg=19000, out_msg=0, addr=4197f000
360 Read from entry in_msg=20000, out_msg=0, addr=4218e000
358: write new mesg: 5000

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 16:38         ` Russell King - ARM Linux
  2009-12-04 16:59           ` Russell King - ARM Linux
@ 2009-12-04 17:53           ` Heiko Schocher
  2009-12-04 19:13             ` Russell King - ARM Linux
  1 sibling, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 17:53 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Russell King,

Russell King - ARM Linux wrote:
> On Fri, Dec 04, 2009 at 04:58:43PM +0100, Heiko Schocher wrote:
>> Hello Russell King,
>>
>> Russell King - ARM Linux wrote:
>>> On Fri, Dec 04, 2009 at 02:42:12PM +0100, Heiko Schocher wrote:
>>>> Hello Russell King,
>>>>
>>>> Russell King - ARM Linux wrote:
>>>>> On Fri, Dec 04, 2009 at 12:23:45PM +0100, Heiko Schocher wrote:
>>>>>> [4] Log from Demoprogramm, not working
>>>>> I think this is messed up - this is not from your first script but your
>>>>> second script which starts four independent read processes.
>>>>>
>>>>> I determined this because:
>>>>> (1) the read thread addresses are mostly the same
>>>>> (2) there are four "Read form in_msg" strings, which you only print
>>>>> once at the start of the program.
>>>> Ups, sorry for the confusion, here 2 logs with the right values:
>>> I don't remember whether 2.6.28 has highmem.  Can you check whether you're
>>> using highmem please?
>> No, there is no HIGHMEM in the defconfig, if you mean this config option.
>>
>> You can find the complete defconfig here:
>>
>> http://git.denx.de/?p=linux-2.6-denx.git;a=blob;f=arch/arm/configs/suen3_defconfig;h=905f9bc6a36c03ab41b97a292fca5b3a0011a2a0;hb=3dc3e4dc957f216ec5cdcd2a770f5c4b0cefca4a
> 
> Please apply this patch and re-run your program.  Expect a fair bit of
> output - the last 50 or so lines from the kernel messages should cover
> everything that's required.  Also useful would be the output from the
> program giving the addresses of the shm regions.

Thanks for your patch. I tried it, and with my test program it gives the
following output:

-bash-3.2# ./shtest2.sh
Run shmtest2
shmtest2:348: c6c0bea8 c723bf98 0 40000000 7404
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 0 40000000 7404
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 1 40001000 7403
shmtest2:349: aliases 0
-bash-3.2# shmtest2:348: c6c0bea8 c723bf98 1 40001000 7403
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c721cee8 1b 40023000 7410
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 16 40016000 740e
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c7217700 1b 40023000 7410
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 8 40008000 7422
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 13 40013000 7425
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 16 40016000 740e
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 14 40014000 7424
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 15 40015000 7409
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 8 40008000 7422
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 2 40002000 741b
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 3 40003000 741a
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 13 40013000 7425
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 9 40009000 7421
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 a 4000a000 7420
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 14 40014000 7424
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 19 40019000 7412
shmtest2:348: aliases 0
shmtest2:348: c6ccff08 c7217338 0 8000 64d0
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 7 40007000 741c
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 15 40015000 7409
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 2 40002000 741b
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 1a 4001a000 7411
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 3 40003000 741a
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 17 40017000 7414
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 e 4000e000 7423
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 9 40009000 7421
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 b 4000b000 741f
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 c 4000c000 741e
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 a 4000a000 7420
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 d 4000d000 741d
shmtest2:348: aliases 0
shmtest2:348: c6c0bea8 c723bf98 5 40005000 7418
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 19 40019000 7412
shmtest2:349: aliases 0
shmtest2:349: c6ccff08 c723bee8 0 8000 64d0
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 7 40007000 741c
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 6 40006000 7417
shmtest2:348: aliases 0
shmtest2:348: c6c94d08 c722c498 12 4003f000 69bd
shmtest2:348: aliases 0
shmtest2:348: c6c94d08 c722ce38 0 40025000 6991
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 1a 4001a000 7411
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 3 40028000 69cd
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 17 40017000 7414
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 10 40010000 7428
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226230 111 4015c000 7435
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 e 4000e000 7423
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 b 4000b000 741f
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 0 40043000 742f
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 c 4000c000 741e
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 d 4000d000 741d
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 10 40053000 7446
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 5 40005000 7418
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 6 40006000 7417
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 4 40004000 7419
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c128 12 4003f000 69bd
shmtest2:349: aliases 0
shmtest2:349: c6c94d08 c722c0d0 0 40025000 6991
shmtest2:349: aliases 0
shmtest2:348: c6c0bea8 c723bf98 f 4000f000 7429
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c0d0 3 40028000 69cd
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 10 40010000 7428
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c230 111 4015c000 7435
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 11 40054000 744c
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 0 40043000 742f
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 10 40053000 7446
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226230 110 4015b000 7436
shmtest2:348: aliases 0
shmtest2:349: c6c0bea8 c721c968 4 40004000 7419
shmtest2:349: aliases 0
shmtest2:349: c6c0bea8 c721c968 f 4000f000 7429
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 12 40055000 744b
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 11 40054000 744c
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c230 110 4015b000 7436
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 13 40056000 744a
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 12 40055000 744b
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 13 40056000 744a
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 14 40057000 7449
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 14 40057000 7449
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 5 40048000 7450
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 1 40044000 742e
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 5 40048000 7450
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 8 4004b000 7459
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 f 40052000 7440
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 1 40044000 742e
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 2 40045000 7453
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 3 40046000 7452
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 8 4004b000 7459
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 b 4004e000 7444
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 4 40047000 7451
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 f 40052000 7440
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 a 4004d000 7445
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 c 4004f000 7443
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 2 40045000 7453
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 6 40049000 7454
shmtest2:348: aliases 0
shmtest2:348: c6c94d08 c722ce38 1 40026000 69b0
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 3 40046000 7452
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 2 40027000 69ce
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 9 4004c000 743f
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 b 4004e000 7444
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 7 4004a000 745a
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 d 40050000 7442
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 4 40047000 7451
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 e 40051000 7441
shmtest2:348: aliases 0
shmtest2:348: c6c94d08 c722ce38 4 40029000 69cc
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 a 4004d000 7445
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 10 40035000 69bf
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 6d 400b0000 746a
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 c 4004f000 7443
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 e 40033000 69ba
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 6 40049000 7454
shmtest2:349: aliases 0
shmtest2:349: c6c94d08 c722c0d0 1 40026000 69b0
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 6e 400b1000 7470
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c0d0 2 40027000 69ce
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 9 4004c000 743f
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 b5 400f8000 7458
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 7 4004a000 745a
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 d 40050000 7442
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 8d 400d0000 749e
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 e 40051000 7441
shmtest2:349: aliases 0
shmtest2:349: c6c94d08 c722c0d0 4 40029000 69cc
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 b6 400f9000 7457
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c0d0 10 40035000 69bf
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 6d 400b0000 746a
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 f1 40134000 7508
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c0d0 e 40033000 69ba
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 6e 400b1000 7470
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 c9 4010c000 7511
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 b5 400f8000 7458
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 8d 400d0000 749e
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 bc 400ff000 745d
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 b6 400f9000 7457
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 f1 40134000 7508
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 6c 400af000 746b
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 c9 4010c000 7511
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 bc 400ff000 745d
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 2c 4006f000 747e
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 6c 400af000 746b
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 2c 4006f000 747e
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 29 4006c000 747a
shmtest2:349: aliases 0
shmtest2:349: c6ccff08 c723bee8 1 9000 64cf
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 29 4006c000 747a
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 57 4009a000 74a5
shmtest2:349: aliases 0
shmtest2:348: c6ccff08 c7217338 1 9000 64cf
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 58 4009b000 74a4
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 61 400a4000 74af
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 57 4009a000 74a5
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 58 4009b000 74a4
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 62 400a5000 74b5
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 61 400a4000 74af
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 55 40098000 74cb
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 62 400a5000 74b5
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 ad 400f0000 7492
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 55 40098000 74cb
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 af 400f2000 7490
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 ad 400f0000 7492
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 b9 400fc000 7460
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 af 400f2000 7490
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 63 400a6000 74b4
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 b9 400fc000 7460
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 f2 40135000 7507
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 ae 400f1000 7491
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 63 400a6000 74b4
shmtest2:348: aliases 0
Read from in_msgshmtest2:349: c6c0b848 c722c180 2b 4006e000 7478

shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 2d 40070000 7484
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 f2 40135000 7507
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 2e 40071000 7483
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 f5 40138000 7504
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 ae 400f1000 7491
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 101 40144000 74f0
shmtest2:349: aliases 0
shmtest2:349: c6c94d08 c722c0d0 6 4002b000 69ca
shmtest2:349: aliases 0
Write to in_msgshmtest2:348: c6c0b848 c7226e38 2b 4006e000 7478

shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 43 40086000 751e
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 2d 40070000 7484
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 2e 40071000 7483
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 3b 4007e000 74b8
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 f5 40138000 7504
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 101 40144000 74f0
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 3c 4007f000 74b7
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 6 4002b000 69ca
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 43 40086000 751e
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 54 40097000 74c5
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 3b 4007e000 74b8
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c0d0 c 40031000 69ae
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 102 40145000 74ef
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 3c 4007f000 74b7
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 39 4007c000 74ba
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 3d 40080000 74bd
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 54 40097000 74c5
shmtest2:348: aliases 0
Creating thread shmtest2:349: c6c0b848 c722c180 3f 40082000 74c2
0
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 3e 40081000 74c3
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 c 40031000 69ae
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 be 40101000 7462
shmtest2:349: aliases 0
Shared memory Idshmtest2:349: c6c94d08 c722c0d0 5 4002a000 69cb
:0
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 102 40145000 74ef
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 6a 400ad000 746d
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 39 4007c000 74ba
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 3d 40080000 74bd
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 69 400ac000 746e
shmtest2:349: aliases 0
Creating thread shmtest2:348: c6c0b848 c7226e38 3f 40082000 74c2
0
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 65 400a8000 74b2
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 66 400a9000 7463
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 3e 40081000 74c3
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 f0 40133000 7509
shmtest2:349: aliases 0
shmtest2:349: c6c94d08 c722c0d0 7 4002c000 69c9
shmtest2:349: aliases 0
shmtest2:348: c6c94d08 c722ce38 5 4002a000 69cb
shmtest2:348: aliases 0
shmtest2:349: c6c94d08 c722c0d0 9 4002e000 69d5
shmtest2:349: aliases 0
shmtest2:349: c6c94d08 c722c0d0 8 4002d000 69cf
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 6a 400ad000 746d
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 68 400ab000 746f
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 67 400aa000 7469
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 69 400ac000 746e
shmtest2:348: aliases 0
shmtest2:349: c6c0b848 c722c180 ba 400fd000 745f
shmtest2:349: aliases 0
shmtest2:349: c6c0b848 c722c180 bd 40100000 745c
shmtest2:349: aliases 0
shmtest2:348: c6c0b848 c7226e38 65 400a8000 74b2
shmtest2:348: aliases 0
attach shared meshmtest2:350: c6c0b848 c722c180 c8 4010b000 750b
m:40961000
shmtest2:350: aliases 0
shmtest2:348: c6c0b848 c7226e38 66 400a9000 7463
shmtest2:348: aliases 0
shmtest2:348: c6c0b848 c7226e38 f0 40133000 7509
shmtest2:348: aliases 0
Start Read Threashmtest2:350: c7220140 c7217968 0 40961000 64a1
shmtest2:350: aliases 0

Creating thread shmtest2:348: c6c94d08 c722ce38 7 4002c000 69c9
1
Shared memoryshmtest2:348: aliases 0
 Id:0
shmtest2:350: c6c0b848 c722c180 8b 400ce000 74a0
shmtest2:350: aliases 0
shmtest2:348: c6c94d08 c722ce38 9 4002e000 69d5
shmtest2:348: aliases 0
shmtest2:348: c6c94d08 c722ce38 8 4002d000 69cf
shmtest2:348: aliases 0
attach shared meshmtest2:351: c7220140 c72179c0 0 41170000 64a1
m:41170000
Starshmtest2:351: vma c7217968 addr 40961000 pte 064a13cf
shmtest2:351: modified 064a13c7
t Read Thread adshmtest2:351: aliases 1
dr:41170000
shmtest2:351: vma c72179c0 addr 41170000 pte 064a13cf
shmtest2:351: modified 064a13c7
shmtest2:348: c6c0b848 c7226e38 68 400ab000 746f
shmtest2:348: aliases 0
Creating thread shmtest2:352: c7220140 c7217548 0 4197f000 64a1
2
Shared memoryshmtest2:352: vma c7217968 addr 40961000 pte 064a13c7
shmtest2:352: vma c72179c0 addr 41170000 pte 064a13c7
 Id:0
attach shshmtest2:352: aliases 2
ared mem:4197f00shmtest2:352: vma c7217548 addr 4197f000 pte 064a13cf
0
Start Read Thshmtest2:352: modified 064a13c7
read addr:4197f0shmtest2:348: c6c0b848 c7226e38 67 400aa000 7469
00
shmtest2:348: aliases 0
Creating thread shmtest2:348: c6c0b848 c7226e38 ba 400fd000 745f
3
Shared memoryshmtest2:348: aliases 0
 Id:0
shmtest2:348: c6c0b848 c7226e38 bd 40100000 745c
shmtest2:348: aliases 0
shmtest2:353: c6c0b848 c7226e38 be 40101000 7462
shmtest2:353: aliases 0
attach shared meshmtest2:354: c7220140 c7217650 0 4218e000 64a1
m:4218e000
Starshmtest2:354: vma c7217968 addr 40961000 pte 064a13c7
shmtest2:354: vma c72179c0 addr 41170000 pte 064a13c7
shmtest2:354: vma c7217548 addr 4197f000 pte 064a13c7
t Read Thread adshmtest2:354: aliases 3
dr:4218e000
shmtest2:354: vma c7217650 addr 4218e000 pte 064a13cf
shmtest2:354: modified 064a13c7
Shared memory Idshmtest2:353: c6c0b848 c7226e38 c8 4010b000 750b
:0
shmtest2:353: aliases 0
attach shared meshmtest2:353: c7220140 c7217288 0 40961000 64a1
m:40961000
Starshmtest2:353: aliases 0
t Write Thread 0shmtest2:353: c6c0b848 c7226e38 8c 400cf000 749f
shmtest2:353: aliases 0

348: write new mshmtest2:353: c6c0b848 c7226e38 8b 400ce000 74a0
esg: 0
shmtest2:353: aliases 0
shmtest2:350: c6c0b848 c722c180 8c 400cf000 749f
shmtest2:350: aliases 0
349 Read from entry in_msg=0, out_msg=1000, addr=40961000
349 Read from entry in_msg=1000, out_msg=1000, addr=41170000
349 Read from entry in_msg=2000, out_msg=1000, addr=4197f000
349 Read from entry in_msg=3000, out_msg=1000, addr=4218e000
348: write new mesg: 1000
349 Read from entry in_msg=4000, out_msg=1000, addr=40961000
349 Read from entry in_msg=5000, out_msg=1000, addr=41170000
349 Read from entry in_msg=6000, out_msg=1000, addr=4197f000
349 Read from entry in_msg=7000, out_msg=1000, addr=4218e000
348: write new mesg: 2000
349 Read from entry in_msg=8000, out_msg=1000, addr=40961000
349 Read from entry in_msg=9000, out_msg=1000, addr=41170000
349 Read from entry in_msg=10000, out_msg=1000, addr=4197f000
349 Read from entry in_msg=11000, out_msg=1000, addr=4218e000
348: write new mesg: 3000
349 Read from entry in_msg=12000, out_msg=1000, addr=40961000
349 Read from entry in_msg=13000, out_msg=1000, addr=41170000
349 Read from entry in_msg=14000, out_msg=1000, addr=4197f000
349 Read from entry in_msg=15000, out_msg=1000, addr=4218e000
348: write new mesg: 4000

If you want a log without the printfs from the userspace program, I can make one.

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 17:47   ` Heiko Schocher
@ 2009-12-04 17:56     ` Nicolas Pitre
  2009-12-04 19:33       ` Heiko Schocher
  0 siblings, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-04 17:56 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 4 Dec 2009, Heiko Schocher wrote:

> Hello Nicolas Pitre,
> 
> Nicolas Pitre wrote:
> > On Fri, 4 Dec 2009, Heiko Schocher wrote:
> > 
> >> Hello,
> >>
> >> I have the following shared mem problem on a ARM v5TE
> >> processor using Linux version 2.6.28,
> > 
> > Could you apply this patch and test again:
> > 
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=08e445bd6a
> 
> Thanks for your reply. I applied this patch, but it didn;t help.

OK. Then could you test with Linux version 2.6.32, just to be sure the 
issue still exists there?


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 17:53           ` Heiko Schocher
@ 2009-12-04 19:13             ` Russell King - ARM Linux
  2009-12-04 19:35               ` Heiko Schocher
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-04 19:13 UTC (permalink / raw)
  To: linux-arm-kernel

Well, the kernel messages of relevance from the dump are:

shmtest2:351: c7220140 c72179c0 0 41170000 64a1
shmtest2:351: vma c7217968 addr 40961000 pte 064a13cf
shmtest2:351: modified 064a13c7
shmtest2:351: aliases 1
shmtest2:351: vma c72179c0 addr 41170000 pte 064a13cf
shmtest2:351: modified 064a13c7
shmtest2:352: c7220140 c7217548 0 4197f000 64a1
shmtest2:352: vma c7217968 addr 40961000 pte 064a13c7
shmtest2:352: vma c72179c0 addr 41170000 pte 064a13c7
shmtest2:352: aliases 2
shmtest2:352: vma c7217548 addr 4197f000 pte 064a13cf
shmtest2:352: modified 064a13c7
shmtest2:354: c7220140 c7217650 0 4218e000 64a1
shmtest2:354: vma c7217968 addr 40961000 pte 064a13c7
shmtest2:354: vma c72179c0 addr 41170000 pte 064a13c7
shmtest2:354: vma c7217548 addr 4197f000 pte 064a13c7
shmtest2:354: aliases 3
shmtest2:354: vma c7217650 addr 4218e000 pte 064a13cf
shmtest2:354: modified 064a13c7

which shows that the PTEs are having their 'cacheable' bit correctly
cleared, thus making changes to the page in RAM immediately visible to
the user program.

Could it be that the CPU you're using doesn't support the C=0 B=1
PTE encoding properly, and still caches data in such a region?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 17:56     ` Nicolas Pitre
@ 2009-12-04 19:33       ` Heiko Schocher
  0 siblings, 0 replies; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 19:33 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Nicolas Pitre,

Nicolas Pitre wrote:
> On Fri, 4 Dec 2009, Heiko Schocher wrote:
> 
>> Hello Nicolas Pitre,
>>
>> Nicolas Pitre wrote:
>>> On Fri, 4 Dec 2009, Heiko Schocher wrote:
>>>
>>>> Hello,
>>>>
>>>> I have the following shared mem problem on a ARM v5TE
>>>> processor using Linux version 2.6.28,
>>> Could you apply this patch and test again:
>>>
>>> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=08e445bd6a
>> Thanks for your reply. I applied this patch, but it didn;t help.
> 
> OK. Then could you test with Linux version 2.6.32, just to be sure the 
> issue still exists there?

I will try to start a 2.6.32 kernel, but I may need some time for it.

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 19:13             ` Russell King - ARM Linux
@ 2009-12-04 19:35               ` Heiko Schocher
  2009-12-06 13:53                 ` Ronen Shitrit
  0 siblings, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-04 19:35 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Russell King,

Russell King - ARM Linux wrote:
> Well, the kernel messages of relevance from the dump are:
> 
> shmtest2:351: c7220140 c72179c0 0 41170000 64a1
> shmtest2:351: vma c7217968 addr 40961000 pte 064a13cf
> shmtest2:351: modified 064a13c7
> shmtest2:351: aliases 1
> shmtest2:351: vma c72179c0 addr 41170000 pte 064a13cf
> shmtest2:351: modified 064a13c7
> shmtest2:352: c7220140 c7217548 0 4197f000 64a1
> shmtest2:352: vma c7217968 addr 40961000 pte 064a13c7
> shmtest2:352: vma c72179c0 addr 41170000 pte 064a13c7
> shmtest2:352: aliases 2
> shmtest2:352: vma c7217548 addr 4197f000 pte 064a13cf
> shmtest2:352: modified 064a13c7
> shmtest2:354: c7220140 c7217650 0 4218e000 64a1
> shmtest2:354: vma c7217968 addr 40961000 pte 064a13c7
> shmtest2:354: vma c72179c0 addr 41170000 pte 064a13c7
> shmtest2:354: vma c7217548 addr 4197f000 pte 064a13c7
> shmtest2:354: aliases 3
> shmtest2:354: vma c7217650 addr 4218e000 pte 064a13cf
> shmtest2:354: modified 064a13c7
> 
> which shows that the PTEs are having their 'cacheable' bit correctly
> cleared, thus making changes to the page in RAM immediately visible to
> the user program.

Ok, sounds good :-)

> Could it be that the CPU you're using doesn't support the C=0 B=1
> PTE encoding properly, and still caches data in such a region?

Hmm.. I'll try to find this out, thanks!

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-04 19:35               ` Heiko Schocher
@ 2009-12-06 13:53                 ` Ronen Shitrit
  2009-12-06 14:16                   ` Russell King - ARM Linux
  0 siblings, 1 reply; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-06 13:53 UTC (permalink / raw)
  To: linux-arm-kernel

Hi

There aren't any known issues related to working with C=0 B=1 on Kirkwood.
Can you please run this test on some other ARMv5 platform and confirm it is working?

Thanks

-----Original Message-----
From: linux-arm-kernel-bounces@lists.infradead.org [mailto:linux-arm-kernel-bounces at lists.infradead.org] On Behalf Of Heiko Schocher
Sent: Friday, December 04, 2009 9:35 PM
To: Russell King - ARM Linux
Cc: linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

Hello Russell King,

Russell King - ARM Linux wrote:
> Well, the kernel messages of relevance from the dump are:
> 
> shmtest2:351: c7220140 c72179c0 0 41170000 64a1
> shmtest2:351: vma c7217968 addr 40961000 pte 064a13cf
> shmtest2:351: modified 064a13c7
> shmtest2:351: aliases 1
> shmtest2:351: vma c72179c0 addr 41170000 pte 064a13cf
> shmtest2:351: modified 064a13c7
> shmtest2:352: c7220140 c7217548 0 4197f000 64a1
> shmtest2:352: vma c7217968 addr 40961000 pte 064a13c7
> shmtest2:352: vma c72179c0 addr 41170000 pte 064a13c7
> shmtest2:352: aliases 2
> shmtest2:352: vma c7217548 addr 4197f000 pte 064a13cf
> shmtest2:352: modified 064a13c7
> shmtest2:354: c7220140 c7217650 0 4218e000 64a1
> shmtest2:354: vma c7217968 addr 40961000 pte 064a13c7
> shmtest2:354: vma c72179c0 addr 41170000 pte 064a13c7
> shmtest2:354: vma c7217548 addr 4197f000 pte 064a13c7
> shmtest2:354: aliases 3
> shmtest2:354: vma c7217650 addr 4218e000 pte 064a13cf
> shmtest2:354: modified 064a13c7
> 
> which shows that the PTEs are having their 'cacheable' bit correctly
> cleared, thus making changes to the page in RAM immediately visible to
> the user program.

Ok, sounds good :-)

> Could it be that the CPU you're using doesn't support the C=0 B=1
> PTE encoding properly, and still caches data in such a region?

Hmm.. I try to find this out, thanks!

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel at lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-06 13:53                 ` Ronen Shitrit
@ 2009-12-06 14:16                   ` Russell King - ARM Linux
  2009-12-07  7:54                     ` Ronen Shitrit
                                       ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-06 14:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun, Dec 06, 2009 at 03:53:48PM +0200, Ronen Shitrit wrote:
> There isn't any known issues related to working with C=0 B=1 on Kirkwood.
> Can you pls run this test on some other ARMv5 platform and approve it is
> working.

I ran it on an ARM926EJ-S, which is ARMv5 and worked fine.

If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
what's going on, and I don't have any suggestion on what to try next.

The log shows that the kernel is doing the right thing: when we detect
two mappings for the same page in the same MM space, we clean and
invalidate any existing cacheable mappings visible in the MM space
(both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
This makes the area non-cacheable.

That means updates to the area should be visible via any mapping in the
same MM space as soon as they're written to any other mapping.

> -----Original Message-----
> From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-kernel-bounces at lists.infradead.org] On Behalf Of Heiko Schocher
> Sent: Friday, December 04, 2009 9:35 PM
> To: Russell King - ARM Linux
> Cc: linux-arm-kernel at lists.infradead.org
> Subject: Re: shared memory problem on ARM v5TE using threads
> 
> Hello Russell King,
> 
> Russell King - ARM Linux wrote:
> > Well, the kernel messages of relevance from the dump are:
> > 
> > shmtest2:351: c7220140 c72179c0 0 41170000 64a1
> > shmtest2:351: vma c7217968 addr 40961000 pte 064a13cf
> > shmtest2:351: modified 064a13c7
> > shmtest2:351: aliases 1
> > shmtest2:351: vma c72179c0 addr 41170000 pte 064a13cf
> > shmtest2:351: modified 064a13c7

The above is the result of the second mapping of the same physical page
occurring in the current MM space: we switch the mappings at 0x40961000
and 0x41170000 virtual to C=0 B=1 mode.

> > shmtest2:352: c7220140 c7217548 0 4197f000 64a1
> > shmtest2:352: vma c7217968 addr 40961000 pte 064a13c7
> > shmtest2:352: vma c72179c0 addr 41170000 pte 064a13c7
> > shmtest2:352: aliases 2
> > shmtest2:352: vma c7217548 addr 4197f000 pte 064a13cf
> > shmtest2:352: modified 064a13c7

We detect a third mapping, and the previous two mappings are already C=0
B=1 mode, so there's no need to touch them.  The new mapping at 0x4197f000
is modified to C=0 B=1 mode.

> > shmtest2:354: c7220140 c7217650 0 4218e000 64a1
> > shmtest2:354: vma c7217968 addr 40961000 pte 064a13c7
> > shmtest2:354: vma c72179c0 addr 41170000 pte 064a13c7
> > shmtest2:354: vma c7217548 addr 4197f000 pte 064a13c7
> > shmtest2:354: aliases 3
> > shmtest2:354: vma c7217650 addr 4218e000 pte 064a13cf
> > shmtest2:354: modified 064a13c7

And here is the fourth mapping, and the previous three mappings are already
C=0 B=1 mode, so there's no need to touch them.  The new mapping at
0x4218e000 is modified to C=0 B=1 mode.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-06 14:16                   ` Russell King - ARM Linux
@ 2009-12-07  7:54                     ` Ronen Shitrit
  2009-12-07  8:33                     ` Heiko Schocher
  2009-12-07 11:31                     ` saeed bishara
  2 siblings, 0 replies; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-07  7:54 UTC (permalink / raw)
  To: linux-arm-kernel

I just ran the test on kernel 2.6.31 and reproduced the failure.
I will look into it and update...

Regards

-----Original Message-----
From: Russell King - ARM Linux [mailto:linux at arm.linux.org.uk] 
Sent: Sunday, December 06, 2009 4:16 PM
To: Ronen Shitrit
Cc: hs at denx.de; linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

On Sun, Dec 06, 2009 at 03:53:48PM +0200, Ronen Shitrit wrote:
> There isn't any known issues related to working with C=0 B=1 on Kirkwood.
> Can you pls run this test on some other ARMv5 platform and approve it is
> working.

I ran it on an ARM926EJ-S, which is ARMv5 and worked fine.

If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
what's going on, and I don't have any suggestion on what to try next.

The log shows that the kernel is doing the right thing: when we detect
two mappings for the same page in the same MM space, we clean and
invalidate any existing cacheable mappings visible in the MM space
(both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
This makes the area non-cacheable.

That means updates to the area should be visible via any mapping in the
same MM space as soon as they're written to any other mapping.

> -----Original Message-----
> From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-kernel-bounces at lists.infradead.org] On Behalf Of Heiko Schocher
> Sent: Friday, December 04, 2009 9:35 PM
> To: Russell King - ARM Linux
> Cc: linux-arm-kernel at lists.infradead.org
> Subject: Re: shared memory problem on ARM v5TE using threads
> 
> Hello Russell King,
> 
> Russell King - ARM Linux wrote:
> > Well, the kernel messages of relevance from the dump are:
> > 
> > shmtest2:351: c7220140 c72179c0 0 41170000 64a1
> > shmtest2:351: vma c7217968 addr 40961000 pte 064a13cf
> > shmtest2:351: modified 064a13c7
> > shmtest2:351: aliases 1
> > shmtest2:351: vma c72179c0 addr 41170000 pte 064a13cf
> > shmtest2:351: modified 064a13c7

The above is the result of the second mapping of the same physical page
occurring in the current MM space: we switch the mappings at 0x40961000
and 0x41170000 virtual to C=0 B=1 mode.

> > shmtest2:352: c7220140 c7217548 0 4197f000 64a1
> > shmtest2:352: vma c7217968 addr 40961000 pte 064a13c7
> > shmtest2:352: vma c72179c0 addr 41170000 pte 064a13c7
> > shmtest2:352: aliases 2
> > shmtest2:352: vma c7217548 addr 4197f000 pte 064a13cf
> > shmtest2:352: modified 064a13c7

We detect a third mapping, and the previous two mappings are already C=0
B=1 mode, so there's no need to touch them.  The new mapping at 0x4197f000
is modified to C=0 B=1 mode.

> > shmtest2:354: c7220140 c7217650 0 4218e000 64a1
> > shmtest2:354: vma c7217968 addr 40961000 pte 064a13c7
> > shmtest2:354: vma c72179c0 addr 41170000 pte 064a13c7
> > shmtest2:354: vma c7217548 addr 4197f000 pte 064a13c7
> > shmtest2:354: aliases 3
> > shmtest2:354: vma c7217650 addr 4218e000 pte 064a13cf
> > shmtest2:354: modified 064a13c7

And here is the fourth mapping, and the previous three mappings are already
C=0 B=1 mode, so there's no need to touch them.  The new mapping at
0x4218e000 is modified to C=0 B=1 mode.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-06 14:16                   ` Russell King - ARM Linux
  2009-12-07  7:54                     ` Ronen Shitrit
@ 2009-12-07  8:33                     ` Heiko Schocher
  2009-12-07 11:31                     ` saeed bishara
  2 siblings, 0 replies; 71+ messages in thread
From: Heiko Schocher @ 2009-12-07  8:33 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Russell,

Russell King - ARM Linux wrote:
> On Sun, Dec 06, 2009 at 03:53:48PM +0200, Ronen Shitrit wrote:
>> There isn't any known issues related to working with C=0 B=1 on Kirkwood.
>> Can you pls run this test on some other ARMv5 platform and approve it is
>> working.
> 
> I ran it on an ARM926EJ-S, which is ARMv5 and worked fine.
> 
> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> what's going on, and I don't have any suggestion on what to try next.

Gilles Chanteperdrix <gilles.chanteperdrix@xenomai.org> suggested
me, to try the following patch, he wrote:
> But to check whether C=0 B=1 is
> indeed the issue, you may try the following patch:
> 
> diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
> index d0d17b6..d65816e 100644
> --- a/arch/arm/mm/fault-armv.c
> +++ b/arch/arm/mm/fault-armv.c
> @@ -23,7 +23,7 @@
>  #include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
> 
> -static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
> +static unsigned long shared_pte_mask = L_PTE_MT_UNCACHED;
> 
>  /*
>   * We take the easy way out of this problem - we make the
> 
> 

I tried it with this patch, and the test program still doesn't work ...

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-06 14:16                   ` Russell King - ARM Linux
  2009-12-07  7:54                     ` Ronen Shitrit
  2009-12-07  8:33                     ` Heiko Schocher
@ 2009-12-07 11:31                     ` saeed bishara
  2009-12-07 11:42                       ` Russell King - ARM Linux
  2 siblings, 1 reply; 71+ messages in thread
From: saeed bishara @ 2009-12-07 11:31 UTC (permalink / raw)
  To: linux-arm-kernel

> I ran it on an ARM926EJ-S, which is ARMv5 and worked fine.
>
does it have L2 cache?
> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> what's going on, and I don't have any suggestion on what to try next.
>
> The log shows that the kernel is doing the right thing: when we detect
> two mappings for the same page in the same MM space, we clean and
> invalidate any existing cacheable mappings visible in the MM space
> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> This makes the area non-cacheable.
What about the PTE of the MM space of the write process? If it remains
C=1 B=1, then its data will be in the L2, and as the L2 is not
flushed on context switch, that explains this behavior.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 11:31                     ` saeed bishara
@ 2009-12-07 11:42                       ` Russell King - ARM Linux
  2009-12-07 12:16                         ` Ronen Shitrit
  2009-12-07 12:24                         ` Heiko Schocher
  0 siblings, 2 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-07 11:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
> > I ran it on an ARM926EJ-S, which is ARMv5 and worked fine.
> >
> does it have L2 cache?

No.

> > If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> > what's going on, and I don't have any suggestion on what to try next.
> >
> > The log shows that the kernel is doing the right thing: when we detect
> > two mappings for the same page in the same MM space, we clean and
> > invalidate any existing cacheable mappings visible in the MM space
> > (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> > This makes the area non-cacheable.
>
> what about the PTE of the MM space of the write process? if it remains
> C=1 B=1, then it's data will be at the L2, and as the L2 is not
> flushed on context switch, then that explains this behavior.

That's probably the issue, and it means that _all_ shared writable
mappings on your processor will be broken.

Oh dear, that really is bad news.

There are two solutions to this which I can currently think of:
1. flush the L2 cache on every context switch
2. make all shared writable mappings non-cacheable

Neither of those two options appeals.  Since it's only one set of CPUs
which are affected, we really don't want to apply any fix for this to the
generic ARM kernel code - especially when all other L2 caches are sensibly
implemented as PIPT rather than VIVT.

Can we please forget that Feroceon CPUs exist? ;)

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 11:42                       ` Russell King - ARM Linux
@ 2009-12-07 12:16                         ` Ronen Shitrit
  2009-12-07 12:27                           ` Heiko Schocher
  2009-12-07 12:24                         ` Heiko Schocher
  1 sibling, 1 reply; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-07 12:16 UTC (permalink / raw)
  To: linux-arm-kernel

It's definitely this issue. 
If I disable L2 everything works.
Also, using Russell's debug prints shows that adjust_pte is never called for the write process's page.

I think option 2 below is preferred, we don't want to flush the entire L2 for each context switch, it will be a performance killer.

Regards

-----Original Message-----
From: Russell King - ARM Linux [mailto:linux at arm.linux.org.uk] 
Sent: Monday, December 07, 2009 1:42 PM
To: saeed bishara
Cc: Ronen Shitrit; hs at denx.de; linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
> > I ran it on an ARM926EJ-S, which is ARMv5 and worked fine.
> >
> does it have L2 cache?

No.

> > If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> > what's going on, and I don't have any suggestion on what to try next.
> >
> > The log shows that the kernel is doing the right thing: when we detect
> > two mappings for the same page in the same MM space, we clean and
> > invalidate any existing cacheable mappings visible in the MM space
> > (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> > This makes the area non-cacheable.
>
> what about the PTE of the MM space of the write process? if it remains
> C=1 B=1, then it's data will be at the L2, and as the L2 is not
> flushed on context switch, then that explains this behavior.

That's probably the issue, and it means that _all_ shared writable
mappings on your processor will be broken.

Oh dear, that really is bad news.

There are two solutions to this which I can currently think of:
1. flush the L2 cache on every context switch
2. make all shared writable mappings non-cacheable

Neither of those two options appeals.  Since it's only one set of CPUs
which are affected, we really don't want to apply any fix for this to the
generic ARM kernel code - especially when all other L2 caches are sensibly
implemented as PIPT rather than VIVT.

Can we please forget that Feroceon CPUs exist? ;)

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 11:42                       ` Russell King - ARM Linux
  2009-12-07 12:16                         ` Ronen Shitrit
@ 2009-12-07 12:24                         ` Heiko Schocher
  2009-12-07 12:55                           ` Ronen Shitrit
  1 sibling, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-07 12:24 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Russell,

Russell King - ARM Linux wrote:
> On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
[...]
>>> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
>>> what's going on, and I don't have any suggestion on what to try next.
>>>
>>> The log shows that the kernel is doing the right thing: when we detect
>>> two mappings for the same page in the same MM space, we clean and
>>> invalidate any existing cacheable mappings visible in the MM space
>>> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
>>> This makes the area non-cacheable.
>> what about the PTE of the MM space of the write process? if it remains
>> C=1 B=1, then it's data will be at the L2, and as the L2 is not
>> flushed on context switch, then that explains this behavior.
> 
> That's probably the issue, and it means that _all_ shared writable
> mappings on your processor will be broken.

Hmm.. I also tried the testprg with CACHE_FEROCEON_L2 deactivated,
same result ...

> Oh dear, that really is bad news.

Indeed.

> There are two solutions to this which I can currently think of:
> 1. flush the L2 cache on every context switch

To clarify, the testprg runs fine if I start 4 processes, each with
only one read thread. In this case all works as expected. The mess
begins only if one read process starts more than one read thread ...

> 2. make all shared writable mappings non-cacheable
> 
> Neither of those two options appeals.  Since it's only one set of CPUs
> which are affected, we really don't want to apply any fix for this to the
> generic ARM kernel code - especially when all other L2 caches are sensibly
> implemented as PIPT rather than VIVT.
> 
> Can we please forget that Feroceon CPUs exist? ;)

;-)

bye
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 12:16                         ` Ronen Shitrit
@ 2009-12-07 12:27                           ` Heiko Schocher
  2009-12-07 12:42                             ` Ronen Shitrit
  0 siblings, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-07 12:27 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Ronen,

Ronen Shitrit wrote:
> It's definitely this issue. 
> If I disable L2 everything works.

How exactly did you disable the L2 cache? And on which system
did you run this test?

thanks
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 12:27                           ` Heiko Schocher
@ 2009-12-07 12:42                             ` Ronen Shitrit
  2009-12-07 15:24                               ` Nicolas Pitre
  0 siblings, 1 reply; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-07 12:42 UTC (permalink / raw)
  To: linux-arm-kernel

Disable CONFIG_CACHE_FEROCEON_L2 in the config.
I'm using KW-DB kernel 2.6.31.4.

Regards



-----Original Message-----
From: Heiko Schocher [mailto:hs at denx.de] 
Sent: Monday, December 07, 2009 2:27 PM
To: Ronen Shitrit
Cc: Russell King - ARM Linux; saeed bishara; linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

Hello Ronen,

Ronen Shitrit wrote:
> It's definitely this issue. 
> If I disable L2 everything works.

How did you exactly disabled the L2 Cache? And on which system
did you this test?

thanks
Heiko
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 12:24                         ` Heiko Schocher
@ 2009-12-07 12:55                           ` Ronen Shitrit
  2009-12-07 14:52                             ` Russell King - ARM Linux
  0 siblings, 1 reply; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-07 12:55 UTC (permalink / raw)
  To: linux-arm-kernel



-----Original Message-----
From: Heiko Schocher [mailto:hs at denx.de] 
Sent: Monday, December 07, 2009 2:24 PM
To: Russell King - ARM Linux
Cc: saeed bishara; Ronen Shitrit; linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

Hello Russell,

Russell King - ARM Linux wrote:
> On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
[...]
>>> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
>>> what's going on, and I don't have any suggestion on what to try next.
>>>
>>> The log shows that the kernel is doing the right thing: when we detect
>>> two mappings for the same page in the same MM space, we clean and
>>> invalidate any existing cacheable mappings visible in the MM space
>>> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
>>> This makes the area non-cacheable.
>> what about the PTE of the MM space of the write process? if it remains
>> C=1 B=1, then it's data will be at the L2, and as the L2 is not
>> flushed on context switch, then that explains this behavior.
> 
> That's probably the issue, and it means that _all_ shared writable
> mappings on your processor will be broken.

Hmm.. I tried also the testprg with CACHE_FEROCEON_L2 deaktivated,
same result ...

> Oh dear, that really is bad news.

Indeed.

> There are two solutions to this which I can currently think of:
> 1. flush the L2 cache on every context switch

To clarify, the testprg runs fine, if I start 4 processes each with
only one read thread. In this case all works as expected. The mess
begins only, if one read process starts more than one read thread ...

[Ronen Shitrit] That also matches the theory:
When using different processes, the shared area will stay C=1 B=1.
On each context switch L1 will be flushed, and
since L2 is PIPT, the next process will get the correct data...

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 12:55                           ` Ronen Shitrit
@ 2009-12-07 14:52                             ` Russell King - ARM Linux
  2009-12-07 15:37                               ` Nicolas Pitre
  2009-12-07 15:40                               ` Russell King - ARM Linux
  0 siblings, 2 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-07 14:52 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > Russell King - ARM Linux wrote:
> > > On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
> > [...]
> > >>> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> > >>> what's going on, and I don't have any suggestion on what to try next.
> > >>>
> > >>> The log shows that the kernel is doing the right thing: when we detect
> > >>> two mappings for the same page in the same MM space, we clean and
> > >>> invalidate any existing cacheable mappings visible in the MM space
> > >>> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> > >>> This makes the area non-cacheable.
> > >> what about the PTE of the MM space of the write process? if it remains
> > >> C=1 B=1, then it's data will be at the L2, and as the L2 is not
> > >> flushed on context switch, then that explains this behavior.
> > > 
> > > That's probably the issue, and it means that _all_ shared writable
> > > mappings on your processor will be broken.
> > 
> > Hmm.. I tried also the testprg with CACHE_FEROCEON_L2 deaktivated,
> > same result ...
> > 
> > > Oh dear, that really is bad news.
> > 
> > Indeed.
> > 
> > > There are two solutions to this which I can currently think of:
> > > 1. flush the L2 cache on every context switch
> > 
> > To clarify, the testprg runs fine, if I start 4 processes each with
> > only one read thread. In this case all works as expected. The mess
> > begins only, if one read process starts more than one read thread ...
> > 
> That also match the theory:
> When using different processes, the shared area will stay C=1 B=1, 
> On each context switch L1 will be flushed,
> Since L2 is PIPT next process will get the correct data...

Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
searched with C=0 B=1 mappings.  Is that the case?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 12:42                             ` Ronen Shitrit
@ 2009-12-07 15:24                               ` Nicolas Pitre
  0 siblings, 0 replies; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-07 15:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 7 Dec 2009, Ronen Shitrit wrote:

> From: Heiko Schocher [mailto:hs at denx.de] 
> > Hello Ronen,
> > 
> > Ronen Shitrit wrote:
> > > It's definitely this issue. 
> > > If I disable L2 everything works.
> > 
> > How did you exactly disabled the L2 Cache? And on which system
> > did you this test?
> 
> Disable CONFIG_CACHE_FEROCEON_L2 in the config.
> I'm using KW-DB kernel 2.6.31.4.

Beware that if you're using the original Marvell provided U-Boot, you 
need to disable L2 in the kernel _and_ in the U-Boot environment.  If 
L2 is disabled in the kernel but not in U-Boot then stranger things may 
happen.  Best thing to do is to keep L2 in U-Boot always disabled and 
let the kernel enable it as needed.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 14:52                             ` Russell King - ARM Linux
@ 2009-12-07 15:37                               ` Nicolas Pitre
  2009-12-07 17:05                                 ` Russell King - ARM Linux
  2009-12-07 15:40                               ` Russell King - ARM Linux
  1 sibling, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-07 15:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:

> On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > > Russell King - ARM Linux wrote:
> > > > On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
> > > [...]
> > > >>> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> > > >>> what's going on, and I don't have any suggestion on what to try next.
> > > >>>
> > > >>> The log shows that the kernel is doing the right thing: when we detect
> > > >>> two mappings for the same page in the same MM space, we clean and
> > > >>> invalidate any existing cacheable mappings visible in the MM space
> > > >>> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> > > >>> This makes the area non-cacheable.
> > > >> what about the PTE of the MM space of the write process? if it remains
> > > >> C=1 B=1, then it's data will be at the L2, and as the L2 is not
> > > >> flushed on context switch, then that explains this behavior.
> > > > 
> > > > That's probably the issue, and it means that _all_ shared writable
> > > > mappings on your processor will be broken.
> > > 
> > > Hmm.. I tried also the testprg with CACHE_FEROCEON_L2 deaktivated,
> > > same result ...
> > > 
> > > > Oh dear, that really is bad news.
> > > 
> > > Indeed.
> > > 
> > > > There are two solutions to this which I can currently think of:
> > > > 1. flush the L2 cache on every context switch
> > > 
> > > To clarify, the testprg runs fine, if I start 4 processes each with
> > > only one read thread. In this case all works as expected. The mess
> > > begins only, if one read process starts more than one read thread ...
> > > 
> > That also match the theory:
> > When using different processes, the shared area will stay C=1 B=1, 
> > On each context switch L1 will be flushed,
> > Since L2 is PIPT next process will get the correct data...
> 
> Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
> searched with C=0 B=1 mappings.  Is that the case?

I don't have the time to properly wrap my brain around the current issue 
at the moment.  However there are 3 facts to account for:

1) Only 2 ARMv5 CPU variants with L2 cache exist: Feroceon and XSC3.
   However this issue should affect both equally.

2) L2 cache is PIPT in both cases.

3) From commit 08e445bd6a which fixed such a similar issue on Feroceon 
   and XSC3:

    Ideally, we would make L1 uncacheable and L2 cacheable as L2 is PIPT. But
    Feroceon does not support that combination, and the TEX=5 C=0 B=0 encoding
    for XSc3 doesn't appear to work in practice.

Hope this helps.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 14:52                             ` Russell King - ARM Linux
  2009-12-07 15:37                               ` Nicolas Pitre
@ 2009-12-07 15:40                               ` Russell King - ARM Linux
  2009-12-07 15:57                                 ` Nicolas Pitre
  1 sibling, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-07 15:40 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 07, 2009 at 02:52:35PM +0000, Russell King - ARM Linux wrote:
> On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > That also match the theory:
> > When using different processes, the shared area will stay C=1 B=1, 
> > On each context switch L1 will be flushed,
> > Since L2 is PIPT next process will get the correct data...
> 
> Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
> searched with C=0 B=1 mappings.  Is that the case?

Officially, ARMv5 does not support the 'extended small page' format for
the 2nd level descriptors (ARMv6 and Xscale CPUs added this support.)

That means ARMv5 officially only has support for two bits to control
the caching attributes - the C and B bits.  This means we can't specify
the policy for the L2 cache... unless Feroceon also supports the
'extended small page' format.  This might be a third solution to the
problem, and probably the best - provided Feroceon will allow us to
specify 'inner non-cacheable outer write-back' (TEX=111 CB=00).

If Feroceon did support the 'extended small page' format, why isn't it
already using this?  (It's set up to use the architecturally defined
'small page' format for ARMv3 to ARMv5.)

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 15:40                               ` Russell King - ARM Linux
@ 2009-12-07 15:57                                 ` Nicolas Pitre
  2009-12-07 16:06                                   ` Ronen Shitrit
  2009-12-07 17:17                                   ` Russell King - ARM Linux
  0 siblings, 2 replies; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-07 15:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:

> On Mon, Dec 07, 2009 at 02:52:35PM +0000, Russell King - ARM Linux wrote:
> > On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > > That also match the theory:
> > > When using different processes, the shared area will stay C=1 B=1, 
> > > On each context switch L1 will be flushed,
> > > Since L2 is PIPT next process will get the correct data...
> > 
> > Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
> > searched with C=0 B=1 mappings.  Is that the case?
> 
> Officially, ARMv5 does not support the 'extended small page' format for
> the 2nd level descriptors (ARMv6 and Xscale CPUs added this support.)
> 
> That means ARMv5 officially only has support for two bits to control
> the caching attributes - the C and B bits.  This means we can't specify
> the policy for the L2 cache... unless Feroceon also supports the
> 'extended small page' format.  This might be a third solution to the
> problem, and probably the best - provided Feroceon will allow us to
> specify 'inner non-cacheable outer write-back' (TEX=111 CB=00).

Last time I checked the Feroceon doc, there was no way to have inner 
non-cacheable outer write-back behavior.  And as I mentioned in my 
previous email, while debugging the issue on an XSC3, the TEX=111 CB=00 
combination didn't appear to behave as expected (no one bothered to 
verify my findings at the time either).  So I concluded that there is no 
such thing as inner non-cacheable outer write-back on ARMv5.  This was 
consigned in commit 08e445bd6a.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 15:57                                 ` Nicolas Pitre
@ 2009-12-07 16:06                                   ` Ronen Shitrit
  2009-12-07 17:17                                   ` Russell King - ARM Linux
  1 sibling, 0 replies; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-07 16:06 UTC (permalink / raw)
  To: linux-arm-kernel

No TEX bits in KW.

-----Original Message-----
From: Nicolas Pitre [mailto:nico at fluxnic.net] 
Sent: Monday, December 07, 2009 5:57 PM
To: Russell King - ARM Linux
Cc: Ronen Shitrit; hs at denx.de; saeed bishara; linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:

> On Mon, Dec 07, 2009 at 02:52:35PM +0000, Russell King - ARM Linux wrote:
> > On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > > That also match the theory:
> > > When using different processes, the shared area will stay C=1 B=1, 
> > > On each context switch L1 will be flushed,
> > > Since L2 is PIPT next process will get the correct data...
> > 
> > Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
> > searched with C=0 B=1 mappings.  Is that the case?
> 
> Officially, ARMv5 does not support the 'extended small page' format for
> the 2nd level descriptors (ARMv6 and Xscale CPUs added this support.)
> 
> That means ARMv5 officially only has support for two bits to control
> the caching attributes - the C and B bits.  This means we can't specify
> the policy for the L2 cache... unless Feroceon also supports the
> 'extended small page' format.  This might be a third solution to the
> problem, and probably the best - provided Feroceon will allow us to
> specify 'inner non-cacheable outer write-back' (TEX=111 CB=00).

Last time I checked the Feroceon doc, there was no way to have inner 
non-cacheable outer write-back behavior.  And as I mentioned in my 
previous email, while debugging the issue on an XSC3, the TEX=111 CB=00 
combination didn't appear to behave as expected (no one bothered to 
verify my findings at the time either).  So I concluded that there is no 
such thing as inner non-cacheable outer write-back on ARMv5.  This was 
consigned in commit 08e445bd6a.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 15:37                               ` Nicolas Pitre
@ 2009-12-07 17:05                                 ` Russell King - ARM Linux
  2009-12-07 17:33                                   ` Nicolas Pitre
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-07 17:05 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 07, 2009 at 10:37:35AM -0500, Nicolas Pitre wrote:
> On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:
> 
> > On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > > > Russell King - ARM Linux wrote:
> > > > > On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
> > > > [...]
> > > > >>> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> > > > >>> what's going on, and I don't have any suggestion on what to try next.
> > > > >>>
> > > > >>> The log shows that the kernel is doing the right thing: when we detect
> > > > >>> two mappings for the same page in the same MM space, we clean and
> > > > >>> invalidate any existing cacheable mappings visible in the MM space
> > > > >>> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> > > > >>> This makes the area non-cacheable.
> > > > >> what about the PTE of the MM space of the write process? if it remains
> > > > >> C=1 B=1, then it's data will be at the L2, and as the L2 is not
> > > > >> flushed on context switch, then that explains this behavior.
> > > > > 
> > > > > That's probably the issue, and it means that _all_ shared writable
> > > > > mappings on your processor will be broken.
> > > > 
> > > > Hmm.. I tried also the testprg with CACHE_FEROCEON_L2 deaktivated,
> > > > same result ...
> > > > 
> > > > > Oh dear, that really is bad news.
> > > > 
> > > > Indeed.
> > > > 
> > > > > There are two solutions to this which I can currently think of:
> > > > > 1. flush the L2 cache on every context switch
> > > > 
> > > > To clarify, the testprg runs fine, if I start 4 processes each with
> > > > only one read thread. In this case all works as expected. The mess
> > > > begins only, if one read process starts more than one read thread ...
> > > > 
> > > That also match the theory:
> > > When using different processes, the shared area will stay C=1 B=1, 
> > > On each context switch L1 will be flushed,
> > > Since L2 is PIPT next process will get the correct data...
> > 
> > Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
> > searched with C=0 B=1 mappings.  Is that the case?
> 
> I don't have the time to properly wrap my brain around the current issue 
> at the moment.  However there are 3 facts to account for:
> 
> 1) Only 2 ARMv5 CPU variants with L2 cache exist: Feroceon and XSC3.
>    However this issue should affect both equally.
> 
> 2) L2 cache is PIPT in both cases.
> 
> 3) From commit 08e445bd6a which fixed such a similar issue on Feroceon 
>    and XSC3:
> 
>     Ideally, we would make L1 uncacheable and L2 cacheable as L2 is PIPT. But
>     Feroceon does not support that combination, and the TEX=5 C=0 B=0 encoding
>     for XSc3 doesn't appear to work in practice.

Sigh, why do people create this kind of hardware brokenness.

It seems the original commit (08e445bd6a) only partly addresses the problem;
it's broken in so many other ways, as is highlighted by this test case.
Was it originally created for Xscale3 or Feroceon?  Was the problem actually
found to exist on Xscale3 and Feroceon?

Any read or write via another cacheable mapping will result in the L2
being loaded with data.  One instance is as shown in the original poster's
test program - where a shared writable mapping exists in another process.

Another case would be having a shared writable mapping, and using read()/
write() on the mapped file.  This is normally taken care of with
flush_dcache_page(), but this does not do any L2 cache maintenance on
Feroceon.

Another case is any kind of mmap() of the same file - in other words, it
doesn't have to be another shared mmap to bring data into the L2 cache.

Now, at first thought, if we disable the cache for all shared writable
mappings in addition to what we're already doing, does this solve the
problem?  Well, it means that the writes will bypass the caches and hit
the RAM directly.  The reads from the other shared mappings will read
direct from the RAM.

A private mapping using the same page will use the same page, and it
will not be marked uncacheable.  Accesses to it will draw data into the
L2 cache.

PIO kernel mode accesses will also use the cached copy, and that _is_
a problem - it means when we update the backing file on disk, we'll
write out the L2 cached data rather than what really should be written
out - the updated data from the writable shared mappings.

So it seems that at least these affected CPUs need flush_dcache_page()
to also do L2 cache maintenance.  I don't think that's enough to cover
all cases though - it probably also needs to do L2 cache maintenance
in all the other flush_cache_* functions as well.

This is something that should be benchmarked on the affected CPUs and
compared with the unmodified code with L2 cache disabled.


As a side note, I'm currently concerned that the sequence:

	mmap(MAP_SHARED);
	write to shared mapping;
	msync(MS_SYNC);

may not result in the written data hitting the disk (due to missing a
cache flush) but as yet I'm unable to prove it.  Since I now get lost
reading the Linux VFS/MM code, I can't prove this by code inspection.

Checking for this isn't going to be easy - (a) munmapping the region
will cause the data to hit RAM, (b) any context switch will cause the
data to hit RAM, (c) merely reading back the file via read() will
trigger flush_dcache_page()...  Need some way to externally monitor
what gets written to the storage device...

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 15:57                                 ` Nicolas Pitre
  2009-12-07 16:06                                   ` Ronen Shitrit
@ 2009-12-07 17:17                                   ` Russell King - ARM Linux
  1 sibling, 0 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-07 17:17 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 07, 2009 at 10:57:00AM -0500, Nicolas Pitre wrote:
> Last time I checked the Feroceon doc, there was no way to have inner 
> non-cacheable outer write-back behavior.  And as I mentioned in my 
> previous email, while debugging the issue on an XSC3, the TEX=111 CB=00 
> combination didn't appear to behave as expected (no one bothered to 
> verify my findings at the time either).

Probably because either no one cared or no one had the hardware to be
able to check.

> So I concluded that there is no 
> such thing as inner non-cacheable outer write-back on ARMv5.  This was 
> consigned in commit 08e445bd6a.

Well, there is no such thing as L2 cache on ARMv5 architecture.  L2 cache
is present on Xscale3 and Feroceon purely as a CPU vendor addition.

On Xscale3, it was extended to have the TEX bits in the page table.
However, Feroceon, being an independently designed CPU, appears to have
L2 cache but without the TEX bits - which gives us fewer options to solve
this issue.

However, I don't think tricks like making all shared writable mappings
uncacheable are going to fix it either, as I mentioned in my previous
message.

Consider the effect of that kind of fix: take for instance a SHM mapping
between the X server and a client for transferring bitmap data (eg,
mplayer).  Do we really want it marked non-cacheable?  I think you can
say goodbye to video playback on these platforms.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 17:05                                 ` Russell King - ARM Linux
@ 2009-12-07 17:33                                   ` Nicolas Pitre
  2009-12-07 17:56                                     ` Russell King - ARM Linux
  0 siblings, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-07 17:33 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:

> On Mon, Dec 07, 2009 at 10:37:35AM -0500, Nicolas Pitre wrote:
> > On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:
> > 
> > > On Mon, Dec 07, 2009 at 02:55:52PM +0200, Ronen Shitrit wrote:
> > > > > Russell King - ARM Linux wrote:
> > > > > > On Mon, Dec 07, 2009 at 01:31:41PM +0200, saeed bishara wrote:
> > > > > [...]
> > > > > >>> If there's no problem with C=0 B=1 mappings on Kirkwood, I've no idea
> > > > > >>> what's going on, and I don't have any suggestion on what to try next.
> > > > > >>>
> > > > > >>> The log shows that the kernel is doing the right thing: when we detect
> > > > > >>> two mappings for the same page in the same MM space, we clean and
> > > > > >>> invalidate any existing cacheable mappings visible in the MM space
> > > > > >>> (both L1 and L2), and switch all visible mappings to C=0 B=1 mappings.
> > > > > >>> This makes the area non-cacheable.
> > > > > >> what about the PTE of the MM space of the write process? if it remains
> > > > > >> C=1 B=1, then it's data will be at the L2, and as the L2 is not
> > > > > >> flushed on context switch, then that explains this behavior.
> > > > > > 
> > > > > > That's probably the issue, and it means that _all_ shared writable
> > > > > > mappings on your processor will be broken.
> > > > > 
> > > > > Hmm.. I tried also the testprg with CACHE_FEROCEON_L2 deaktivated,
> > > > > same result ...
> > > > > 
> > > > > > Oh dear, that really is bad news.
> > > > > 
> > > > > Indeed.
> > > > > 
> > > > > > There are two solutions to this which I can currently think of:
> > > > > > 1. flush the L2 cache on every context switch
> > > > > 
> > > > > To clarify, the testprg runs fine, if I start 4 processes each with
> > > > > only one read thread. In this case all works as expected. The mess
> > > > > begins only, if one read process starts more than one read thread ...
> > > > > 
> > > > That also match the theory:
> > > > When using different processes, the shared area will stay C=1 B=1, 
> > > > On each context switch L1 will be flushed,
> > > > Since L2 is PIPT next process will get the correct data...
> > > 
> > > Hang on - if L2 is PIPT, then there shouldn't be a problem provided it's
> > > searched with C=0 B=1 mappings.  Is that the case?
> > 
> > I don't have the time to properly wrap my brain around the current issue 
> > at the moment.  However there are 3 facts to account for:
> > 
> > 1) Only 2 ARMv5 CPU variants with L2 cache exist: Feroceon and XSC3.
> >    However this issue should affect both equally.
> > 
> > 2) L2 cache is PIPT in both cases.
> > 
> > 3) From commit 08e445bd6a which fixed such a similar issue on Feroceon 
> >    and XSC3:
> > 
> >     Ideally, we would make L1 uncacheable and L2 cacheable as L2 is PIPT. But
> >     Feroceon does not support that combination, and the TEX=5 C=0 B=0 encoding
> >     for XSc3 doesn't appear to work in practice.
> 
> Sigh, why do people create this kind of hardware brokenness.
> 
> It seems the original commit (08e445bd6a) only partly addresses the problem;
> it's broken in so many other ways, as is highlighted by this test case.
> Was it originally created for Xscale3 or Feroceon?  Was the problem actually
> found to exist on Xscale3 and Feroceon?

It fixed a test case that was discovered on XSC3 and turned out to be 
valid on Feroceon as well.  I probably have the source for it somewhere.  
The case was multiple mmap() of the same memory area within the same 
process.  I think (but that needs confirmation) that this fixed a real 
life db4 issue as well.

> Any read or write via another cacheable mapping will result in the L2
> being loaded with data.  One instance is as shown in the original posters
> test program - where a shared writable mapping exists in another process.
> 
> Another case would be having a shared writable mapping, and using read()/
> write() on the mapped file.  This is normally taken care of with
> flush_dcache_page(), but this does not do any L2 cache maintainence on
> Feroceon.

I thought those were already handled by making L1 uncacheable (and L2 
cleaned) as soon as a second user of a shared mapping was 
encountered.

> Another case is any kind of mmap() of the same file - in other words, it
> doesn't have to be another shared mmap to bring data into the L2 cache.

But that case is fine, no?  L2 being PIPT you get the same cached data 
for both mappings, and a write will COW the page.

> Now, at first throught, if we disable the cache for all shared writable
> mappings in addition to what we're already doing, does this solve the
> problem?  Well, it means that the writes will bypass the caches and hit
> the RAM directly.  The reads from the other shared mappings will read
> direct from the RAM.
> 
> A private mapping using the same page will use the same page, and it
> will not be marked uncacheable.  Accesses to it will draw data into the
> L2 cache.

Hmmm...

> PIO kernel mode accesses will also use the cached copy, and that _is_
> a problem - it means when we update the backing file on disk, we'll
> write out the L2 cached data rather than what really should be written
> out - the updated data from the writable shared mappings.
> 
> So it seems that at least these affected CPUs need flush_dcache_page()
> to also do L2 cache maintainence.  I don't think that's enough to cover
> all cases though - it probably also needs to do L2 cache maintainence
> in all the other flush_cache_* functions as well.

/me starts to feel the headache

> This is something that should be benchmarked on the affected CPUs and
> compared with the unmodified code with L2 cache disabled.
> 
> 
> As a side note, I'm currently concerned that the sequence:
> 
> 	mmap(MAP_SHARED);
> 	write to shared mapping;
> 	msync(MS_SYNC);
> 
> may not result in the written data hitting the disk (due to missing a
> cache flush) but as yet I'm unable to prove it.  Since I now get lost
> reading the Linux VFS/MM code, I can't prove this by code inspection.
> 
> Checking for this isn't going to be easy - (a) munmapping the region
> will cause the data to hit RAM, (b) any context switch will cause the
> data to hit RAM, (c) merely reading back the file via read() will
> trigger flush_dcache_page()...  Need some way to externally monitor
> what gets written to the storage device...
> 


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 17:33                                   ` Nicolas Pitre
@ 2009-12-07 17:56                                     ` Russell King - ARM Linux
  2009-12-13 11:48                                       ` Ronen Shitrit
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-07 17:56 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 07, 2009 at 12:33:20PM -0500, Nicolas Pitre wrote:
> On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:
> > It seems the original commit (08e445bd6a) only partly addresses the problem;
> > it's broken in so many other ways, as is highlighted by this test case.
> > Was it originally created for Xscale3 or Feroceon?  Was the problem actually
> > found to exist on Xscale3 and Feroceon?
> 
> It fixed a test case that was discovered on XSC3 and turned up to be 
> valid on Feroceon as well.  I probably have the source for it somewhere.  
> The case was multiple mmap() of the same memory area within the same 
> process.  I think (but that needs confirmation) that this fixed a real 
> life db4 issue as well.

I have a test case for this as well.

> > Any read or write via another cacheable mapping will result in the L2
> > being loaded with data.  One instance is as shown in the original posters
> > test program - where a shared writable mapping exists in another process.
> > 
> > Another case would be having a shared writable mapping, and using read()/
> > write() on the mapped file.  This is normally taken care of with
> > flush_dcache_page(), but this does not do any L2 cache maintainence on
> > Feroceon.
> 
> I thought those were already handled by making L1 uncacheable (and L2 
> cleaned) as soon as a second user of a shared mapping was 
> encountered.

That doesn't work - the kernel mapping will still be cacheable, and it is
the kernel mapping that read() and write() will use.  Their coherency
issue is resolved by flush_dcache_page() performed _before_ the access
(note: there is no coherency after a write access, so WBWA caches are
probably broken wrt this.)

So:
	Process/Thread
	1/1		2/1		2/2		3/1	kernel
	map(MAP_SHARED)	map(MAP_SHARED)	map(MAP_SHARED)
	write mapping
								flush L1
			read mapping
					read mapping

Now, let's say we flush the L1 and L2 caches, and we mark all these
mappings uncachable.  Now, another process does a read from the file
backing this mapping:

								flush L1
							read
								flush_dcache_page()
								read data
								(loads L2 cache)
								flush L1
	write mapping
	(does not hit L2)
								flush L1
			read mapping
			(does not hit L2,
			sees data from 1/1)
					read mapping
					(does not hit L2,
					sees data from 1/1)
								flush L1
							read
								flush_dcache_page()
								read data
								(from L2 cache
								 and doesn't
								 see updates
								 from 1/1)

What I'm slightly more worried about as well is whether PIO writeouts
will write the data from process 1/1 onto disk.

> > Another case is any kind of mmap() of the same file - in other words, it
> > doesn't have to be another shared mmap to bring data into the L2 cache.
> 
> But that case is fine, no?  L2 being PIPT you get the same cached data 
> for both mappings, and a write will COW the page.

The point is it's a way to get data into the L2 cache, which will be
visible via other cachable mappings and mask the shared-mapped updates.
I wasn't considering a COW to a private mapping.

> > Now, at first throught, if we disable the cache for all shared writable
> > mappings in addition to what we're already doing, does this solve the
> > problem?  Well, it means that the writes will bypass the caches and hit
> > the RAM directly.  The reads from the other shared mappings will read
> > direct from the RAM.
> > 
> > A private mapping using the same page will use the same page, and it
> > will not be marked uncacheable.  Accesses to it will draw data into the
> > L2 cache.
> 
> Hmmm...
> 
> > PIO kernel mode accesses will also use the cached copy, and that _is_
> > a problem - it means when we update the backing file on disk, we'll
> > write out the L2 cached data rather than what really should be written
> > out - the updated data from the writable shared mappings.
> > 
> > So it seems that at least these affected CPUs need flush_dcache_page()
> > to also do L2 cache maintainence.  I don't think that's enough to cover
> > all cases though - it probably also needs to do L2 cache maintainence
> > in all the other flush_cache_* functions as well.
> 
> /me starts to feel the head ache

You're not the only one...

I'm going to try and prove the msync() problem I mentioned at the end of
my mail - it's probably going to be easier to prove and solve (and impacts
more ARM CPUs than this problem.)

As for this problem, I'm not certain what the solution is.

In the mean time, as a work-around, I suggest that any CPU with L1 VIVT
cache (which thereby requires the make_coherent() code) has its L2 cache
disabled.  That should at least allow the system to behave as correctly
as it does for other ARM VIVT CPUs.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-07 17:56                                     ` Russell King - ARM Linux
@ 2009-12-13 11:48                                       ` Ronen Shitrit
  2009-12-13 12:00                                         ` Russell King - ARM Linux
  2009-12-17 11:31                                         ` Heiko Schocher
  0 siblings, 2 replies; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-13 11:48 UTC (permalink / raw)
  To: linux-arm-kernel

Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
- make_coherent won't use adjust_pte and leave only the regular flush.
- Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).

I tried it and it solved the original issue posted on this thread.

Any thoughts? Any pitfalls this solution is missing? 

Regards

-----Original Message-----
From: Russell King - ARM Linux [mailto:linux at arm.linux.org.uk] 
Sent: Monday, December 07, 2009 7:56 PM
To: Nicolas Pitre
Cc: Ronen Shitrit; hs at denx.de; saeed bishara; linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

On Mon, Dec 07, 2009 at 12:33:20PM -0500, Nicolas Pitre wrote:
> On Mon, 7 Dec 2009, Russell King - ARM Linux wrote:
> > It seems the original commit (08e445bd6a) only partly addresses the problem;
> > it's broken in so many other ways, as is highlighted by this test case.
> > Was it originally created for Xscale3 or Feroceon?  Was the problem actually
> > found to exist on Xscale3 and Feroceon?
> 
> It fixed a test case that was discovered on XSC3 and turned up to be 
> valid on Feroceon as well.  I probably have the source for it somewhere.  
> The case was multiple mmap() of the same memory area within the same 
> process.  I think (but that needs confirmation) that this fixed a real 
> life db4 issue as well.

I have a test case for this as well.

> > Any read or write via another cacheable mapping will result in the L2
> > being loaded with data.  One instance is as shown in the original posters
> > test program - where a shared writable mapping exists in another process.
> > 
> > Another case would be having a shared writable mapping, and using read()/
> > write() on the mapped file.  This is normally taken care of with
> > flush_dcache_page(), but this does not do any L2 cache maintainence on
> > Feroceon.
> 
> I thought those were already handled by making L1 uncacheable (and L2 
> cleaned) as soon as a second user of a shared mapping was 
> encountered.

That doesn't work - the kernel mapping will still be cacheable, and it is
the kernel mapping that read() and write() will use.  Their coherency
issue is resolved by flush_dcache_page() performed _before_ the access
(note: there is no coherency after a write access, so WBWA caches are
probably broken wrt this.)

So:
	Process/Thread
	1/1		2/1		2/2		3/1	kernel
	map(MAP_SHARED)	map(MAP_SHARED)	map(MAP_SHARED)
	write mapping
								flush L1
			read mapping
					read mapping

Now, lets say we flush the L1 and L2 caches, and we mark all these
mappings uncachable.  Now, another process does a read from the file
backing this mapping:

								flush L1
							read
								flush_dcache_page()
								read data
								(loads L2 cache)
								flush L1
	write mapping
	(does not hit L2)
								flush L1
			read mapping
			(does not hit L2,
			sees data from 1/1)
					read mapping
					(does not hit L2,
					sees data from 1/1)
								flush L1
							read
								flush_dcache_page()
								read data
								(from L2 cache
								 and doesn't
								 see updates
								 from 1/1)

What I'm slightly more worried about as well is whether PIO writeouts
will write the data from process 1/1 onto disk.

> > Another case is any kind of mmap() of the same file - in other words, it
> > doesn't have to be another shared mmap to bring data into the L2 cache.
> 
> But that case is fine, no?  L2 being PIPT you get the same cached data 
> for both mappings, and a write will COW the page.

The point is its a way to get data into the L2 cache, which will be
visible via other cachable mappings and mask the shared-mapped updates.
I wasn't considering a COW to a private mapping.

> > Now, at first throught, if we disable the cache for all shared writable
> > mappings in addition to what we're already doing, does this solve the
> > problem?  Well, it means that the writes will bypass the caches and hit
> > the RAM directly.  The reads from the other shared mappings will read
> > direct from the RAM.
> > 
> > A private mapping using the same page will use the same page, and it
> > will not be marked uncacheable.  Accesses to it will draw data into the
> > L2 cache.
> 
> Hmmm...
> 
> > PIO kernel mode accesses will also use the cached copy, and that _is_
> > a problem - it means when we update the backing file on disk, we'll
> > write out the L2 cached data rather than what really should be written
> > out - the updated data from the writable shared mappings.
> > 
> > So it seems that at least these affected CPUs need flush_dcache_page()
> > to also do L2 cache maintainence.  I don't think that's enough to cover
> > all cases though - it probably also needs to do L2 cache maintainence
> > in all the other flush_cache_* functions as well.
> 
> /me starts to feel the head ache

You're not the only one...

I'm going to try and prove the msync() problem I mentioned at the end of
my mail - it's probably going to be easier to prove and solve (and impacts
more ARM CPUs than this problem.)

As for this problem, I'm not certain what the solution is.

In the mean time, as a work-around, I suggest that any CPU with L1 VIVT
cache (which thereby requires the make_coherent() code) has its L2 cache
disabled.  That should at least allow the system to behave as correctly
as it does for other ARM VIVT CPUs.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-13 11:48                                       ` Ronen Shitrit
@ 2009-12-13 12:00                                         ` Russell King - ARM Linux
  2009-12-13 12:06                                           ` Russell King - ARM Linux
  2009-12-18 18:45                                           ` Pavel Machek
  2009-12-17 11:31                                         ` Heiko Schocher
  1 sibling, 2 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-13 12:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun, Dec 13, 2009 at 01:48:48PM +0200, Ronen Shitrit wrote:
> Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
> - make_coherent won't use adjust_pte and leave only the regular flush.
> - Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).

That doesn't work.

Well, with a VIVT L1, we flush the L1 on all MM switches anyway.
Flushing it on any thread switch is not going to help that much.

The problem with shared mmaps is that if you have multiple within the
same thread, it is required that they are _all_ coherent with respect
to each other, whether or not a context switch has occurred.

I believe there are databases (eg, db4) which are non-threaded, makes
use of multiple shared mappings and requires that it works, otherwise
the database becomes corrupted.

Unfortunately, this means your suggestion is not a solution at all.

I'm afraid to say that the only solution I can see to this problem is to
disable the L2 cache outright on these CPUs - the choice seems to be
between correct system behaviour and lower performance, or performance
but buggy system behaviour in certain cases leading to data corruption.

I know which I'd prefer.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-13 12:00                                         ` Russell King - ARM Linux
@ 2009-12-13 12:06                                           ` Russell King - ARM Linux
  2009-12-13 15:42                                             ` Ronen Shitrit
  2009-12-14 13:13                                             ` christian pellegrin
  2009-12-18 18:45                                           ` Pavel Machek
  1 sibling, 2 replies; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-13 12:06 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun, Dec 13, 2009 at 12:00:33PM +0000, Russell King - ARM Linux wrote:
> I'm afraid to say that the only solution I can see to this problem is to
> disable the L2 cache outright on these CPUs - the choice seems to be
> between correct system behaviour and lower performance, or performance
> but buggy system behaviour in certain cases leading to data corruption.

BTW, if no one can come up with a solution for this, we need to consider
making the kernel by default disable L2 cache on Feroceon, and display a
warning message if L2 is to be enabled.

We really can not have a known data corrupting issue like this exist
silently in the system - it's not fair for users nor developers to waste
time trying to work out why their data is being corrupted.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-13 12:06                                           ` Russell King - ARM Linux
@ 2009-12-13 15:42                                             ` Ronen Shitrit
  2009-12-14 13:13                                             ` christian pellegrin
  1 sibling, 0 replies; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-13 15:42 UTC (permalink / raw)
  To: linux-arm-kernel

There is another possible solution we are looking into:
1) Force all NC writes to go through L2: 
If there is a hit in the L2, the write will update the L2 together with the DRAM.
If there is a miss, the write will continue to the DRAM only.
2) Force all NC reads to wait till L2 write buffer is drained, to avoid any racing with above...
3) Set L2 in WT mode.

I still can't determine if this solution is valid from HW point of view, but I will really appreciate any comment on this.
Any thoughts? Any pitfalls this solution is missing?

Thanks

-----Original Message-----
From: Russell King - ARM Linux [mailto:linux at arm.linux.org.uk] 
Sent: Sunday, December 13, 2009 2:06 PM
To: Ronen Shitrit
Cc: hs at denx.de; saeed bishara; linux-arm-kernel at lists.infradead.org; Nicolas Pitre
Subject: Re: shared memory problem on ARM v5TE using threads

On Sun, Dec 13, 2009 at 12:00:33PM +0000, Russell King - ARM Linux wrote:
> I'm afraid to say that the only solution I can see to this problem is to
> disable the L2 cache outright on these CPUs - the choice seems to be
> between correct system behaviour and lower performance, or performance
> but buggy system behaviour in certain cases leading to data corruption.

BTW, if no one can come up with a solution for this, we need to consider
making the kernel by default disable L2 cache on Feroceon, and display a
warning message if L2 is to be enabled.

We really can not have a known data corrupting issue like this exist
silently in the system - it's not fair for users nor developers to waste
time trying to work out why their data is being corrupted.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-13 12:06                                           ` Russell King - ARM Linux
  2009-12-13 15:42                                             ` Ronen Shitrit
@ 2009-12-14 13:13                                             ` christian pellegrin
  2009-12-14 14:46                                               ` Ronen Shitrit
  2009-12-14 19:59                                               ` shared memory problem on ARM v5TE using threads Nicolas Pitre
  1 sibling, 2 replies; 71+ messages in thread
From: christian pellegrin @ 2009-12-14 13:13 UTC (permalink / raw)
  To: linux-arm-kernel

Hi, I've been using the patch below for a couple of days now on a
Feroceon based system. It tries to be smart in deciding when to mark a
mapping uncacheable and I haven't noticed many of them (but I am *not*
running it on a X11 system where maybe the situation could be
different). It solves the situation of the test program above. Please
be patient if there is something really wrong with it because I don't
have a deep understanding of the workings of Linux VM and neither the
Feroceon manuals (unfortunately I found that these are not available
*after* buying the hardware). Thank you for looking at this, I think
this problem should be fixed soon so we can trust our systems.

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index d0d17b6..36dc4a5 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -34,7 +34,7 @@ static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
  * Therefore those configurations which might call adjust_pte (those
  * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
-static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
+static int adjust_pte(struct vm_area_struct *vma, unsigned long
address, int update, int only_shared)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -65,7 +65,7 @@ static int adjust_pte(struct vm_area_struct *vma,
unsigned long address)
 	 * If this page isn't present, or is already setup to
 	 * fault (ie, is old), we can safely ignore any issues.
 	 */
-	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask) {
+	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask && update) {
 		unsigned long pfn = pte_pfn(entry);
 		flush_cache_page(vma, address, pfn);
 		outer_flush_range((pfn << PAGE_SHIFT),
@@ -74,7 +74,12 @@ static int adjust_pte(struct vm_area_struct *vma,
unsigned long address)
 		pte_val(entry) |= shared_pte_mask;
 		set_pte_at(vma->vm_mm, address, pte, entry);
 		flush_tlb_page(vma, address);
+		printk(KERN_INFO "Uncached vma %08x (phy %08x) from pid %d\n",
+		       (unsigned int) vma, (unsigned int) (pfn << PAGE_SHIFT),
+		       current->pid);
 	}
+	if (only_shared && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask)
+	  ret = 0;
 	pte_unmap(pte);
 	return ret;

@@ -100,6 +105,9 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 	unsigned long offset;
 	pgoff_t pgoff;
 	int aliases = 0;
+#if defined(CONFIG_OUTER_CACHE) && defined(CONFIG_CPU_CACHE_VIVT)
+	int run;
+#endif

 	pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

@@ -109,6 +117,32 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 	 * cache coherency.
 	 */
 	flush_dcache_mmap_lock(mapping);
+#if defined(CONFIG_OUTER_CACHE) && defined(CONFIG_CPU_CACHE_VIVT)
+	/*
+	 * In the first run we just check if we have to make some
+	 * address space uncacheable cause of L1 VIVT. In the second
+	 * we check if there is an uncached map in other process.  If
+	 * one of the previous condition is true we proceed to make
+	 * *all* (both in current process VMA and that of others) of
+	 * them so. This should solve both cases of multiple shared
+	 * memories attached in the same process but not impact the
+	 * common case of just one mapping per process.
+	 */
+	for(run = 0; run < 3; run++) {
+		vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+			if ((mpnt->vm_mm != mm || mpnt == vma) && run == 0)
+				continue;
+			if (!(mpnt->vm_flags & VM_MAYSHARE))
+				continue;
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			aliases += adjust_pte(mpnt, mpnt->vm_start + offset,
+					      run == 2, /* update only on the last run */
+					      run == 1); /* on the second run catch shared in other procs */
+		}
+		if (aliases == 0 && run == 1)
+			break;
+	}
+#else
 	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
@@ -120,11 +154,12 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 		if (!(mpnt->vm_flags & VM_MAYSHARE))
 			continue;
 		offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
-		aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+		aliases += adjust_pte(mpnt, mpnt->vm_start + offset, 1, 0);
 	}
+#endif
 	flush_dcache_mmap_unlock(mapping);
 	if (aliases)
-		adjust_pte(vma, addr);
+		adjust_pte(vma, addr, 1, 0);
 	else
 		flush_cache_page(vma, addr, pfn);
 }


-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply related	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-14 13:13                                             ` christian pellegrin
@ 2009-12-14 14:46                                               ` Ronen Shitrit
  2009-12-14 17:48                                                 ` christian pellegrin
  2009-12-14 19:59                                               ` shared memory problem on ARM v5TE using threads Nicolas Pitre
  1 sibling, 1 reply; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-14 14:46 UTC (permalink / raw)
  To: linux-arm-kernel

Quoting Russell from this thread:
"
Now, at first throught, if we disable the cache for all shared writable mappings in addition to what we're already doing, does this solve the problem?  Well, it means that the writes will bypass the caches and hit the RAM directly.  The reads from the other shared mappings will read direct from the RAM.

A private mapping using the same page will use the same page, and it will not be marked uncacheable.  Accesses to it will draw data into the
L2 cache.

PIO kernel mode accesses will also use the cached copy, and that _is_ a problem - it means when we update the backing file on disk, we'll write out the L2 cached data rather than what really should be written out - the updated data from the writable shared mappings.
"

I think your patch doesn't cover the PIO mode...


-----Original Message-----
From: linux-arm-kernel-bounces@lists.infradead.org [mailto:linux-arm-kernel-bounces at lists.infradead.org] On Behalf Of christian pellegrin
Sent: Monday, December 14, 2009 3:13 PM
To: linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

Hi, I've been using the patch below for a couple of days now on a
Feroceon based system. It tries to be smart in deciding when to mark a
mapping uncacheable and I haven't noticed many of them (but I am *not*
running it on a X11 system where maybe the situation could be
different). It solves the situation of the test program above. Please
be patient if there is something really wrong with it because I don't
have a deep understanding of the workings of Linux VM and neither the
Feroceon manuals (unfortunately I found that these are not available
*after* buying the hardware). Thank you for looking at this, I think
this problem should be fixed soon so we can trust our systems.

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index d0d17b6..36dc4a5 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -34,7 +34,7 @@ static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
  * Therefore those configurations which might call adjust_pte (those
  * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
-static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
+static int adjust_pte(struct vm_area_struct *vma, unsigned long
address, int update, int only_shared)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -65,7 +65,7 @@ static int adjust_pte(struct vm_area_struct *vma,
unsigned long address)
 	 * If this page isn't present, or is already setup to
 	 * fault (ie, is old), we can safely ignore any issues.
 	 */
-	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask) {
+	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask && update) {
 		unsigned long pfn = pte_pfn(entry);
 		flush_cache_page(vma, address, pfn);
 		outer_flush_range((pfn << PAGE_SHIFT),
@@ -74,7 +74,12 @@ static int adjust_pte(struct vm_area_struct *vma,
unsigned long address)
 		pte_val(entry) |= shared_pte_mask;
 		set_pte_at(vma->vm_mm, address, pte, entry);
 		flush_tlb_page(vma, address);
+		printk(KERN_INFO "Uncached vma %08x (phy %08x) from pid %d\n",
+		       (unsigned int) vma, (unsigned int) (pfn << PAGE_SHIFT),
+		       current->pid);
 	}
+	if (only_shared && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask)
+	  ret = 0;
 	pte_unmap(pte);
 	return ret;

@@ -100,6 +105,9 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 	unsigned long offset;
 	pgoff_t pgoff;
 	int aliases = 0;
+#if defined(CONFIG_OUTER_CACHE) && defined(CONFIG_CPU_CACHE_VIVT)
+	int run;
+#endif

 	pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

@@ -109,6 +117,32 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 	 * cache coherency.
 	 */
 	flush_dcache_mmap_lock(mapping);
+#if defined(CONFIG_OUTER_CACHE) && defined(CONFIG_CPU_CACHE_VIVT)
+	/*
+	 * In the first run we just check if we have to make some
+	 * address space uncacheable cause of L1 VIVT. In the second
+	 * we check if there is an uncached map in other process.  If
+	 * one of the previous condition is true we proceed to make
+	 * *all* (both in current process VMA and that of others) of
+	 * them so. This should solve both cases of multiple shared
+	 * memories attached in the same process but not impact the
+	 * common case of just one mapping per process.
+	 */
+	for(run = 0; run < 3; run++) {
+		vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+			if ((mpnt->vm_mm != mm || mpnt == vma) && run == 0)
+				continue;
+			if (!(mpnt->vm_flags & VM_MAYSHARE))
+				continue;
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			aliases += adjust_pte(mpnt, mpnt->vm_start + offset,
+					      run == 2, /* update only on the last run */
+					      run == 1); /* on the second run catch shared in other procs */
+		}
+		if (aliases == 0 && run == 1)
+			break;
+	}
+#else
 	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
@@ -120,11 +154,12 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 		if (!(mpnt->vm_flags & VM_MAYSHARE))
 			continue;
 		offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
-		aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+		aliases += adjust_pte(mpnt, mpnt->vm_start + offset, 1, 0);
 	}
+#endif
 	flush_dcache_mmap_unlock(mapping);
 	if (aliases)
-		adjust_pte(vma, addr);
+		adjust_pte(vma, addr, 1, 0);
 	else
 		flush_cache_page(vma, addr, pfn);
 }


-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel at lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-14 14:46                                               ` Ronen Shitrit
@ 2009-12-14 17:48                                                 ` christian pellegrin
  2009-12-14 20:14                                                   ` Nicolas Pitre
  0 siblings, 1 reply; 71+ messages in thread
From: christian pellegrin @ 2009-12-14 17:48 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 14, 2009 at 3:46 PM, Ronen Shitrit <rshitrit@marvell.com> wrote:

> "
>
> I think your patch doesn't cover the PIO mode...
>

Has someone a suggestion on how to write a test-case for this? I tried
various forms of mixing read/write syscalls (or a mmap with
MAP_PRIVATE) and access via two mmap with MAP_SHARED aliasing the same
address (I checked that adjust_pte was called by them) but didn't get
any problem. I have to admit that I really don't understand how PIO in
kernel mode can refer to mappings that are aliased (and so marked
uncacheable) made by user-space programs. Thank you for your patience.

-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-14 13:13                                             ` christian pellegrin
  2009-12-14 14:46                                               ` Ronen Shitrit
@ 2009-12-14 19:59                                               ` Nicolas Pitre
  2009-12-15 10:33                                                 ` christian pellegrin
  1 sibling, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-14 19:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 14 Dec 2009, christian pellegrin wrote:

> Feroceon manuals (unfortunately I found that these are not available
> *after* buying the hardware).

Look at:

http://www.marvell.com/products/embedded_processors/kirkwood/index.jsp


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-14 17:48                                                 ` christian pellegrin
@ 2009-12-14 20:14                                                   ` Nicolas Pitre
  2009-12-15  7:50                                                     ` saeed bishara
  0 siblings, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-14 20:14 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 14 Dec 2009, christian pellegrin wrote:

> On Mon, Dec 14, 2009 at 3:46 PM, Ronen Shitrit <rshitrit@marvell.com> wrote:
> 
> > "
> >
> > I think your patch doesn't cover the PIO mode...
> >
> 
> Has someone a suggestion on how to write a test-case for this?

I don't think there is any peripheral that runs in PIO mode on 
Kirkwood... except the NAND flash.

> I tried various forms of mixing read/write syscalls (or a mmap wit 
> MAP_PRIVATE) and access via two mmap with MAP_SHARED aliasing the same 
> address (I checked that adjust_pte was called by them) but didn't get 
> any problem. I have to admit that I really don't understand how PIO in 
> kernel mode can refer to mappings that are aliased (and so market 
> uncacheable) made by user-space programs. Thank you for your patience.

PIO transfer will store data in the kernel direct mapped memory which is 
always cached.  Normally the cache coherency with user space is handled 
by flush_dcache_page(), but that deals only with L1 and data can move to 
L2 where it will be invisible to the uncached user space shared 
mappings.

I don't know off hand how PIO on a page that is already mapped and 
shared in user space may be produced though.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-14 20:14                                                   ` Nicolas Pitre
@ 2009-12-15  7:50                                                     ` saeed bishara
  2009-12-15 11:01                                                       ` christian pellegrin
  0 siblings, 1 reply; 71+ messages in thread
From: saeed bishara @ 2009-12-15  7:50 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 14, 2009 at 10:14 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Mon, 14 Dec 2009, christian pellegrin wrote:
>
>> On Mon, Dec 14, 2009 at 3:46 PM, Ronen Shitrit <rshitrit@marvell.com> wrote:
>>
>> > "
>> >
>> > I think your patch doesn't cover the PIO mode...
>> >
>>
>> Has someone a suggestion on how to write a test-case for this?
>
> I don't think there is any peripheral that runs in PIO mode on
> Kirkwood... except the NAND flash.
the sata can be configured to use pio instead of dma, this can be done
by adding libata.force=pio to the command line.
>
>> I tried various forms of mixing read/write syscalls (or a mmap wit
>> MAP_PRIVATE) and access via two mmap with MAP_SHARED aliasing the same
>> address (I checked that adjust_pte was called by them) but didn't get
>> any problem. I have to admit that I really don't understand how PIO in
>> kernel mode can refer to mappings that are aliased (and so market
>> uncacheable) made by user-space programs. Thank you for your patience.
>
> PIO transfer will store data in the kernel direct mapped memory which is
> always cached.  Normally the cache coherency with user space is handled
> by flush_dcache_page(), but that deals only with L1 and data can move to
> L2 where it will be invisible to the uncached user space shared
> mappings.
>
> I don't know off hand how PIO on a page that is already mapped and
> shared in user space may be produced though.
>
>
> Nicolas
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-14 19:59                                               ` shared memory problem on ARM v5TE using threads Nicolas Pitre
@ 2009-12-15 10:33                                                 ` christian pellegrin
  0 siblings, 0 replies; 71+ messages in thread
From: christian pellegrin @ 2009-12-15 10:33 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 14, 2009 at 8:59 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Mon, 14 Dec 2009, christian pellegrin wrote:
>
>> Feroceon manuals (unfortunately I found that these are not available
>> *after* buying the hardware).
>
> Look at:
>
> http://www.marvell.com/products/embedded_processors/kirkwood/index.jsp
>

Unfortunately the chapters about the core and the L2 cache are
missing. I googled and found confirmation on the OpenRD forum that they
are available only under an NDA.


-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-15  7:50                                                     ` saeed bishara
@ 2009-12-15 11:01                                                       ` christian pellegrin
  2009-12-15 15:31                                                         ` christian pellegrin
  2009-12-15 17:14                                                         ` Russell King - ARM Linux
  0 siblings, 2 replies; 71+ messages in thread
From: christian pellegrin @ 2009-12-15 11:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Dec 15, 2009 at 8:50 AM, saeed bishara <saeed.bishara@gmail.com> wrote:
> On Mon, Dec 14, 2009 at 10:14 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> On Mon, 14 Dec 2009, christian pellegrin wrote:
>>>
>>> Has someone a suggestion on how to write a test-case for this?
>>
>> I don't think there is any peripheral that runs in PIO mode on
>> Kirkwood... except the NAND flash.
> the sata can be configured to use pio instead of dma, this can be done
> by adding libata.force=pio to the command line.
>>

Thanks for your answers, unfortunately it turned out to be quite easy
to write a program [0] that shows exactly what Russell described. :-(

On the Feroceon [1] sometimes *both* the read via private mapping and
via simple lseek/read give the wrong result. On another ARM without L2
cache [2] it happens sometimes that the private mapping gives the wrong
result (I'm not sure that's OK, but it seems reasonable because the
mapping is private) but the read via lseek/read is always OK.

[0]:
/*
  dd if=/dev/zero of=prova count=1 bs=1k
  before running this
 */

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>

/*
 * Cache-coherency probe for a single file ("prova", created beforehand with
 * dd as described above) observed through four views at once:
 *   - p:      a MAP_PRIVATE mapping made through fd1,
 *   - s1, s2: two MAP_SHARED mappings of the same offset made through fd,
 *   - x:      a plain lseek()/read() of byte 0 through fd1.
 * It stores 'a' through s1 and 'b' through s2, msync()s both mappings and
 * prints what each view returns for byte 0, so any stale view is visible in
 * the "2:" output line.
 *
 * Returns 0 when the run completes.  Failing system calls are caught by
 * assert(); the calls themselves are made outside assert() so they still
 * execute when the program is built with NDEBUG.
 */
int main(int argc, char *argv[])
{
  int fd, fd1, rc;
  /*
   * volatile: every s1[0] / s2[0] / p[0] below has to be a real memory
   * access, otherwise the compiler is free to reuse a value it already holds
   * and the test would not observe the mappings at all.
   */
  volatile char *s1, *s2, *p;
  char x;
  off_t pos;
  ssize_t got;

  /* open() signals failure with -1; 0 is a perfectly valid descriptor. */
  fd = open("prova", O_RDWR);
  assert(fd != -1);

  fd1 = open("prova", O_RDWR);
  assert(fd1 != -1);

  p = mmap(NULL, 1024, PROT_WRITE|PROT_READ, MAP_PRIVATE,
           fd1, 0);
  assert(p != MAP_FAILED);

  /* Touch the private mapping before the shared mappings are created. */
  x = *p;

  s1 = mmap(NULL, 1024, PROT_WRITE|PROT_READ, MAP_SHARED,
            fd, 0);
  assert(s1 != MAP_FAILED);
  s2 = mmap(NULL, 1024, PROT_WRITE|PROT_READ, MAP_SHARED,
            fd, 0);
  assert(s2 != MAP_FAILED);

  fprintf(stderr,"s1 %p s2 %p p %p\n", (void *) s1, (void *) s2, (void *) p);
  fprintf(stderr, "1: s1 %d s2 %d p %d\n", s1[0], s2[0], p[0]);

  /* Two stores to the same file byte through two different virtual aliases. */
  s1[0] = 'a';
  s2[0] = 'b';

  rc = msync((void *) s1, 1024, MS_SYNC);
  assert(rc == 0);
  rc = msync((void *) s2, 1024, MS_SYNC);
  assert(rc == 0);

  /* lseek() takes (fd, offset, whence): rewind to the start of the file. */
  pos = lseek(fd1, 0, SEEK_SET);
  assert(pos == 0);
  got = read(fd1, &x, 1);
  assert(got == 1);
  fprintf(stderr, "2: s1 %d s2 %d p %d x %d\n", s1[0], s2[0], p[0], x);

  return 0;
}

[1]:
debian-armel:~/cachep# uname -a
Linux debian-armel 2.6.32-rc7-openrd #36 PREEMPT Tue Dec 15 11:07:00
CET 2009 armv5tel GNU/Linux
debian-armel:~/cachep# cat /proc/cpuinfo
Processor	: Feroceon 88FR131 rev 1 (v5l)
BogoMIPS	: 1196.03
Features	: swp half thumb fastmult edsp
CPU implementer	: 0x56
CPU architecture: 5TE
CPU variant	: 0x2
CPU part	: 0x131
CPU revision	: 1

Hardware	: Marvell OpenRD Base Board
Revision	: 0000
Serial		: 0000000000000000

[2]:
root at familiar:/var/tmp# uname -a
Linux familiar 2.6.23-dixe #58 PREEMPT Fri Dec 4 11:33:26 CET 2009
armv4tl unknown
root at familiar:/var/tmp# cat /proc/cpuinfo
Processor       : ARM920T rev 0 (v4l)
BogoMIPS        : 88.84
Features        : swp half thumb
CPU implementer : 0x41
CPU architecture: 4T
CPU variant     : 0x1
CPU part        : 0x920
CPU revision    : 0
Cache type      : write-back
Cache clean     : cp15 c7 ops
Cache lockdown  : format A
Cache format    : Harvard
I size          : 16384
I assoc         : 64
I line length   : 32
I sets          : 8
D size          : 16384
D assoc         : 64
D line length   : 32
D sets          : 8

Hardware        : Atmel AT91RM9200-DK
Revision        : 0000
Serial          : 0000000000000000


-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-15 11:01                                                       ` christian pellegrin
@ 2009-12-15 15:31                                                         ` christian pellegrin
  2009-12-15 17:18                                                           ` Russell King - ARM Linux
  2009-12-15 17:14                                                         ` Russell King - ARM Linux
  1 sibling, 1 reply; 71+ messages in thread
From: christian pellegrin @ 2009-12-15 15:31 UTC (permalink / raw)
  To: linux-arm-kernel

The patch here does what was described by Russell and seems to solve
the read/write and the MAP_PRIVATE in test cases like the one I
posted. As noted it may not be enough. If you don't think it's
complete crap I can try to check for other cases too by installing a
more complete system (X11, db4) and using ftrace to watch for which
flush_cache_* we actually use. Any suggestion is appreciated, it would
be a pity to have to turn off L2 cache.

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index d0d17b6..53ad00f 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -34,7 +34,7 @@ static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
  * Therefore those configurations which might call adjust_pte (those
  * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
-static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
+static int adjust_pte(struct vm_area_struct *vma, unsigned long
address, int update, int only_shared)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -65,7 +65,7 @@ static int adjust_pte(struct vm_area_struct *vma,
unsigned long address)
 	 * If this page isn't present, or is already setup to
 	 * fault (ie, is old), we can safely ignore any issues.
 	 */
-	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask) {
+	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask && update) {
 		unsigned long pfn = pte_pfn(entry);
 		flush_cache_page(vma, address, pfn);
 		outer_flush_range((pfn << PAGE_SHIFT),
@@ -74,7 +74,13 @@ static int adjust_pte(struct vm_area_struct *vma,
unsigned long address)
 		pte_val(entry) |= shared_pte_mask;
 		set_pte_at(vma->vm_mm, address, pte, entry);
 		flush_tlb_page(vma, address);
+		printk(KERN_INFO "Uncached vma %08x (addr %08lx flahs %08lx phy
%08x) from pid %d\n",
+		       (unsigned int) vma, vma->vm_start, vma->vm_flags,
+		       (unsigned int) (pfn << PAGE_SHIFT),
+		       current->pid);
 	}
+	if (only_shared && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask)
+	  ret = 0;
 	pte_unmap(pte);
 	return ret;

@@ -100,6 +106,9 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 	unsigned long offset;
 	pgoff_t pgoff;
 	int aliases = 0;
+#if defined(CONFIG_OUTER_CACHE) && defined(CONFIG_CPU_CACHE_VIVT)
+	int run;
+#endif

 	pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

@@ -109,6 +118,32 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 	 * cache coherency.
 	 */
 	flush_dcache_mmap_lock(mapping);
+#if defined(CONFIG_OUTER_CACHE) && defined(CONFIG_CPU_CACHE_VIVT)
+	/*
+	 * In the first run we just check if we have to make some
+	 * address space uncacheable cause of L1 VIVT. In the second
+	 * we check if there is an uncached map in other process.  If
+	 * one of the previous condition is true we proceed to make
+	 * *all* (both in current process VMA and that of others) of
+	 * them so. This should solve both cases of multiple shared
+	 * memories attached in the same process but not impact the
+	 * common case of just one mapping per process.
+	 */
+	for(run = 0; run < 3; run++) {
+		vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+			if ((mpnt->vm_mm != mm || mpnt == vma) && run == 0)
+				continue;
+			if (!(mpnt->vm_flags & VM_MAYSHARE) && run != 2) /* update all mappings */
+				continue;
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			aliases += adjust_pte(mpnt, mpnt->vm_start + offset,
+					      run == 2, /* update only on the last run */
+					      run == 1); /* on the second run catch shared in other procs */
+		}
+		if (aliases == 0 && run == 1)
+			break;
+	}
+#else
 	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
@@ -120,11 +155,12 @@ make_coherent(struct address_space *mapping,
struct vm_area_struct *vma, unsigne
 		if (!(mpnt->vm_flags & VM_MAYSHARE))
 			continue;
 		offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
-		aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+		aliases += adjust_pte(mpnt, mpnt->vm_start + offset, 1, 0);
 	}
+#endif
 	flush_dcache_mmap_unlock(mapping);
 	if (aliases)
-		adjust_pte(vma, addr);
+		adjust_pte(vma, addr, 1, 0);
 	else
 		flush_cache_page(vma, addr, pfn);
 }
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 7f294f3..b7c6986 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -209,6 +209,13 @@ void flush_dcache_page(struct page *page)
 			__flush_dcache_aliases(mapping, page);
 		else if (mapping)
 			__flush_icache_all();
+#ifdef CONFIG_OUTER_CACHE
+		{
+		  unsigned long pfn = page_to_pfn(page);
+		  outer_flush_range((pfn << PAGE_SHIFT),
+				    (pfn << PAGE_SHIFT) + PAGE_SIZE);
+		}
+#endif
 	}
 }
 EXPORT_SYMBOL(flush_dcache_page);



-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply related	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-15 11:01                                                       ` christian pellegrin
  2009-12-15 15:31                                                         ` christian pellegrin
@ 2009-12-15 17:14                                                         ` Russell King - ARM Linux
  2009-12-16 16:35                                                           ` christian pellegrin
  1 sibling, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-15 17:14 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Dec 15, 2009 at 12:01:56PM +0100, christian pellegrin wrote:
> Thanks for your answers, unfortunately it turned out to be quite easy
> to write a program [0] that shows exactly what Russell described. :-(

Try this:

- create a 4K file
- create three programs:
  1. repeatedly uses read() and write() to increment the first word of
     this file
  2. uses one shared mmap of this file to increment the 256th word (offset
     1024)
  3. uses two shared mmaps of this file, reading the 128th word from one
     shared mmap, reads the 128th word from the other, increments the
     first value and writes it to the 128th word in the second map,
     printing the two values read.  It also reads the first and 256th
     word and prints the result

This should be an adequate test of most of the important scenarios: the
third program should show an increasing number for both locations.

If it doesn't, then the value is being cached somewhere.  I suspect
what you'll find is that on Feroceon, the only values you will see
increment is the 128th value.  The remainder will be static.

That means multiple shared mmaps on Feroceon are incoherent with:
1. file updates via read/write.
2. other shared mappings in other processes.

And because of (1) I expect that a read() followed by another read() of
the same file offset as being updated by (3) will not show updates - so
it should be both lost write()s and stale data on read()s (which will
mean PIO write-outs - eg swap out, file update - will be affected.)

> int main(int argc, char *argv[])
> {
>   int fd, fd1;
>   char *s1, *s2, *p;
>   volatile char x;

As a note, you want s1, s2 and p to be volatile, otherwise the compiler
can cache the result.

>   fd = open("prova", O_RDWR);
>   assert(fd);
> 
>   fd1 = open("prova", O_RDWR);
>   assert(fd1);
> 
>   p = mmap(NULL, 1024, PROT_WRITE|PROT_READ, MAP_PRIVATE,
> 	   fd1, 0);
>   assert((long) p != -1);
> 
>   x = *p;
> 
>   s1 = mmap(NULL, 1024, PROT_WRITE|PROT_READ, MAP_SHARED,
> 	     fd, 0);
>   assert((long) s1 != -1);
>   s2 = mmap(NULL, 1024, PROT_WRITE|PROT_READ, MAP_SHARED,
> 	     fd, 0);
>   assert((long) s2 != -1);
> 
>   fprintf(stderr,"s1 %p s2 %p p %p\n", s1, s2, p);
>   fprintf(stderr, "1: s1 %d s2 %d p %d\n", s1[0], s2[0], p[0]);
> 
>   s1[0] = 'a';
>   s2[0] = 'b';
> 
>   msync(s1, 1024, MS_SYNC);
>   msync(s2, 1024, MS_SYNC);

These msyncs should not be required for data via one shared mmap to be
visible via another mmap.

>   assert(lseek(fd1, SEEK_SET, 0) == 0);
>   assert(read(fd1, &x, 1) == 1);
>   fprintf(stderr, "2: s1 %d s2 %d p %d x %d\n", s1[0], s2[0], p[0], x);
> 
>   return 0;
> }

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-15 15:31                                                         ` christian pellegrin
@ 2009-12-15 17:18                                                           ` Russell King - ARM Linux
  2009-12-16 14:08                                                             ` Ronen Shitrit
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-15 17:18 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Dec 15, 2009 at 04:31:35PM +0100, christian pellegrin wrote:
> The patch here does what was described by Russell and seems to solve
> the read/write and the MAP_PRIVATE in test cases like the one I
> posted. As noted it may not be enough.

This might be enough (but needs thorough testing).  However, adding
the requirement for L2 cache flushing in flush_dcache_page() because
you have an L2 cache is unfair on those L2's which don't suffer from
this problem.

I think we need to have a little more information about the behaviour
of the L2 cache so that we can decide how much flushing is required,
and where.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-15 17:18                                                           ` Russell King - ARM Linux
@ 2009-12-16 14:08                                                             ` Ronen Shitrit
  0 siblings, 0 replies; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-16 14:08 UTC (permalink / raw)
  To: linux-arm-kernel



-----Original Message-----
From: linux-arm-kernel-bounces@lists.infradead.org [mailto:linux-arm-kernel-bounces at lists.infradead.org] On Behalf Of Russell King - ARM Linux
Sent: Tuesday, December 15, 2009 7:19 PM
To: christian pellegrin
Cc: linux-arm-kernel at lists.infradead.org
Subject: Re: shared memory problem on ARM v5TE using threads

On Tue, Dec 15, 2009 at 04:31:35PM +0100, christian pellegrin wrote:
> The patch here does what was described by Russell and seems to solve
> the read/write and the MAP_PRIVATE in test cases like the one I
> posted. As noted it may not be enough.

This might be enough (but needs thorough testing).  However, adding
the requirement for L2 cache flushing in flush_dcache_page() because
you have an L2 cache is unfair on those L2's which don't suffer from
this problem.

I think we need to have a little more information about the behaviour
of the L2 cache so that we can decide how much flushing is required,
and where.
[Ronen Shitrit] What information do you need?


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel at lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-15 17:14                                                         ` Russell King - ARM Linux
@ 2009-12-16 16:35                                                           ` christian pellegrin
  2009-12-16 17:38                                                             ` christian pellegrin
  0 siblings, 1 reply; 71+ messages in thread
From: christian pellegrin @ 2009-12-16 16:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Dec 15, 2009 at 6:14 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> Try this:
>
> - create a 4K file
> - create three programs:
>  1. repeatedly uses read() and write() to increment the first word of
>     this file
>  2. uses one shared mmap of this file to increment the 256th word (offset
>     1024)
>  3. uses two shared mmaps of this file, reading the 128th word from one
>     shared mmap, reads the 128th word from the other, increments the
>     first value and writes it to the 128th word in the second map,
>     printing the two values read.  It also reads the first and 256th
>     word and prints the result
>
> This should be an adequate test of most of the important scenarios: the
> third program should show an increasing number for both locations.
>

I played a bit with the program implementing this idea. With an
unpatched 2.6.32-rc7 from orion git tree all the numbers are actually
incrementing. Of course it's impossible to catch an intermittent
problem with this test because maybe I'm just seeing one update out of
n. I had a similar feeling with my test program about read/write
consistency: sometimes the uncached mapping got the right result and
sometimes not (I did the test on an unloaded system via console, so I
really doubt I was implicitly flushing the cache). My guess is that
sometimes the data written through L2 hit the SDRAM. I double checked,
I have L2 working in WB and *not* WT. It's much easier to show the
consistency problem if we print the locations in process 2: now the
128th location *never* increments until I thrash the cache by
running a big program. Somehow read/write is still incrementing. With
the patch I proposed I don't see any problems.

I'm trying some more elaborate tests where just one case of
inconsistency will stop the counting.

-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-16 16:35                                                           ` christian pellegrin
@ 2009-12-16 17:38                                                             ` christian pellegrin
  2009-12-17  7:35                                                               ` Ronen Shitrit
  2009-12-18 20:22                                                               ` Nicolas Pitre
  0 siblings, 2 replies; 71+ messages in thread
From: christian pellegrin @ 2009-12-16 17:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Dec 16, 2009 at 5:35 PM, christian pellegrin <chripell@gmail.com> wrote:

>
> I'm trying some more elaborate tests where just one case of
> inconsistency will stop the counting.
>

Here is the program that implements Russell's ideas (at least I think
so) but is easier to use. By giving the parameter 1 or -1 you can test
different kinds of consistency issues (missing flush in r/w or
inconsistent cacheability of mappings). It is also quite fun to watch
with the buggy kernel on an idle system: it looks like every
couple of seconds the 256 KB L2 cache gets flushed anyway (so even on
the kernel without the patch every now and then you get some
progress). I had it running for tens of minutes on a patched kernel
without stops.

*** rt.sh:

#!/bin/sh
# Driver for the rt.c test below.  $1 is handed unchanged to every instance
# as its second argument ("dir", documented above as 1 or -1).

# Test binary built from rt.c.
RT=./rta

# Create the 4 KiB zero-filled file "rt" that every instance opens.
dd if=/dev/zero of=rt count=1 bs=4k

# Start the three roles in the background:
#   0 = lseek/read/write, 1 = one shared mapping, 2 = two shared mappings.
$RT 0 $1 &
$RT 1 $1 &
$RT 2 $1 &

***  rt.c:
/*
arm-none-linux-gnueabi-gcc -Wall -o rta rt.c
*/
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>

/* Report the failing call on stderr and terminate this test process. */
static void die(const char *what)
{
  perror(what);
  exit(1);
}

/* Position fd at the start of the idx-th unsigned int of the file. */
static void seek_word(int fd, unsigned int idx)
{
  off_t want = (off_t) (sizeof(unsigned int) * idx);

  if (lseek(fd, want, SEEK_SET) != want)
    die("lseek");
}

/* Store val in the idx-th word of the file using lseek() + write(). */
static void write_word(int fd, unsigned int idx, int val)
{
  seek_word(fd, idx);
  if (write(fd, &val, sizeof(val)) != (ssize_t) sizeof(val))
    die("write");
}

/* Fetch the idx-th word of the file using lseek() + read(). */
static int read_word(int fd, unsigned int idx)
{
  int val = 0;

  seek_word(fd, idx);
  if (read(fd, &val, sizeof(val)) != (ssize_t) sizeof(val))
    die("read");
  return val;
}

/*
 * Three-party token-passing test on the file "rt".
 *
 *   argv[1]  role: 0 = lseek/read/write, 1 = one shared mapping,
 *            anything else = two shared mappings of the same file.
 *   argv[2]  dir: the amount added at every step (meant to be 1 or -1); it
 *            also selects which word each role hands the token to.
 *
 * A role proceeds only when the word it watches has the residue it owns
 * modulo 3 (role 0: word 128 == 0, role 1: word 0 == 1, role 2: word 256
 * == 2).  It then prints the value, adds dir, and copies the result into
 * the word watched by the next role.  A role that never sees the store made
 * by the previous one simply stops printing.
 *
 * All file I/O is done outside assert(), so the test still performs it when
 * compiled with NDEBUG; a failing call ends the process with exit status 1.
 * The role loops never terminate on their own.
 */
int main(int argc, char *argv[])
{
  int fd, dir;
  volatile unsigned int *s1, *s2;
  int x;

  if (argc != 3) {
    printf("Usage: %s [0/1/2] [dir]\n", argv[0]);
    exit(1);
  }
  dir = atoi(argv[2]);

  fd = open("rt", O_RDWR);
  if (fd == -1)
    die("open");

  switch (atoi(argv[1])) {
  case 0:
    /* Seed the token: 3000000 is a multiple of 3, i.e. owned by this role. */
    write_word(fd, 128, 3000000);
    while (1) {
      do {
        x = read_word(fd, 128);
      } while (x % 3 != 0);
      printf("%d rw\n", x);
      x += dir;
      write_word(fd, 128, x);
      /* Hand over: word 0 (role 1) when counting up, word 256 (role 2)
         when counting down. */
      write_word(fd, dir > 0 ? 0 : 256, x);
    }
    break;

  case 1:
    s1 = mmap(NULL, 4096, PROT_WRITE|PROT_READ, MAP_SHARED,
              fd, 0);
    if (s1 == MAP_FAILED)
      die("mmap");
    while (1) {
      while (s1[0] % 3 != 1)
        ;                       /* spin until word 0 carries our residue */
      /* Cast keeps the original "%d" output while matching the argument type. */
      printf("%d 1map\n", (int) s1[0]);
      s1[0] += dir;
      if (dir > 0)
        s1[256] = s1[0];
      else
        s1[128] = s1[0];
    }
    break;

  default:
    /* Two shared mappings of the same page: reads go through s1, writes
       through s2. */
    s1 = mmap(NULL, 4096, PROT_WRITE|PROT_READ, MAP_SHARED,
              fd, 0);
    if (s1 == MAP_FAILED)
      die("mmap");
    s2 = mmap(NULL, 4096, PROT_WRITE|PROT_READ, MAP_SHARED,
              fd, 0);
    if (s2 == MAP_FAILED)
      die("mmap");
    while (1) {
      while (s1[256] % 3 != 2)
        ;                       /* spin until word 256 carries our residue */
      printf("%d 2map\n", (int) s1[256]);
      s2[256] += dir;
      if (dir > 0)
        s2[128] = s1[256];
      else
        s2[0] = s1[256];
    }
  }

  return 0;
}


-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-16 17:38                                                             ` christian pellegrin
@ 2009-12-17  7:35                                                               ` Ronen Shitrit
  2009-12-18 20:22                                                               ` Nicolas Pitre
  1 sibling, 0 replies; 71+ messages in thread
From: Ronen Shitrit @ 2009-12-17  7:35 UTC (permalink / raw)
  To: linux-arm-kernel



-----Original Message-----
From: christian pellegrin [mailto:chripell at gmail.com] 
Sent: Wednesday, December 16, 2009 7:39 PM
To: Russell King - ARM Linux
Cc: saeed bishara; Ronen Shitrit; linux-arm-kernel at lists.infradead.org; Nicolas Pitre
Subject: Re: shared memory problem on ARM v5TE using threads

On Wed, Dec 16, 2009 at 5:35 PM, christian pellegrin <chripell@gmail.com> wrote:

>
> I'm trying some more elaborate tests where just one case of
> inconsistency will stop the counting.
>

Here is the program that implements Russell's ideas (at least I think
so) but is easier to use. By giving the parameter 1 or -1 you can test
different kind of consistency issues (missing flush in r/w or
inconsistent mapping's cacheness). It is also quite fun to watch at
with the buggy kernel on an idle system: it looks like that every
couple of seconds the 256kb L2 cache get flushed anyway
[Ronen Shitrit] The L2 dirty line eviction is always a result of L1 line fill request, or a specific CP15 flush command.

(so even on
the kernel without the patch every now and then you get some
progress). I had it running for tens of minutes on a patched kernel
without stops.
[Ronen Shitrit] Nice :)

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-13 11:48                                       ` Ronen Shitrit
  2009-12-13 12:00                                         ` Russell King - ARM Linux
@ 2009-12-17 11:31                                         ` Heiko Schocher
  2009-12-18  8:08                                           ` christian pellegrin
  1 sibling, 1 reply; 71+ messages in thread
From: Heiko Schocher @ 2009-12-17 11:31 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Christian,

>On Tue, Dec 15, 2009 at 04:31:35PM +0100, christian pellegrin wrote:
>> The patch here does what was described by Russell and seems to solve
>> the read/write and the MAP_PRIVATE in test cases like the one I
>> posted. As noted it may not be enough.
>
> This might be enough (but needs thorough testing).  However, adding
> the requirement for L2 cache flushing in flush_dcache_page() because
> you have an L2 cache is unfair on those L2's which don't suffer from
> this problem.
>
> I think we need to have a little more information about the behaviour
> of the L2 cache so that we can decide how much flushing is required,
> and where.

FYI:
I tried your patch:
http://lists.infradead.org/pipermail/linux-arm-kernel/2009-December/006192.html

with the testprogramm, I posted here:
http://lists.infradead.org/pipermail/linux-arm-kernel/2009-December/005489.html

and on my board [1] it works fine [2] with your patch.

Another question comes to my mind:

As we have problems with shared memory when using more than one
mapping, do we have these problems also when using other "standard"
Linux IPC, like for example semaphores, queues, ... ?

bye,
Heiko

[1]
-bash-3.2# cat /proc/cpuinfo
Processor       : Feroceon 88FR131 rev 1 (v5l)
BogoMIPS        : 799.53
Features        : swp half thumb fastmult edsp
CPU implementer : 0x56
CPU architecture: 5TE
CPU variant     : 0x2
CPU part        : 0x131
CPU revision    : 1

Hardware        : Keymile SUEN3 Board
Revision        : 0000
Serial          : 0000000000000000
-bash-3.2#

[2] log from testprogramm with patch from christian:

-bash-3.2# cat shtest2.sh
#!/bin/sh
echo "Run shmtest2"
./shmtest2 write 1 &
./shmtest2 read 4 &
-bash-3.2# ./shtest2.sh
Run shmtest2
Write to in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Write Thread 0, addr:40961000
359: write new mesg: 0
Read from in_msg
Creating thread 0
Shared memory Id:0
attach shared mem:40961000
Start Read Thread addr:40961000
Creating thread 1
Shared memory Id:0
attach shared mem:41170000
Start Read Thread addr:41170000
Creating thread 2
Shared memory Id:0
attach shared mem:4197f000
Start Read Thread addr:4197f000
Creating thread 3
Shared memory Id:0
attach shared mem:4218e000
Start Read Thread addr:4218e000
-bash-3.2# 359: write new mesg: 0
361 Read from entry in_msg=1000, out_msg=1000, addr=40961000
361 Read from entry in_msg=2000, out_msg=1000, addr=41170000
361 Read from entry in_msg=3000, out_msg=1000, addr=4197f000
361 Read from entry in_msg=4000, out_msg=1000, addr=4218e000
359: write new mesg: 1000
361 Read from entry in_msg=5000, out_msg=2000, addr=40961000
361 Read from entry in_msg=6000, out_msg=2000, addr=41170000
361 Read from entry in_msg=7000, out_msg=2000, addr=4197f000
361 Read from entry in_msg=8000, out_msg=2000, addr=4218e000
359: write new mesg: 2000
361 Read from entry in_msg=9000, out_msg=3000, addr=40961000
361 Read from entry in_msg=10000, out_msg=3000, addr=41170000
361 Read from entry in_msg=11000, out_msg=3000, addr=4197f000
361 Read from entry in_msg=12000, out_msg=3000, addr=4218e000
359: write new mesg: 3000
[...]
349: write new mesg: 57000
350 Read from entry in_msg=230000, out_msg=58000, addr=4197f000
350 Read from entry in_msg=231000, out_msg=58000, addr=41170000
350 Read from entry in_msg=232000, out_msg=58000, addr=4218e000
350 Read from entry in_msg=233000, out_msg=58000, addr=40961000
349: write new mesg: 58000
350 Read from entry in_msg=234000, out_msg=59000, addr=4197f000
350 Read from entry in_msg=235000, out_msg=59000, addr=41170000
350 Read from entry in_msg=236000, out_msg=59000, addr=4218e000
350 Read from entry in_msg=237000, out_msg=59000, addr=40961000
350 Read from entry in_msg=237000, out_msg=59000, addr=4197f000
350 Read from entry in_msg=237000, out_msg=59000, addr=4218e000
350 Read from entry in_msg=237000, out_msg=59000, addr=41170000
All write threads finished, exit
All read threads finished, exit

-bash-3.2#

out_msg increments as expected!

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-17 11:31                                         ` Heiko Schocher
@ 2009-12-18  8:08                                           ` christian pellegrin
  0 siblings, 0 replies; 71+ messages in thread
From: christian pellegrin @ 2009-12-18  8:08 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Dec 17, 2009 at 12:31 PM, Heiko Schocher <hs@denx.de> wrote:
> Hello Christian,
>

Hi

> As we have problems with shared memory, if using more than one
> mapping, do we have these problems also when using other "standard"
> Linux IPC, like for example semaphores, queues, ... ?
>

AFAIK no. The problem you discovered is caused by the access of the
same physical memory area via a cached and an uncached mapping without
proper L2 cache flushing. The only way I know of to create an
uncached mapping in user space is to map the same physical memory
at different virtual addresses (thereby triggering the L1 VIVT cache
aliasing avoidance mechanism).

-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-13 12:00                                         ` Russell King - ARM Linux
  2009-12-13 12:06                                           ` Russell King - ARM Linux
@ 2009-12-18 18:45                                           ` Pavel Machek
  2009-12-18 19:00                                             ` Nicolas Pitre
  2009-12-18 19:16                                             ` Russell King - ARM Linux
  1 sibling, 2 replies; 71+ messages in thread
From: Pavel Machek @ 2009-12-18 18:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun 2009-12-13 12:00:33, Russell King - ARM Linux wrote:
> On Sun, Dec 13, 2009 at 01:48:48PM +0200, Ronen Shitrit wrote:
> > Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
> > - make_coherent won't use adjust_pte and leave only the regular flush.
> > - Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).
> 
> That doesn't work.
> 
> Well, with a VIVT L1, we flush the L1 on all MM switches anyway.
> Flushing it on any thread switch is not going to help that much.
> 
> The problem with shared mmaps is that if you have multiple within the
> same thread, it is required that they are _all_ coherent with respect
> to each other, whether or not a context switch has occurred.

But that's pretty unusual situation, right?

So what about...

a) flush L2 on context switch

b) disable L2 when a thread maps one physical address twice



-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 18:45                                           ` Pavel Machek
@ 2009-12-18 19:00                                             ` Nicolas Pitre
  2009-12-20 19:51                                               ` Pavel Machek
  2009-12-18 19:16                                             ` Russell King - ARM Linux
  1 sibling, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-18 19:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 18 Dec 2009, Pavel Machek wrote:

> On Sun 2009-12-13 12:00:33, Russell King - ARM Linux wrote:
> > On Sun, Dec 13, 2009 at 01:48:48PM +0200, Ronen Shitrit wrote:
> > > Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
> > > - make_coherent won't use adjust_pte and leave only the regular flush.
> > > - Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).
> > 
> > That doesn't work.
> > 
> > Well, with a VIVT L1, we flush the L1 on all MM switches anyway.
> > Flushing it on any thread switch is not going to help that much.
> > 
> > The problem with shared mmaps is that if you have multiple within the
> > same thread, it is required that they are _all_ coherent with respect
> > to each other, whether or not a context switch has occurred.
> 
> But that's pretty unusual situation, right?
> 
> So what about...
> 
> a) flush L2 on context switch

No.  This is too big a performance killer.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 18:45                                           ` Pavel Machek
  2009-12-18 19:00                                             ` Nicolas Pitre
@ 2009-12-18 19:16                                             ` Russell King - ARM Linux
  2009-12-20 19:56                                               ` Pavel Machek
  1 sibling, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-18 19:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 18, 2009 at 07:45:19PM +0100, Pavel Machek wrote:
> But that's pretty unusual situation, right?

It may be uncommon, but it's something that must work - not only is it
a quality of implementation issue, but it's also a data corruption issue.
Get it wrong and you silently corrupt files on your filesystems.

As I've said earlier in the thread, we know that db4 does this.  Not
fixing this means that we really don't care about any program which
uses db4, or data contained within a db4 database.

> So what about...
> 
> a) flush L2 on context switch

If you go to that extent, the system will probably perform better with
the L2 cache permanently disabled.

> b) disable L2 when thread has maps one physical address twice

Due to the way the L2 cache works, you have to disable L2 and flush it
when switching to the thread, and re-enable L2 when you switch away.
Merely flushing it when switching away won't work.

In order to disable or enable the L2 cache, the L1 cache must be
flushed and disabled - and that would have to be done with all IRQs
(including FIQs) disabled to ensure that it's done atomically.

I don't think this is a sane option.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-16 17:38                                                             ` christian pellegrin
  2009-12-17  7:35                                                               ` Ronen Shitrit
@ 2009-12-18 20:22                                                               ` Nicolas Pitre
  2009-12-18 20:44                                                                 ` Russell King - ARM Linux
  1 sibling, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-18 20:22 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 16 Dec 2009, christian pellegrin wrote:

> On Wed, Dec 16, 2009 at 5:35 PM, christian pellegrin <chripell@gmail.com> wrote:
> 
> >
> > I'm trying some more elaborate tests where just one case of
> > inconsistency will stop the counting.
> >
> 
> Here is the program that implements Russell's ideas (at least I think
> so) but is easier to use. By giving the parameter 1 or -1 you can test
> different kind of consistency issues (missing flush in r/w or
> inconsistent mapping's cacheness). It is also quite fun to watch
> with the buggy kernel on an idle system: it looks like every
> couple of seconds the 256kb L2 cache gets flushed anyway (so even on
> the kernel without the patch every now and then you get some
> progress). I had it running for tens of minutes on a patched kernel
> without stops.

Could you please repost your patch, adding CONFIG_CPU_CACHE_VIVT to the 
conditional code flushing the L2 cache in flush_dcache_page()?

I think this would be a good thing to have merged.  If I'm not mistaken, 
this patch appears to fix all identified coherency cases so far.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 20:22                                                               ` Nicolas Pitre
@ 2009-12-18 20:44                                                                 ` Russell King - ARM Linux
  2009-12-18 21:23                                                                   ` Nicolas Pitre
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-18 20:44 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 18, 2009 at 03:22:34PM -0500, Nicolas Pitre wrote:
> On Wed, 16 Dec 2009, christian pellegrin wrote:
> 
> > On Wed, Dec 16, 2009 at 5:35 PM, christian pellegrin <chripell@gmail.com> wrote:
> > 
> > >
> > > I'm trying some more elaborate tests where just one case of
> > > inconsistency will stop the counting.
> > >
> > 
> > Here is the program that implements Russell's ideas (at least I think
> > so) but is easier to use. By giving the parameter 1 or -1 you can test
> > different kind of consistency issues (missing flush in r/w or
> > inconsistent mapping's cacheness). It is also quite fun to watch at
> > with the buggy kernel on an idle system: it looks like that every
> > couple of seconds the 256kb L2 cache get flushed anyway (so even on
> > the kernel without the patch every now and then you get some
> > progress). I had it running for tens of minutes on a patched kernel
> > without stops.
> 
> Could you please repost your patch, adding CONFIG_CPU_CACHE_VIVT to the 
> conditional code flushing the L2 cache in flush_dcache_page()?

No, that's not sufficient as I pointed out earlier.  We really do not
want to be doing the L2 cache stuff on things which aren't Feroceon
until it's shown that others are affected.

The other annoying thing is that it's touching the same area of code
which is currently broken with highpte - and I have changes in the
pipeline for fixing that.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 20:44                                                                 ` Russell King - ARM Linux
@ 2009-12-18 21:23                                                                   ` Nicolas Pitre
  2009-12-18 21:57                                                                     ` Russell King - ARM Linux
  0 siblings, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-18 21:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 18 Dec 2009, Russell King - ARM Linux wrote:

> On Fri, Dec 18, 2009 at 03:22:34PM -0500, Nicolas Pitre wrote:
> > Could you please repost your patch, adding CONFIG_CPU_CACHE_VIVT to the 
> > conditional code flushing the L2 cache in flush_dcache_page()?
> 
> No, that's not sufficient as I pointed out earlier.  We really do not
> want to be doing the L2 cache stuff on things which aren't Feroceon
> until it's shown that others are affected.

Aren't we saying the same thing?

As I said earlier already, this is concerning only L2 equipped VIVT 
processor models.  There are very few of those in existence, namely some 
Feroceons and XSC3s.

On both Feroceon and XSC3, there used to be some issues with DB4 when 
creating two shared writable mappings in the same process.  This was 
fixed in commit 08e445bd.  Both processors used to fail and both were 
happy with the change.  And according to the tests I've made and the 
conclusion I recorded in that commit, none of those processors could
support L1-uncacheable, L2-cacheable mappings.

This didn't cover all cases but we apparently have a solution for those 
now. So by surrounding the code with #if defined(CONFIG_CPU_CACHE_VIVT) 
&& defined(CONFIG_OUTER_CACHE) we actually affect only the same 
processor models i.e. Feroceon (proven to need it) and XSC3 (unproven).

So, even if XSC3 is included while it might potentially not need it, I'd 
cover it anyway given past track records with similar issues to be on 
the safe side, at least until someone with the hardware is able to 
confirm the fix is not needed there (and I doubt it will ever happen).  
We're talking about some seriously rare occurrences anyway, so it is
not as if this would impose a significant, useless, performance-killing
overhead on an XSC3.

> The other annoying thing is that it's touching the same area of code
> which is currently broken with highpte - and I have changes in the
> pipeline for fixing that.

That's a valid concern.  In which case I'd suggest that Christian waits 
until your changes are visible somewhere for him to rebase his patch on 
top and retest it before reposting.  The current patch (with my 
suggested modification) might be worth considering for linux-stable 
nevertheless.


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 21:23                                                                   ` Nicolas Pitre
@ 2009-12-18 21:57                                                                     ` Russell King - ARM Linux
  2009-12-19 11:24                                                                       ` christian pellegrin
  0 siblings, 1 reply; 71+ messages in thread
From: Russell King - ARM Linux @ 2009-12-18 21:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 18, 2009 at 04:23:30PM -0500, Nicolas Pitre wrote:
> On Fri, 18 Dec 2009, Russell King - ARM Linux wrote:
> 
> > On Fri, Dec 18, 2009 at 03:22:34PM -0500, Nicolas Pitre wrote:
> > > Could you please repost your patch, adding CONFIG_CPU_CACHE_VIVT to the 
> > > conditional code flushing the L2 cache in flush_dcache_page()?
> > 
> > No, that's not sufficient as I pointed out earlier.  We really do not
> > want to be doing the L2 cache stuff on things which aren't Feroceon
> > until it's shown that others are affected.
> 
> Aren't we saying the same thing?

Not quite.  You're saying "all VIVT + outer cache will have this fix".
I'm saying "those which have the problem should have the fix".

I believe treating them as being the same is not the right approach.
At the moment, they're the same group of processors, but that doesn't
mean it stays that way, and should it change, finding such hidden
conditions isn't going to be easy.

So I'd like to see something other than CONFIG_CPU_CACHE_VIVT+
CONFIG_OUTER_CACHE in the code - instead, make a new config symbol
which is enabled when both are currently selected, and put a useful
help text with it so people understand what it's about.

> > The other annoying thing is that it's touching the same area of code
> > which is currently broken with highpte - and I have changes in the
> > pipeline for fixing that.
> 
> That's a valid concern.  In which case I'd suggest that Christian waits 
> until your changes are visible somewhere for him to rebase his patch on 
> top and retest it before reposting.  The current patch (with my 
> suggested modification) might be worth considering for linux-stable 
> nevertheless.

It'll be a while before my patches are ready - they touch every other
architecture in the kernel because we need access to the kmapped page
table to modify the PTE...

http://ftp.arm.linux.org.uk/git/gitweb.cgi?p=linux-2.6-arm.git;a=shortlog;h=refs/heads/umc

GIT:
http://ftp.arm.linux.org.uk/pub/linux/arm/kernel/git-cur/linux-2.6-arm.git umc

(or something like that.)

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 21:57                                                                     ` Russell King - ARM Linux
@ 2009-12-19 11:24                                                                       ` christian pellegrin
  2009-12-19 11:27                                                                         ` [PATCH] Fix coherency problems on ARM v5 with L2 PIPT cache Christian Pellegrin
  0 siblings, 1 reply; 71+ messages in thread
From: christian pellegrin @ 2009-12-19 11:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 18, 2009 at 10:57 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Fri, Dec 18, 2009 at 04:23:30PM -0500, Nicolas Pitre wrote:
>> That's a valid concern.  In which case I'd suggest that Christian waits
>> until your changes are visible somewhere for him to rebase his patch on
>> top and retest it before reposting.  The current patch (with my
>> suggested modification) might be worth considering for linux-stable
>> nevertheless.
>
> It'll be a while before my patches are ready - they touch every other
> architecture in the kernel because we need access to the kmapped page
> table to modify the PTE...
>
> http://ftp.arm.linux.org.uk/git/gitweb.cgi?p=linux-2.6-arm.git;a=shortlog;h=refs/heads/umc
>
> GIT:
> http://ftp.arm.linux.org.uk/pub/linux/arm/kernel/git-cur/linux-2.6-arm.git umc
>

I'm going to send a proper patch (at least by checkpatch.pl
standards) in response to this mail in case someone cares to add it to
their trees. I'm cloning the mentioned git repository and will try to
do some testing on the umc branch. The modifications seem important, so
one more tester won't hurt, I guess.


-- 
Christian Pellegrin, see http://www.evolware.org/chri/
"Real Programmers don't play tennis, or any other sport which requires
you to change clothes. Mountain climbing is OK, and Real Programmers
wear their climbing boots to work in case a mountain should suddenly
spring up in the middle of the computer room."

^ permalink raw reply	[flat|nested] 71+ messages in thread

* [PATCH] Fix coherency problems on ARM v5 with L2 PIPT cache.
  2009-12-19 11:24                                                                       ` christian pellegrin
@ 2009-12-19 11:27                                                                         ` Christian Pellegrin
  0 siblings, 0 replies; 71+ messages in thread
From: Christian Pellegrin @ 2009-12-19 11:27 UTC (permalink / raw)
  To: linux-arm-kernel

This patch fixes coherency problems on ARM systems that have a
VIVT L1 cache and a PIPT L2 one. The modifications to fault-armv.c
ensure that we don't mix cached and uncached mappings of the same
physical memory area after we made a mapping uncached to prevent
problems with L1 aliasing. The modifications to flush.c ensure
coherency with read/write because kernel PIO memory access is
always cached.

Signed-off-by: Christian Pellegrin <chripell@fsfe.org>
---
 arch/arm/mm/Kconfig      |    8 ++++++
 arch/arm/mm/fault-armv.c |   55 ++++++++++++++++++++++++++++++++++++++++++---
 arch/arm/mm/flush.c      |    7 ++++++
 3 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 9264d81..2eacf29 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -776,3 +776,11 @@ config ARM_L1_CACHE_SHIFT
 	int
 	default 6 if ARCH_OMAP3
 	default 5
+
+config ARM_ARMV5_L2_CACHE_COHERENCY_FIX
+       bool "Enable L2 coherency problem fix on ARMv5"
+       depends on OUTER_CACHE && CPU_CACHE_VIVT
+       default y
+       help
+         Fixes coherency problems on ARM systems that have
+	 a non-disableable VIVT L1 cache and a PIPT L2 one.
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index d0d17b6..bf6d98c 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -34,7 +34,8 @@ static unsigned long shared_pte_mask = L_PTE_MT_BUFFERABLE;
  * Therefore those configurations which might call adjust_pte (those
  * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
-static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
+static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
+		      int update, int only_shared)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -65,7 +66,9 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 	 * If this page isn't present, or is already setup to
 	 * fault (ie, is old), we can safely ignore any issues.
 	 */
-	if (ret && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask) {
+	if (ret &&
+	    (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask &&
+	    update) {
 		unsigned long pfn = pte_pfn(entry);
 		flush_cache_page(vma, address, pfn);
 		outer_flush_range((pfn << PAGE_SHIFT),
@@ -74,7 +77,14 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 		pte_val(entry) |= shared_pte_mask;
 		set_pte_at(vma->vm_mm, address, pte, entry);
 		flush_tlb_page(vma, address);
+		printk(KERN_DEBUG "Uncached vma %08x "
+		       "(addr %08lx flags %08lx phy %08x) from pid %d\n",
+		       (unsigned int) vma, vma->vm_start, vma->vm_flags,
+		       (unsigned int) (pfn << PAGE_SHIFT),
+		       current->pid);
 	}
+	if (only_shared && (pte_val(entry) & L_PTE_MT_MASK) != shared_pte_mask)
+		ret = 0;
 	pte_unmap(pte);
 	return ret;
 
@@ -100,6 +110,9 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, unsigne
 	unsigned long offset;
 	pgoff_t pgoff;
 	int aliases = 0;
+#ifdef CONFIG_ARM_ARMV5_L2_CACHE_COHERENCY_FIX
+	int run;
+#endif
 
 	pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
 
@@ -109,6 +122,39 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, unsigne
 	 * cache coherency.
 	 */
 	flush_dcache_mmap_lock(mapping);
+#ifdef CONFIG_ARM_ARMV5_L2_CACHE_COHERENCY_FIX
+	/*
+	 * In the first run we just check whether some mapping has to
+	 * be made uncacheable because of L1 VIVT aliasing. In the second
+	 * we check if there is an uncached map in other processes.  If
+	 * either of these conditions is true, a third run makes
+	 * *all* mappings (both in the current process and in others)
+	 * uncached. This should solve both cases of multiple shared
+	 * memories attached in the same process but not impact the
+	 * common case of just one mapping per process.
+	 */
+	for (run = 0; run < 3; run++) {
+		vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap,
+				      pgoff, pgoff) {
+			if ((mpnt->vm_mm != mm || mpnt == vma) && run == 0)
+				continue;
+			if (!(mpnt->vm_flags & VM_MAYSHARE) &&
+			    run != 2) /* update all mappings */
+				continue;
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			aliases += adjust_pte(mpnt, mpnt->vm_start + offset,
+					      /* update only on the last run */
+					      run == 2,
+					      /*
+					       * on the second run
+					       * catch shared in other procs
+					       */
+					      run == 1);
+		}
+		if (aliases == 0 && run == 1)
+			break;
+	}
+#else
 	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
@@ -120,11 +166,12 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, unsigne
 		if (!(mpnt->vm_flags & VM_MAYSHARE))
 			continue;
 		offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
-		aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+		aliases += adjust_pte(mpnt, mpnt->vm_start + offset, 1, 0);
 	}
+#endif
 	flush_dcache_mmap_unlock(mapping);
 	if (aliases)
-		adjust_pte(vma, addr);
+		adjust_pte(vma, addr, 1, 0);
 	else
 		flush_cache_page(vma, addr, pfn);
 }
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 7f294f3..779a7f9 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -209,6 +209,13 @@ void flush_dcache_page(struct page *page)
 			__flush_dcache_aliases(mapping, page);
 		else if (mapping)
 			__flush_icache_all();
+#ifdef CONFIG_ARM_ARMV5_L2_CACHE_COHERENCY_FIX
+		{
+		  unsigned long pfn = page_to_pfn(page);
+		  outer_flush_range((pfn << PAGE_SHIFT),
+				    (pfn << PAGE_SHIFT) + PAGE_SIZE);
+		}
+#endif
 	}
 }
 EXPORT_SYMBOL(flush_dcache_page);
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 19:00                                             ` Nicolas Pitre
@ 2009-12-20 19:51                                               ` Pavel Machek
  2009-12-20 22:32                                                 ` Nicolas Pitre
  0 siblings, 1 reply; 71+ messages in thread
From: Pavel Machek @ 2009-12-20 19:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri 2009-12-18 14:00:13, Nicolas Pitre wrote:
> On Fri, 18 Dec 2009, Pavel Machek wrote:
> 
> > On Sun 2009-12-13 12:00:33, Russell King - ARM Linux wrote:
> > > On Sun, Dec 13, 2009 at 01:48:48PM +0200, Ronen Shitrit wrote:
> > > > Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
> > > > - make_coherent won't use adjust_pte and leave only the regular flush.
> > > > - Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).
> > > 
> > > That doesn't work.
> > > 
> > > Well, with a VIVT L1, we flush the L1 on all MM switches anyway.
> > > Flushing it on any thread switch is not going to help that much.
> > > 
> > > The problem with shared mmaps is that if you have multiple within the
> > > same thread, it is required that they are _all_ coherent with respect
> > > to each other, whether or not a context switch has occurred.
> > 
> > But that's pretty unusual situation, right?
> > 
> > So what about...
> > 
> > a) flush L2 on context switch
> 
> No.  This is too big a performance killer.

Eek?

Alternative is "disable L2". I guess "use L2, but flush it
occasionally" is still better than "disable L2".

Of course, it should be measured. It looks like hardware already is
braindead, so maybe logic does not work there.

(And it may depend if you want max throughput or min latency).
									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-18 19:16                                             ` Russell King - ARM Linux
@ 2009-12-20 19:56                                               ` Pavel Machek
  0 siblings, 0 replies; 71+ messages in thread
From: Pavel Machek @ 2009-12-20 19:56 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri 2009-12-18 19:16:26, Russell King - ARM Linux wrote:
> On Fri, Dec 18, 2009 at 07:45:19PM +0100, Pavel Machek wrote:
> > But that's pretty unusual situation, right?
> 
> It may be uncommon, but it's something that must work - not only is it
> a quality of implementation issue, but its also a data corruption issue.
> Get it wrong and you silently corrupt files on your filesystems.
> 
> As I've said earlier in the thread, we know that db4 does this.  Not
> fixing this means that we really don't care about any program which
> uses db4, or data contained within a db4 database.

I'm not trying to argue it does not need to be fixed; I'm trying to
say that performance penalty in unusual case may be ok.

> > So what about...
> > 
> > a) flush L2 on context switch
> 
> If you go to that extent, the system will probably perform better with
> the L2 cache permanently disabled.

Are you sure? Context switches are not that common (3 times a second
in number-crunching cases?) and L2 cache should help performance quite
a lot. 

> > b) disable L2 when thread has maps one physical address twice
> 
> Due to the way the L2 cache works, you have to disable L2 and flush it
> when switching to the thread, and re-enable L2 when you switch away.
> Merely flushing it when switching away won't work.
> 
> In order to disable or enable the L2 cache, the L1 cache must be
> flushed and disabled - and that would have to be done with all IRQs
> (including FIQs) disabled to ensure that its done atomically.
> 
> I don't think this is a sane option.

Well, it sounds slow and it will not be easy to implement. It really
depends how much L2 cache helps on those systems. (I'd guess a PC with
L2 disabled would run at half speed. If the performance penalty for L2
being disabled is similar on those systems, extra complexity may be worth
it.)

So I guess someone with affected system needs to do some benchmarking.
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-20 19:51                                               ` Pavel Machek
@ 2009-12-20 22:32                                                 ` Nicolas Pitre
  2009-12-21  7:40                                                   ` Pavel Machek
  0 siblings, 1 reply; 71+ messages in thread
From: Nicolas Pitre @ 2009-12-20 22:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun, 20 Dec 2009, Pavel Machek wrote:

> On Fri 2009-12-18 14:00:13, Nicolas Pitre wrote:
> > On Fri, 18 Dec 2009, Pavel Machek wrote:
> > 
> > > On Sun 2009-12-13 12:00:33, Russell King - ARM Linux wrote:
> > > > On Sun, Dec 13, 2009 at 01:48:48PM +0200, Ronen Shitrit wrote:
> > > > > Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
> > > > > - make_coherent won't use adjust_pte and leave only the regular flush.
> > > > > - Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).
> > > > 
> > > > That doesn't work.
> > > > 
> > > > Well, with a VIVT L1, we flush the L1 on all MM switches anyway.
> > > > Flushing it on any thread switch is not going to help that much.
> > > > 
> > > > The problem with shared mmaps is that if you have multiple within the
> > > > same thread, it is required that they are _all_ coherent with respect
> > > > to each other, whether or not a context switch has occurred.
> > > 
> > > But that's pretty unusual situation, right?
> > > 
> > > So what about...
> > > 
> > > a) flush L2 on context switch
> > 
> > No.  This is too big a performance killer.
> 
> Eek?
> 
> Alternative is "disable L2". I guess "use L2, but flush it
> occasionally" is still better than "disable L2".
> 
> Of course, it should be measured. It looks like hardware already is
> braindead, so maybe logic does not work there.
> 
> (And it may depend if you want max throughput or min latency).

Do you own such hardware?

If no, why do you care?

Do you really have something constructive to add to this thread besides 
"hardware is braindead" or other worthless speculations about an issue
about which you apparently don't know a whit?

For your information, a patch has been proposed already which does not 
imply disabling L2 nor flushing it on every context switch.  Would you
care to familiarize yourself with it before commenting further please?


Nicolas

^ permalink raw reply	[flat|nested] 71+ messages in thread

* shared memory problem on ARM v5TE using threads
  2009-12-20 22:32                                                 ` Nicolas Pitre
@ 2009-12-21  7:40                                                   ` Pavel Machek
  0 siblings, 0 replies; 71+ messages in thread
From: Pavel Machek @ 2009-12-21  7:40 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun 2009-12-20 17:32:08, Nicolas Pitre wrote:
> On Sun, 20 Dec 2009, Pavel Machek wrote:
> 
> > On Fri 2009-12-18 14:00:13, Nicolas Pitre wrote:
> > > On Fri, 18 Dec 2009, Pavel Machek wrote:
> > > 
> > > > On Sun 2009-12-13 12:00:33, Russell King - ARM Linux wrote:
> > > > > On Sun, Dec 13, 2009 at 01:48:48PM +0200, Ronen Shitrit wrote:
> > > > > > Another idea is to change the shared mapping handling, in case of vivt with pipt L2, so it won't remap the shared area as non-cacheable:
> > > > > > - make_coherent won't use adjust_pte and leave only the regular flush.
> > > > > > - Flush L1 for all context switches, also for the case that the new process is using same mm (thread context switch).
> > > > > 
> > > > > That doesn't work.
> > > > > 
> > > > > Well, with a VIVT L1, we flush the L1 on all MM switches anyway.
> > > > > Flushing it on any thread switch is not going to help that much.
> > > > > 
> > > > > The problem with shared mmaps is that if you have multiple within the
> > > > > same thread, it is required that they are _all_ coherent with respect
> > > > > to each other, whether or not a context switch has occurred.
> > > > 
> > > > > But that's a pretty unusual situation, right?
> > > > 
> > > > So what about...
> > > > 
> > > > a) flush L2 on context switch
> > > 
> > > No.  This is too big a performance killer.
> > 
> > Eek?
> > 
> > Alternative is "disable L2". I guess "use L2, but flush it
> > occasionally" is still better than "disable L2".
> > 
> > Of course, it should be measured. It looks like hardware already is
> > braindead, so maybe logic does not work there.
> > 
> > (And it may depend if you want max throughput or min latency).
> 
> Do you own such hardware?

No.

> If no, why do you care?

Curiosity? You spreading statements that seem strange? Me wanting to
understand VIVT caches?

> For your information, a patch has been proposed already which does not 
> imply disabling L2 nor flushing it on every context switch.  Would you 
> care to familiarize yourself with it before commenting further please?

Really? The patch seemed to solve cached vs. uncached accesses, but
did not seem to solve two (cached) mappings of the same file.
									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 71+ messages in thread

end of thread, other threads:[~2009-12-21  7:40 UTC | newest]

Thread overview: 71+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-12-04 11:23 shared memory problem on ARM v5TE using threads Heiko Schocher
2009-12-04 12:26 ` Joerg Wagner
2009-12-04 13:13 ` Russell King - ARM Linux
2009-12-04 13:42   ` Heiko Schocher
2009-12-04 15:42     ` Russell King - ARM Linux
2009-12-04 15:58       ` Heiko Schocher
2009-12-04 16:38         ` Russell King - ARM Linux
2009-12-04 16:59           ` Russell King - ARM Linux
2009-12-04 17:53           ` Heiko Schocher
2009-12-04 19:13             ` Russell King - ARM Linux
2009-12-04 19:35               ` Heiko Schocher
2009-12-06 13:53                 ` Ronen Shitrit
2009-12-06 14:16                   ` Russell King - ARM Linux
2009-12-07  7:54                     ` Ronen Shitrit
2009-12-07  8:33                     ` Heiko Schocher
2009-12-07 11:31                     ` saeed bishara
2009-12-07 11:42                       ` Russell King - ARM Linux
2009-12-07 12:16                         ` Ronen Shitrit
2009-12-07 12:27                           ` Heiko Schocher
2009-12-07 12:42                             ` Ronen Shitrit
2009-12-07 15:24                               ` Nicolas Pitre
2009-12-07 12:24                         ` Heiko Schocher
2009-12-07 12:55                           ` Ronen Shitrit
2009-12-07 14:52                             ` Russell King - ARM Linux
2009-12-07 15:37                               ` Nicolas Pitre
2009-12-07 17:05                                 ` Russell King - ARM Linux
2009-12-07 17:33                                   ` Nicolas Pitre
2009-12-07 17:56                                     ` Russell King - ARM Linux
2009-12-13 11:48                                       ` Ronen Shitrit
2009-12-13 12:00                                         ` Russell King - ARM Linux
2009-12-13 12:06                                           ` Russell King - ARM Linux
2009-12-13 15:42                                             ` Ronen Shitrit
2009-12-14 13:13                                             ` christian pellegrin
2009-12-14 14:46                                               ` Ronen Shitrit
2009-12-14 17:48                                                 ` christian pellegrin
2009-12-14 20:14                                                   ` Nicolas Pitre
2009-12-15  7:50                                                     ` saeed bishara
2009-12-15 11:01                                                       ` christian pellegrin
2009-12-15 15:31                                                         ` christian pellegrin
2009-12-15 17:18                                                           ` Russell King - ARM Linux
2009-12-16 14:08                                                             ` Ronen Shitrit
2009-12-15 17:14                                                         ` Russell King - ARM Linux
2009-12-16 16:35                                                           ` christian pellegrin
2009-12-16 17:38                                                             ` christian pellegrin
2009-12-17  7:35                                                               ` Ronen Shitrit
2009-12-18 20:22                                                               ` Nicolas Pitre
2009-12-18 20:44                                                                 ` Russell King - ARM Linux
2009-12-18 21:23                                                                   ` Nicolas Pitre
2009-12-18 21:57                                                                     ` Russell King - ARM Linux
2009-12-19 11:24                                                                       ` christian pellegrin
2009-12-19 11:27                                                                         ` [PATCH] Fix coherency problems on ARM v5 with L2 PIPT cache Christian Pellegrin
2009-12-14 19:59                                               ` shared memory problem on ARM v5TE using threads Nicolas Pitre
2009-12-15 10:33                                                 ` christian pellegrin
2009-12-18 18:45                                           ` Pavel Machek
2009-12-18 19:00                                             ` Nicolas Pitre
2009-12-20 19:51                                               ` Pavel Machek
2009-12-20 22:32                                                 ` Nicolas Pitre
2009-12-21  7:40                                                   ` Pavel Machek
2009-12-18 19:16                                             ` Russell King - ARM Linux
2009-12-20 19:56                                               ` Pavel Machek
2009-12-17 11:31                                         ` Heiko Schocher
2009-12-18  8:08                                           ` christian pellegrin
2009-12-07 15:40                               ` Russell King - ARM Linux
2009-12-07 15:57                                 ` Nicolas Pitre
2009-12-07 16:06                                   ` Ronen Shitrit
2009-12-07 17:17                                   ` Russell King - ARM Linux
2009-12-04 17:25 ` Nicolas Pitre
2009-12-04 17:31   ` Russell King - ARM Linux
2009-12-04 17:47   ` Heiko Schocher
2009-12-04 17:56     ` Nicolas Pitre
2009-12-04 19:33       ` Heiko Schocher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.