* crypto accelerator driver problems
From: Hamid Nassiby @ 2010-12-19 12:58 UTC
  To: linux-crypto

Hi All,

In a research project, we have developed a crypto accelerator based on the
Xilinx Virtex5 FPGA family. It is connected to the PC through a PCI-Express
slot and is used by IPSec to offload crypto processing from the CPU. The
accelerator only provides the AES and DES3_EDE algorithms, and I am
responsible for its driver. I based much of the driver on geode_aes.c, found
in the "drivers/crypto" subdirectory of the kernel source tree. Both
algorithms are registered as blkciphers providing a CBC wrapper, "cbc(aes)",
just like the one registered in geode_aes. Now, after months of work, the
accelerator is ready (correctness of the hardware operation has been verified
by direct crypto tests, not by IPSec), and it is time for the driver to give
IPSec access to the accelerator. On a first try I could get "ping" through
the IPsec tunnel. One end of the tunnel is equipped with our accelerator and
the other end uses the kernel's native IPSec with the built-in AES and
DES3_EDE algorithms. Now I am faced with two problems:

1. Ping stops getting replies for packet sizes greater than 1426 bytes
(ping dest_ip -s 1427). I guessed it might be an MTU problem, but reducing
the MTU with "ifconfig eth1 mtu xxx" or
"echo 1 > /proc/sys/net/ipv4/ip_no_pmtu_disc"
does not solve it. Also, when I ping each tunnel end from the other end
simultaneously with "ping other_node_ip -i 0.001", the kernel hangs
completely.

2. Iperf problem. When I try to measure the throughput of the IPSec gateway
equipped with our accelerator (AES-MD5) using iperf in TCP mode, the kernel
hangs such that sometimes even the "Magic SysRq key" does not respond! So I
have not been able to trace the problem at all. Using iperf in UDP mode
works, but I get "UDP bad checksum" in the 'dmesg' output at the other end of
the tunnel (native IPSec with built-in kernel algorithms).

The two gateways are connected by a cross cable, with no router or switch
between them to cause MTU problems. pcrypt is not used in this test, and
booting the kernel with nosmp (so no fear of thread contention) does not
change the situation.

So I ask for your help in solving this problem. Below are the parts of the
driver that differ from geode_aes.c and might give useful information; if
required, I'll post the whole driver.
------------------------------------------------------------

static struct crypto_alg mydriver_cbc_alg = {
	.cra_name		= "cbc(aes)",
	.cra_driver_name	= "cbc-aes-mydriver",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
				  CRYPTO_ALG_NEED_FALLBACK,
	.cra_init		= fallback_init_blk,
	.cra_exit		= fallback_exit_blk,
	.cra_blocksize		= AES_MIN_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct mydriver_aes_op),
	.cra_alignmask		= 15,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(mydriver_cbc_alg.cra_list),
	.cra_u			= {
		.blkcipher	= {
			.min_keysize	= AES_MIN_KEY_SIZE,
			.max_keysize	= AES_MIN_KEY_SIZE,
			.setkey		= mydriver_setkey_blk,
			.encrypt	= mydriver_cbc_encrypt,
			.decrypt	= mydriver_cbc_decrypt,
			.ivsize		= AES_IV_LENGTH,
		}
	}
};
//---------------
static int
mydriver_cbc_encrypt(struct blkcipher_desc *desc,
		 struct scatterlist *dst, struct scatterlist *src,
		 unsigned int nbytes)
{
	struct mydriver_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
	struct blkcipher_walk walk;
	int err, ret;

	if (unlikely(op->keylen != AES_KEYSIZE_128))
		return fallback_blk_enc(desc, dst, src, nbytes);

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);
	op->iv = walk.iv;

	while ((nbytes = walk.nbytes)) {
		op->src = walk.src.virt.addr;
		op->dst = walk.dst.virt.addr;
		op->mode = AES_MODE_CBC;
		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
		op->dir = AES_DIR_ENCRYPT;
		//ret = mydriver_aes_crypt(op);
		ret = mydriver_transform(op, 0);
		nbytes -= ret;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}
/*--------- mydriver_transform builds a buffer containing key, IV and data,
plus an additional header required by our accelerator, writes the buffer to
the accelerator by DMA and then reads the response back from the hardware. */

static inline int mydriver_transform(struct mydriver_aes_op *op, int alg)
{
	int req_len, err;
	u8 *req_buf = NULL, *res_buf = NULL;
	alg_operation operation;
	u32 my_req_id;

	if (op->len == 0)
		return 0;

	if ((op->dir == AES_DIR_ENCRYPT) || (op->dir == DES3_DIR_ENCRYPT)) {
		operation = SH_ENCRYPT;
		/* This ID is put into our packet and is checked by each
		   thread when the hardware response is ready, to see
		   whether the packet is its own. */
		my_req_id = smp_processor_id();
	} else {
		operation = SH_DECRYPT;
		/* Uniqueness of the ID does not solve the problem
		   described in this mail :( */
		my_req_id = smp_processor_id() + 64;
	}

	err = create_request(alg, op->mode, operation, htonl(my_req_id),
			     op->key, op->iv, op->src, op->len,
			     &req_buf, &req_len);
	if (err) {
		printk(KERN_EMERG "mydriver_transform : Error create_request : "
		       "errcode = %d\n", err);
		//goto error;
	}

	err = write_request(req_buf, req_len);
	if (err) {
		printk(KERN_EMERG "mydriver_transform : Error write_request : "
		       "errcode = %d\n", err);
		//goto error;
	}
	kfree(req_buf);
	req_buf = NULL;

	err = read_response(&res_buf, /*local_hdr.Length*/my_req_id);

	memcpy(op->dst, res_buf + sizeof(struct response_hdr), op->len);

	kfree(res_buf);
	res_buf = NULL;
	return op->len;
}
//-----------
/* create_request, which builds the packet for mydriver_transform */
static inline int create_request(int alg, char mode, char enc_dec,
		  u32 request_id,
		  char *key, char *iv, char *data, int datalen,
		  u8 **outbuf, int *outlen)
{
	int req_len, n_padding, keylen, blocklen, algid;
	struct request_hdr *p_hdr;
	char *ptr;

	if (alg == 0) { //AES algorithm
		keylen = 16;
		blocklen = 16;
		algid = 4;
	} else if (alg == 1) { //DES3 algorithm
		keylen = 24;
		blocklen = 8;
		algid = 3;
	}

	req_len = sizeof(struct request_hdr) + keylen;
	if (keylen != 0 && keylen % 16 == 0)
		req_len += 8; // so the request packet is 128-bit aligned
	if (mode == SHAMS_CBC)
		req_len += blocklen; // for the IV

	// pad the data to a multiple of the cipher block length
	n_padding = (blocklen - (datalen % blocklen)) % blocklen;

	req_len += (n_padding + datalen);
	*outbuf = kmalloc(req_len, GFP_ATOMIC);
	p_hdr = (struct request_hdr *) *outbuf;
	*outlen = p_hdr->Length = req_len;

	p_hdr->request_id = request_id;
	p_hdr->AlgID_Mode_EncDec = (enc_dec << 15) | (mode << 12) | algid;
	// Fill in the key
	ptr = *outbuf + sizeof(struct request_hdr);
	memcpy(ptr, key, keylen);
	ptr += keylen;
	if (keylen != 0 && keylen % 16 == 0) {
		memset(ptr, 0, 8);
		ptr += 8;
	}
	// Fill in the IV
	if (mode == SHAMS_CBC) {
		memcpy(ptr, iv, blocklen);
		ptr += blocklen;
	}
	// Copy the data
	memcpy(ptr, data, datalen);
	ptr += datalen;
	// Zero the padding bytes
	memset(ptr, 0, n_padding);

	return 0;
}
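
As a worked example of the sizing and padding above (sizeof(struct
request_hdr) is not shown in the post, so a 16-byte header is an assumed
value):

/* alg = 0 (AES): keylen = 16, blocklen = 16
 * datalen = 1427  =>  1427 % 16 = 3  =>  n_padding = (16 - 3) % 16 = 13
 * req_len = 16 (header, assumed) + 16 (key) + 8 (128-bit alignment pad)
 *         + 16 (IV) + 1427 (data) + 13 (padding) = 1496 bytes
 * Note that the kmalloc() return value above is never checked, so an
 * atomic allocation failure can lead straight to a NULL dereference
 * when p_hdr->Length is written. */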
//--------------------------------
/* write_request writes the provided buffer to the device */

static inline int write_request(u8 *buff, unsigned int count)
{
	unsigned long iflags;
	u32 tlp_count, tlp_size;
	dma_addr_t dma_addr;
	struct x5pcie_dma_desc *desc_table =
			(struct x5pcie_dma_desc *)global_bar[0];

	/* DMA operations: */
	dma_addr = pci_map_single(global_dev, buff, count, PCI_DMA_TODEVICE);
	if (0 == dma_addr) {
		printk(KERN_EMERG "XPCIe_Read: Map error.\n");
		return -1;
	}

	// Do DMA transfer here....
	count = count / 4;	// length in dwords
	// largest TLP payload (<= 32 dwords) that evenly divides the buffer
	for (tlp_size = 32; tlp_size > 0; tlp_size--)
		if ((count % tlp_size) == 0) {
			tlp_count = count / tlp_size;
			break;
		}

	tlp_size = tlp_count | (tlp_size << 16);
	spin_lock_irqsave(&wlock, iflags);
	//down(&my_sem);
//	if (down_interruptible(&my_sem)){
//		printk(KERN_EMERG "\nwrite_request: Error Acquire Semaphore!!");
//		return -ERESTARTSYS;
//	}
	writel(cpu_to_le32(tlp_size), &desc_table->rdmatlpc);	// read DMA TLP count: TLPs to transfer
	writel(cpu_to_le32(dma_addr), &desc_table->rdmatlpa);	// physical bus address of DMA-able buffer
	wmb();
	writew(cpu_to_le16(0x0001), (global_bar[0] + 6));	// read DMA start, bit[16] of ddmacr
	wmb();
	while (readw((global_bar[0] + 6)) != 0x0101)
		;	// busy-wait for DMA completion
	spin_unlock_irqrestore(&wlock, iflags);
	//up(&my_sem);
	// Unmap the DMA buffer so it is safe for normal access again.
	pci_unmap_single(global_dev, dma_addr, count, PCI_DMA_TODEVICE);

	/* End of DMA section */
	return 0;
}
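
As a worked example of the TLP split above (using the same assumed
1496-byte request as before):

/* count = 1496 bytes  =>  1496 / 4 = 374 dwords
 * largest tlp_size <= 32 with 374 % tlp_size == 0 is 22
 *   =>  tlp_count = 374 / 22 = 17 TLPs of 22 dwords each
 *   =>  register value written = 17 | (22 << 16)
 * Caveat: count has been overwritten with the dword count by this
 * point, so pci_unmap_single() above is called with count/4 rather
 * than the originally mapped byte length. */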
//--------------
/* read_response reads the en/decrypted buffer back from the device */

static inline int read_response(u8 **buff, u16 my_req_id)
{
	dma_addr_t dma_addr;
	u16 count, tmp_req_id;
	unsigned long iflags1;	//, iflags2;
	u32 tlp_count, tlp_size;
	struct x5pcie_dma_desc *desc_table =
			(struct x5pcie_dma_desc *)global_bar[0];

	for (;;) {
		spin_lock_irqsave(&alock, iflags1);
		tmp_req_id = readw((global_bar[0] + 82 + (fifo_entry * 4)));
		spin_unlock_irqrestore(&alock, iflags1);
		if (my_req_id == tmp_req_id)	// Is the provided packet mine?
			break;
	}

	// What is the size of my packet?
	count = readw(global_bar[0] + 80 + (fifo_entry * 4));
	printk(KERN_EMERG "read_response : my_req_id = %d has count = %d\n",
	       my_req_id, count);

	*buff = kmalloc(count, GFP_ATOMIC);
	dma_addr = pci_map_single(global_dev, *buff, count,
				  PCI_DMA_FROMDEVICE);
	if (0 == dma_addr) {
		printk(KERN_EMERG "XPCIe_Read: Map error.\n");
		return -1;
	}

	count = count / 4;	// length in dwords
	for (tlp_size = 32; tlp_size > 0; tlp_size--)
		if ((count % tlp_size) == 0) {
			tlp_count = count / tlp_size;
			break;
		}

	tlp_size = tlp_count | (tlp_size << 16);
//	down(&my_sem);
//	if (down_interruptible(&my_sem)){
//		printk(KERN_EMERG "\nread_response: Error Acquire Semaphore!!");
//		return -ERESTARTSYS;
//	}
	writel(cpu_to_le32(tlp_size), &desc_table->wdmatlpc);	// write DMA TLP count: TLPs to transfer
	writel(cpu_to_le32(dma_addr), &desc_table->wdmatlpa);	// physical bus address of DMA-able buffer
	wmb();
	writew(cpu_to_le16(0x0001), (global_bar[0] + 4));	// write DMA start, bit[16] of ddmacr
	wmb();
	while (readw(global_bar[0] + 4) != 0x0101)
		;	// busy-wait for DMA completion

	// 9: number of registers holding request_id and len of FIFO elements
	fifo_entry = (fifo_entry + 1) % 9;
	//spin_unlock_irqrestore(&rlock, iflags2);
	//up(&my_sem);
	pci_unmap_single(global_dev, dma_addr, count, PCI_DMA_FROMDEVICE);

	return count;
}


Thanks in advance,
Hamid.


* Fwd: crypto accelerator driver problems
From: Hamid Nassiby @ 2010-12-21 12:13 UTC
  To: linux-crypto

Hi,

As some good news and additional information: with the following patch I no
longer get the "UDP bad checksum" error I mentioned earlier with iperf in UDP
mode. But sometimes I get the following call trace in dmesg after running
iperf in UDP mode more than once (and of course iperf stops transferring data
while it uses 100% of the CPU cycles):



[  130.171909] mydriver-aes: mydriver Crypto-Engine enabled.
[  134.767846] NET: Registered protocol family 15
[  200.031846] iperf: page allocation failure. order:0, mode:0x20
[  200.031850] Pid: 10935, comm: iperf Tainted: P            2.6.36-zen1 #1
[  200.031852] Call Trace:
[  200.031860]  [<ffffffff8108ab39>] ? __alloc_pages_nodemask+0x6d3/0x722
[  200.031864]  [<ffffffff810b454f>] ? virt_to_head_page+0x9/0x30
[  200.031867]  [<ffffffff810afac2>] ? alloc_pages_current+0xa5/0xce
[  200.031869]  [<ffffffff810899ad>] ? __get_free_pages+0x9/0x46
[  200.031872]  [<ffffffff8102bbbf>] ? need_resched+0x1a/0x23
[  200.031876]  [<ffffffff811a10ad>] ? blkcipher_walk_next+0x68/0x2d9
[  200.031882]  [<ffffffffa001dad4>] ? mydriver_cbc_encrypt+0x47/0x9c
[mydriver_aes2]
[  200.031886]  [<ffffffff81454789>] ? ipt_do_table+0x5d8/0x619
[  200.031888]  [<ffffffff811a0871>] ? async_encrypt+0x35/0x3a
[  200.031891]  [<ffffffff811a1e0c>] ? eseqiv_givencrypt+0x341/0x389
[  200.031894]  [<ffffffff813b8bb5>] ? __skb_to_sgvec+0x49/0x1ea
[  200.031897]  [<ffffffff813b8d1e>] ? __skb_to_sgvec+0x1b2/0x1ea
[  200.031899]  [<ffffffff811a8fc8>] ? crypto_authenc_givencrypt+0x60/0x7c
[  200.031902]  [<ffffffff814492dd>] ? esp_output+0x320/0x357
[  200.031905]  [<ffffffff814658cd>] ? xfrm_output_resume+0x38d/0x48f
[  200.031908]  [<ffffffff813e1f62>] ? nf_hook_slow+0xc8/0xd9
[  200.031911]  [<ffffffff81416f9f>] ? ip_push_pending_frames+0x2cc/0x328
[  200.031914]  [<ffffffff8143339e>] ? udp_push_pending_frames+0x2c4/0x342
[  200.031917]  [<ffffffff814350ca>] ? udp_sendmsg+0x508/0x600
[  200.031919]  [<ffffffff8102bbbf>] ? need_resched+0x1a/0x23
[  200.031923]  [<ffffffff813b3458>] ? sock_aio_write+0xd5/0xe9
[  200.031926]  [<ffffffff8100340e>] ? apic_timer_interrupt+0xe/0x20
[  200.031928]  [<ffffffff810ba2ea>] ? do_sync_write+0xb0/0xf2
[  200.031931]  [<ffffffff8100864b>] ? sched_clock+0x5/0x8
[  200.031934]  [<ffffffff8119c550>] ? security_file_permission+0x18/0x67
[  200.031937]  [<ffffffff810bac07>] ? vfs_write+0xbc/0x101
[  200.031939]  [<ffffffff810bad08>] ? sys_write+0x45/0x6e
[  200.031941]  [<ffffffff81002a42>] ? system_call_fastpath+0x16/0x1b
[  200.031942] Mem-Info:
[  200.031944] Node 0 DMA per-cpu:
[  200.031946] CPU    0: hi:    0, btch:   1 usd:   0
[  200.031947] CPU    1: hi:    0, btch:   1 usd:   0
[  200.031949] CPU    2: hi:    0, btch:   1 usd:   0
[  200.031950] CPU    3: hi:    0, btch:   1 usd:   0
[  200.031951] Node 0 DMA32 per-cpu:
[  200.031953] CPU    0: hi:  186, btch:  31 usd:  30
[  200.032016] CPU    1: hi:  186, btch:  31 usd:  23
[  200.032018] CPU    2: hi:  186, btch:  31 usd: 182
[  200.032019] CPU    3: hi:  186, btch:  31 usd: 171
[  200.032023] active_anon:248219 inactive_anon:82742 isolated_anon:7
[  200.032024]  active_file:10553 inactive_file:11106 isolated_file:27
[  200.032025]  unevictable:0 dirty:19 writeback:1881 unstable:0
[  200.032026]  free:2536 slab_reclaimable:2970 slab_unreclaimable:6490
[  200.032026]  mapped:19597 shmem:292 pagetables:12316 bounce:0
[  200.032028] Node 0 DMA free:8012kB min:40kB low:48kB high:60kB
active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB
unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15768kB
mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB
slab_reclaimable:0kB slab_unreclaimable:16kB kernel_stack:0kB
pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB
pages_scanned:0 all_unreclaimable? yes
[  200.032036] lowmem_reserve[]: 0 2002 2002 2002
[  200.032039] Node 0 DMA32 free:2132kB min:5704kB low:7128kB
high:8556kB active_anon:992876kB inactive_anon:330968kB
active_file:42212kB inactive_file:44424kB unevictable:0kB
isolated(anon):28kB isolated(file):108kB present:2050992kB mlocked:0kB
dirty:76kB writeback:7524kB mapped:78388kB shmem:1168kB
slab_reclaimable:11880kB slab_unreclaimable:25944kB
kernel_stack:2320kB pagetables:49264kB unstable:0kB bounce:0kB
writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
[  200.032050] lowmem_reserve[]: 0 0 0 0
[  200.032059] Node 0 DMA: 1*4kB 1*8kB 0*16kB 0*32kB 1*64kB 0*128kB
1*256kB 1*512kB 1*1024kB 1*2048kB 1*4096kB = 8012kB
[  200.032066] Node 0 DMA32: 1*4kB 0*8kB 1*16kB 0*32kB 1*64kB 0*128kB
0*256kB 0*512kB 0*1024kB 1*2048kB 0*4096kB = 2132kB
[  200.032072] 37527 total pagecache pages
[  200.032074] 15549 pages in swap cache
[  200.032075] Swap cache stats: add 72816, delete 57267, find 8267/8477
[  200.032076] Free swap  = 3832196kB
[  200.032078] Total swap = 4096568kB
[  200.040499] 523951 pages RAM
[  200.040501] 9684 pages reserved
[  200.040502] 231120 pages shared
[  200.040503] 486710 pages non-shared
[  200.040514] BUG: unable to handle kernel NULL pointer dereference at (null)
[  200.040517] IP: [<ffffffffa001d395>] mydriver_transform+0x1a3/0x6a8
[mydriver_aes2]
[  200.040523] PGD 7c3dd067 PUD 41dc067 PMD 0
[  200.040526] Oops: 0000 [#1] PREEMPT SMP
[  200.040528] last sysfs file: /sys/devices/virtual/misc/fuse/dev
[  200.040530] CPU 0
[  200.040531] Modules linked in: ctr twofish_generic twofish_x86_64
twofish_common camellia serpent blowfish cast5 xcbc rmd160
sha512_generic sha256_generic crypto_null af_key mydriver_aes2 fuse
nvidia(P) r8169 iTCO_wdt iTCO_vendor_support
[  200.040542]
[  200.040544] Pid: 10935, comm: iperf Tainted: P
2.6.36-zen1 #1 EP45-UD3P/EP45-UD3P
[  200.040546] RIP: 0010:[<ffffffffa001d395>]  [<ffffffffa001d395>]
mydriver_transform+0x1a3/0x6a8 [mydriver_aes2]
[  200.040550] RSP: 0018:ffff880072c5b898  EFLAGS: 00010246
[  200.040551] RAX: ffff880055a3a030 RBX: 0000000000000680 RCX: 00000000000005f0
[  200.040553] RDX: 0000000000000680 RSI: 0000000000000000 RDI: ffff880055a3a030
[  200.040555] RBP: 0000000000000000 R08: 0000000000000680 R09: 0000000000000018
[  200.040556] R10: 000000007d078004 R11: 0000000000013234 R12: 0000000000000010
[  200.040558] R13: 0000000000000004 R14: 00000000eaef0000 R15: 00000000000005f0
[  200.040561] FS:  0000000041767950(0063) GS:ffff880001a00000(0000)
knlGS:0000000000000000
[  200.040562] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  200.040564] CR2: 0000000000000000 CR3: 000000007409d000 CR4: 00000000000406f0
[  200.040566] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  200.040568] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  200.040570] Process iperf (pid: 10935, threadinfo ffff880072c5a000,
task ffff880006d09000)
[  200.040571] Stack:
[  200.040572]  ffff880006d09000 0000000000000000 ffffffff817854f0
0000000000000020
[  200.040574] <0> 0000000000000000 0000efea72c5b9e8 ffff88007d4a7c58
00000001810afac2
[  200.040577] <0> 0000000000000000 ffff88004d53dc00 ffff880072c5b901
000000000000000f
[  200.040580] Call Trace:
[  200.040585]  [<ffffffff8102bbbf>] ? need_resched+0x1a/0x23
[  200.040588]  [<ffffffffa001db0b>] ? mydriver_cbc_encrypt+0x7e/0x9c
[mydriver_aes2]
[  200.040592]  [<ffffffff811a0871>] ? async_encrypt+0x35/0x3a
[  200.040595]  [<ffffffff811a1e0c>] ? eseqiv_givencrypt+0x341/0x389
[  200.040598]  [<ffffffff813b8bb5>] ? __skb_to_sgvec+0x49/0x1ea
[  200.040600]  [<ffffffff813b8d1e>] ? __skb_to_sgvec+0x1b2/0x1ea
[  200.040603]  [<ffffffff811a8fc8>] ? crypto_authenc_givencrypt+0x60/0x7c
[  200.040607]  [<ffffffff814492dd>] ? esp_output+0x320/0x357
[  200.040610]  [<ffffffff814658cd>] ? xfrm_output_resume+0x38d/0x48f
[  200.040613]  [<ffffffff813e1f62>] ? nf_hook_slow+0xc8/0xd9
[  200.040616]  [<ffffffff81416f9f>] ? ip_push_pending_frames+0x2cc/0x328
[  200.040619]  [<ffffffff8143339e>] ? udp_push_pending_frames+0x2c4/0x342
[  200.040621]  [<ffffffff814350ca>] ? udp_sendmsg+0x508/0x600
[  200.040623]  [<ffffffff8102bbbf>] ? need_resched+0x1a/0x23
[  200.040627]  [<ffffffff813b3458>] ? sock_aio_write+0xd5/0xe9
[  200.040630]  [<ffffffff8100340e>] ? apic_timer_interrupt+0xe/0x20
[  200.040633]  [<ffffffff810ba2ea>] ? do_sync_write+0xb0/0xf2
[  200.040636]  [<ffffffff8100864b>] ? sched_clock+0x5/0x8
[  200.040639]  [<ffffffff8119c550>] ? security_file_permission+0x18/0x67
[  200.040641]  [<ffffffff810bac07>] ? vfs_write+0xbc/0x101
[  200.040643]  [<ffffffff810bad08>] ? sys_write+0x45/0x6e
[  200.040646]  [<ffffffff81002a42>] ? system_call_fastpath+0x16/0x1b
[  200.040647] Code: 83 c0 08 80 7c 24 50 01 75 10 48 89 c7 49 63 cc
48 8b 74 24 48 f3 a4 48 89 f8 48 89 c7 48 8b 74 24 40 41 0f b7 d8 49
63 cf 89 da <f3> a4 4c 8b 2d 62 1d 00 00 48 8b 3d 53 1d 00 00 b1 01 48
8b 74
[  200.040668] RIP  [<ffffffffa001d395>]
mydriver_transform+0x1a3/0x6a8 [mydriver_aes2]
[  200.040671]  RSP <ffff880072c5b898>
[  200.040672] CR2: 0000000000000000
[  200.040733] ---[ end trace ae2865df0a025f7d ]---
[  221.687773] SysRq : Emergency Sync



BUT iperf in TCP mode still has its own problems (the system freezes with no
response).

Thanks in advance,
Hamid.


--- mydriver1	2010-12-21 15:20:17.000000000 +0330
+++ mydriver2	2010-12-21 15:24:18.000000000 +0330
@@ -1,4 +1,3 @@
-
 static int
 mydriver_cbc_decrypt(struct blkcipher_desc *desc,
 		  struct scatterlist *dst, struct scatterlist *src,
@@ -14,18 +13,17 @@ mydriver_cbc_decrypt(struct blkcipher_desc
 	err = blkcipher_walk_virt(desc, &walk);
 	op->iv = walk.iv;
 	
-	while((nbytes = walk.nbytes)) {
+	
 		op->src = walk.src.virt.addr,
 		op->dst = walk.dst.virt.addr;
 		op->mode = AES_MODE_CBC;
-		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
+		op->len = nbytes;
 		op->dir = AES_DIR_DECRYPT;
-		
 		ret = mydriver_transform(op, 0);

 		nbytes -= ret;
 		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+	

 	return err;
 }
@@ -45,16 +43,17 @@ mydriver_cbc_encrypt(struct blkcipher_desc
 	err = blkcipher_walk_virt(desc, &walk);
 	op->iv = walk.iv;
 	
-	while((nbytes = walk.nbytes)) {
+	
 		op->src = walk.src.virt.addr,
 		op->dst = walk.dst.virt.addr;
 		op->mode = AES_MODE_CBC;
-		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
+		op->len = nbytes;
 		op->dir = AES_DIR_ENCRYPT;
 		ret = mydriver_transform(op, 0);
 		nbytes -= ret;
 		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+	

 	return err;
 }
+

---------- Forwarded message ----------
From: Hamid Nassiby <h.nassiby@gmail.com>
Date: Sun, Dec 19, 2010 at 4:28 PM
Subject: crypto accelerator driver problems
To: linux-crypto@vger.kernel.org


[...]


* Re: Fwd: crypto accelerator driver problems
From: Herbert Xu @ 2010-12-30 21:19 UTC
  To: Hamid Nassiby; +Cc: linux-crypto

Hamid Nassiby <h.nassiby@gmail.com> wrote:
> Hi,
> 
> As some good news and additional information: with the following patch
> I no longer get the "UDP bad checksum" error I mentioned earlier with
> iperf in UDP mode. But sometimes I get the following call trace in
> dmesg after running iperf in UDP mode more than once (and of course
> iperf stops transferring data while it uses 100% of the CPU cycles):
> 
> 
> 
> [  130.171909] mydriver-aes: mydriver Crypto-Engine enabled.
> [  134.767846] NET: Registered protocol family 15
> [  200.031846] iperf: page allocation failure. order:0, mode:0x20
> [  200.031850] Pid: 10935, comm: iperf Tainted: P            2.6.36-zen1 #1
> [  200.031852] Call Trace:
> [  200.031860]  [<ffffffff8108ab39>] ? __alloc_pages_nodemask+0x6d3/0x722
> [  200.031864]  [<ffffffff810b454f>] ? virt_to_head_page+0x9/0x30
> [  200.031867]  [<ffffffff810afac2>] ? alloc_pages_current+0xa5/0xce
> [  200.031869]  [<ffffffff810899ad>] ? __get_free_pages+0x9/0x46
> [  200.031872]  [<ffffffff8102bbbf>] ? need_resched+0x1a/0x23
> [  200.031876]  [<ffffffff811a10ad>] ? blkcipher_walk_next+0x68/0x2d9

This means that your box has run out of memory temporarily.
If all errors were handled correctly it should continue at this
point.
 
> --- mydriver1   2010-12-21 15:20:17.000000000 +0330
> +++ mydriver2   2010-12-21 15:24:18.000000000 +0330
> @@ -1,4 +1,3 @@
> -
> static int
> mydriver_cbc_decrypt(struct blkcipher_desc *desc,
>                  struct scatterlist *dst, struct scatterlist *src,
> @@ -14,18 +13,17 @@ mydriver_cbc_decrypt(struct blkcipher_desc
>        err = blkcipher_walk_virt(desc, &walk);
>        op->iv = walk.iv;
>        
> -       while((nbytes = walk.nbytes)) {
> +       

However, your patch removes the error checking (and the loop
condition), which is why it crashes.
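
For reference, a minimal sketch of that loop with the condition and the
error handling kept, using the same names as the driver posted earlier
in the thread:

	while ((nbytes = walk.nbytes)) {
		op->src = walk.src.virt.addr;
		op->dst = walk.dst.virt.addr;
		op->mode = AES_MODE_CBC;
		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
		op->dir = AES_DIR_ENCRYPT;

		ret = mydriver_transform(op, 0);
		nbytes -= ret;
		/* blkcipher_walk_done() both reports errors (e.g. a
		 * failed atomic allocation) and advances the walk; on
		 * error it winds the walk down, so walk.nbytes reads 0
		 * and the loop exits with err carrying the error. */
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}
	return err;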

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: Fwd: crypto accelerator driver problems
From: Hamid Nassiby @ 2011-01-08  7:39 UTC
  To: Herbert Xu; +Cc: linux-crypto

On Fri, Dec 31, 2010 at 12:49 AM, Herbert Xu
<herbert@gondor.apana.org.au> wrote:
>
> Hamid Nassiby <h.nassiby@gmail.com> wrote:
> > [...]
>
> This means that your box has run out of memory temporarily.
> If all errors were handled correctly it should continue at this
> point.
>
> > --- mydriver1   2010-12-21 15:20:17.000000000 +0330
> > +++ mydriver2   2010-12-21 15:24:18.000000000 +0330
> > @@ -1,4 +1,3 @@
> > -
> > static int
> > mydriver_cbc_decrypt(struct blkcipher_desc *desc,
> >                  struct scatterlist *dst, struct scatterlist *src,
> > @@ -14,18 +13,17 @@ mydriver_cbc_decrypt(struct blkcipher_desc
> >        err = blkcipher_walk_virt(desc, &walk);
> >        op->iv = walk.iv;
> >
> > -       while((nbytes = walk.nbytes)) {
> > +
>
> However, your patch removes the error checking (and the loop
> condition), which is why it crashes.
>
> Cheers,
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt



Hi Herbert,

First, I should note that with the while loop iteration removed, the "UDP bad
checksum" error no longer appears in the dmesg output. Digging deeper into
the problem, it seemed to me that once mydriver_transform returns 0, I should
not get any more bytes (belonging to the previous request) to process in the
next iteration of the while loop. But the behavior is not as it should be:
with the while loop removed, mydriver_transform gets, for example, one
1500-byte request, processes it, and copies it back to the destination; with
the while loop in place, it gets the same request as one 1300-byte request,
processes it and copies it back, returns 0, and then gets the remaining 200
bytes of the request in a second iteration of the loop, so on the other end
of the tunnel I see "UDP bad checksum". So I conclude that
blkcipher_walk_done behaves strangely and assigns an incorrect value to
walk.nbytes, causing the while loop to iterate one extra time!


Second, a note about our accelerator's architecture and how we should
utilize it. Our device has several crypto engines built in, so for maximum
utilization we should feed it multiple crypto requests simultaneously (I
intended to do this using pcrypt), and this is exactly the point where
everything freezes. From another angle, I found that if I protect
write_request and read_response in mydriver_transform with a single lock
(spin_lock(x) before write_request and spin_unlock(x) after read_response in
mydriver_transform, as shown in the following code snippet), I can run iperf
in TCP mode successfully. This leaves me uncertain, because in that situation
we utilize only one crypto engine of the device: each request is followed by
its response sequentially, and requests and responses are never interleaved.
So I guess that issuing multiple requests to the device and receiving the
responses in a different order than they were delivered might cause the TCP
transfer to freeze. And here my question arises: if my conclusion is true,
SHOULD I change the driver approach to ablkcipher? (See the sketch after the
snippet below.)


Code snippet showing how write_request and read_response are protected by
the lock so that iperf in TCP mode makes progress:


static inline int mydriver_transform(struct mydriver_aes_op *op, int alg)
{
		.
		.
		.
	spin_lock_irqsave(&glock, tflag);
	write_request(req_buf, req_len);
	kfree(req_buf);
	req_buf = NULL;
	err = read_response(&res_buf, my_req_id);
	spin_unlock_irqrestore(&glock, tflag);
	if (err == 0) {
		kfree(res_buf);
		res_buf = NULL;
		return 0;
	}

	memcpy(op->dst, res_buf + sizeof(struct response_hdr), op->len);

	kfree(res_buf);
	res_buf = NULL;
	return op->len;
}
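
For the ablkcipher question above, a minimal sketch of what an async
registration might look like (assuming the 2.6.36-era ablkcipher API;
the mydriver_*_async names are hypothetical, not part of the posted
driver):

static struct crypto_alg mydriver_cbc_async_alg = {
	.cra_name		= "cbc(aes)",
	.cra_driver_name	= "cbc-aes-mydriver-async",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
				  CRYPTO_ALG_ASYNC,
	.cra_blocksize		= AES_MIN_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct mydriver_aes_op),
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u			= {
		.ablkcipher	= {
			.min_keysize	= AES_MIN_KEY_SIZE,
			.max_keysize	= AES_MIN_KEY_SIZE,
			.ivsize		= AES_IV_LENGTH,
			.setkey		= mydriver_setkey_async,
			/* encrypt/decrypt would only enqueue the request
			 * and return -EINPROGRESS; a completion handler
			 * calls req->base.complete() when the hardware
			 * response for that request_id arrives, letting
			 * several engines run in parallel. */
			.encrypt	= mydriver_encrypt_async,
			.decrypt	= mydriver_decrypt_async,
		}
	}
};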


I'm looking forward to hearing from you soon.
Thanks,

Hamid.


* Re: Fwd: crypto accelerator driver problems
From: Herbert Xu @ 2011-01-26  7:09 UTC
  To: Hamid Nassiby; +Cc: linux-crypto

On Wed, Jan 26, 2011 at 10:26:33AM +0330, Hamid Nassiby wrote:
>
> As you know, I posted my problem again to the crypto list and no one
> answered. Now I emphasize one aspect of the problem as a concept related
> to the IPSec protocol, independent of my particular problem, and I hope
> to get some guidelines this time. The question is as follows:
> If IPSec delivers IP packets to the hardware crypto accelerator in
> sequential order (e.g., packets in order 1, 2, 3, ..., 36, 37, 38, ...)
> and the crypto accelerator possibly returns packets to IPSec out of
> their entering order (e.g., packet 37 is returned to IPSec before packet
> 36, so the order of packets before entering the crypto accelerator and
> after exiting it is not the same), can any problem arise here?

We do not allow such reordering.  All crypto drivers must ensure
ordering within a single tfm.  Between different tfms there is no
ordering requirement.
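
(As an illustration only: one common way for a driver to keep per-tfm
ordering is a software FIFO per transform context, along the lines of
the crypto_queue helpers; the names below are hypothetical and not from
the driver posted in this thread.)

#include <crypto/algapi.h>

struct mydriver_tfm_ctx {
	spinlock_t lock;
	struct crypto_queue queue;	/* pending requests, FIFO order */
};

/* Enqueue under the ctx lock; if completions are also delivered
 * strictly in queue order, responses can never be reordered within a
 * tfm (and hence within an SA). */
static int mydriver_queue_request(struct mydriver_tfm_ctx *ctx,
				  struct crypto_async_request *req)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&ctx->lock, flags);
	ret = crypto_enqueue_request(&ctx->queue, req);
	spin_unlock_irqrestore(&ctx->lock, flags);

	return ret;	/* typically -EINPROGRESS */
}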

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: Fwd: crypto accelerator driver problems
From: Hamid Nassiby @ 2011-01-26  7:16 UTC
  To: Herbert Xu; +Cc: linux-crypto

On Sat, Jan 8, 2011 at 11:09 AM, Hamid Nassiby <h.nassiby@gmail.com> wrote:
> [...]



Hi,

As you know, I posted my problem again to the crypto list and no one
answered. Now I emphasize one aspect of the problem as a concept related to
the IPSec protocol, independent of my particular problem, and I hope to get
some guidelines this time. The question is as follows:
If IPSec delivers IP packets to the hardware crypto accelerator in sequential
order (e.g., packets in order 1, 2, 3, ..., 36, 37, 38, ...) and the crypto
accelerator possibly returns packets to IPSec out of their entering order
(e.g., packet 37 is returned to IPSec before packet 36, so the order of
packets before entering the crypto accelerator and after exiting it is not
the same), can any problem arise here?

Thanks in advance,

Hamid.


* Re: Fwd: crypto accelerator driver problems
From: Hamid Nassiby @ 2011-01-26  7:50 UTC
  To: Herbert Xu; +Cc: linux-crypto

On Wed, Jan 26, 2011 at 10:39 AM, Herbert Xu
<herbert@gondor.apana.org.au> wrote:
> On Wed, Jan 26, 2011 at 10:26:33AM +0330, Hamid Nassiby wrote:
>>
>> [...]
>
> We do not allow such reordering.  All crypto drivers must ensure
> ordering within a single tfm.  Between different tfms there is no
> ordering requirement.
>
> Cheers,
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
>


Do you mean that different IP packets fit into one single block cipher tfm?
Would you please explain in more detail?

Thanks a lot,


* Re: Fwd: crypto accelerator driver problems
From: Herbert Xu @ 2011-01-26 23:33 UTC
  To: Hamid Nassiby; +Cc: linux-crypto

On Wed, Jan 26, 2011 at 11:20:22AM +0330, Hamid Nassiby wrote:
>
> Do you mean that different IP packets fit into one single block cipher
> tfm? Would you please explain in more detail?

We allocate one tfm per SA.  So as long as ordering is guaranteed
per tfm, it's guaranteed per SA, which is all that's needed.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: Fwd: crypto accelerator driver problems
From: Hamid Nassiby @ 2011-07-05  6:45 UTC
  To: Herbert Xu; +Cc: linux-crypto

On Thu, Jan 27, 2011 at 3:03 AM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Wed, Jan 26, 2011 at 11:20:22AM +0330, Hamid Nassiby wrote:
> >
> > Do you mean that different IP packets fit into one single block cipher
> > tfm? Would you please explain in more detail?
>
> We allocate one tfm per SA.  So as long as ordering is guaranteed
> per tfm, it's guaranteed per SA, which is all that's needed.
>
> Cheers,
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Dear all,
Referring to my previous posts about the driver for a hardware AES accelerator
(which is to be used to accelerate IPSec block cipher operations), I would like
to ask you about a possibly algorithmic problem in our solution.
As I said earlier, our driver is inspired by the geode_aes driver, so assume
that we have defined our supported algorithm as:

static struct crypto_alg shams_cbc_alg = {
	.cra_name		=	"cbc(aes)",
	.cra_driver_name	=	"cbc-aes-mine",
	.cra_priority		=	400,
	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER |
					CRYPTO_ALG_NEED_FALLBACK,
	.cra_init		=	fallback_init_blk,
	.cra_exit		=	fallback_exit_blk,
	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
	.cra_ctxsize		=	sizeof(struct my_aes_op),
	.cra_alignmask		=	0,
	.cra_type		=	&crypto_blkcipher_type,
	.cra_module		=	THIS_MODULE,
	.cra_list		=	LIST_HEAD_INIT(shams_cbc_alg.cra_list),
	.cra_u			=	{
		.blkcipher	=	{
			.min_keysize	=	AES_MIN_KEY_SIZE,
			.max_keysize	=	AES_MIN_KEY_SIZE,
			.setkey		=	my_setkey_blk,
			.encrypt	=	my_cbc_encrypt,
			.decrypt	=	my_cbc_decrypt,
			.ivsize		=	AES_IV_LENGTH,
		}
	}
};

And our encrypt function, my_cbc_encrypt, looks like:

static int
my_cbc_encrypt(struct blkcipher_desc *desc,
	       struct scatterlist *dst, struct scatterlist *src,
	       unsigned int nbytes)
{
	struct my_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
	struct blkcipher_walk walk;
	int err, ret;
	unsigned long flag1, c2flag;
	u32 my_req_id;

	/* Our request id is sent to the device and comes back with the
	 * response, so we can tell device responses apart. */
	spin_lock_irqsave(&reqlock, c2flag);
	my_req_id = (global_reqid++) % 63000;
	spin_unlock_irqrestore(&reqlock, c2flag);

	if (unlikely(op->keylen != AES_KEYSIZE_128))
		return fallback_blk_enc(desc, dst, src, nbytes);

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);
	op->iv = walk.iv;

	while ((nbytes = walk.nbytes)) {
		op->src = walk.src.virt.addr;
		op->dst = walk.dst.virt.addr;
		op->mode = AES_MODE_CBC;
		op->len = nbytes /* - (nbytes % AES_MIN_BLOCK_SIZE) */;
		op->dir = AES_DIR_ENCRYPT;

		/* Critical PSEUDO code */
		spin_lock_irqsave(&lock1, flag1);
		write_to_device(op, 0, my_req_id);
		spin_unlock_irqrestore(&lock1, flag1);

		spin_lock_irqsave(&lock1, flag1);
		ret = read_from_device(op, 0, my_req_id);
		spin_unlock_irqrestore(&lock1, flag1);
		/* End of Critical PSEUDO code */

		nbytes -= ret;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}

As I mentioned earlier, we have multiple AES engines in our hardware, so to
utilize the hardware as much as possible we would like to be able to give
multiple requests to the device and collect each response as soon as it
becomes ready.
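
The shape we are after is roughly the following (a sketch only;
my_response_ready() and response_lock are hypothetical, not real driver
code): submit under a short lock, then poll for the response carrying our
own request id, so several requests can be in flight inside the device at
once:

static int my_submit_and_poll(struct my_aes_op *op, u32 my_req_id)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&lock1, flags);
	write_to_device(op, 0, my_req_id);	/* submit only, do not wait */
	spin_unlock_irqrestore(&lock1, flags);

	/* other CPUs may submit/collect their own ids meanwhile */
	while (!my_response_ready(my_req_id))	/* hypothetical check */
		cpu_relax();

	spin_lock_irqsave(&response_lock, flags);
	ret = read_from_device(op, 0, my_req_id); /* claim only our response */
	spin_unlock_irqrestore(&response_lock, flags);

	return ret;
}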

Now look at the section of my_cbc_encrypt commented as "Critical PSEUDO code".
This section hands requests to the device and reads back responses (and is the
damn bottleneck). If we protect the write_to_device and read_from_device calls
with one pair of lock/unlock, as in:

/* Critical PSEUDO code */
	spin_lock_irqsave(&lock1, flag1);
	write_to_device(op, 0, my_req_id);
	ret = read_from_device(op, 0, my_req_id);
	spin_unlock_irqrestore(&lock1, flag1);
/* End of Critical PSEUDO code */

then we have no problem: the system works and IPSec en/decrypts through our
hardware. But ONLY one AES engine of our hardware is utilized; the Good (the
system works), the Bad (only one engine is utilized) and the Ugly (throughput
is not awesome). So we must change the section to:

/* Critical PSEUDO code */
	spin_lock_irqsave(&lock1, flag1);
	write_to_device(op, 0, my_req_id);
	spin_unlock_irqrestore(&lock1, flag1);

	spin_lock_irqsave(&glock, t2flag);
	ret = read_from_device(op, 0, my_req_id);
	spin_unlock_irqrestore(&glock, t2flag);
/* End of Critical PSEUDO code */

and preferably to:

/* Critical PSEUDO code */
/* distinct locks for write_to_device and read_from_device */
	spin_lock_irqsave(&lock1, flag1);
	write_to_device(op, 0, my_req_id);
	spin_unlock_irqrestore(&lock1, flag1);

	spin_lock_irqsave(&lock2, flag2);
	ret = read_from_device(op, 0, my_req_id);
	spin_unlock_irqrestore(&lock2, flag2);
/* End of Critical PSEUDO code */


Here it seems we should have no problem, but as soon as one TCP flow starts,
the system hangs.
Finally, I request your guidance on this problem.

Thanks in advance,
Hamid.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-07-05  6:45                   ` Hamid Nassiby
@ 2011-07-05  6:53                     ` Herbert Xu
  2011-10-01  9:08                       ` Hamid Nassiby
  0 siblings, 1 reply; 17+ messages in thread
From: Herbert Xu @ 2011-07-05  6:53 UTC (permalink / raw)
  To: Hamid Nassiby; +Cc: linux-crypto

On Tue, Jul 05, 2011 at 10:15:08AM +0330, Hamid Nassiby wrote:
>
> and preferably to:
> 
> /* Critical PSEUDO code */
> /* distinct locks for write_to_device and read_from_device */
> 	spin_lock_irqsave(&lock1, flag1);
> 	write_to_device(op, 0, my_req_id);
> 	spin_unlock_irqrestore(&lock1, flag1);
> 
> 	spin_lock_irqsave(&lock2, flag2);
> 	ret = read_from_device(op, 0, my_req_id);
> 	spin_unlock_irqrestore(&lock2, flag2);
> /* End of Critical PSEUDO code */
> 
> 
> Here it seems we should have no problem, but as soon as one TCP flow starts,
> the system hangs.

Do you know why it hangs?

It sounds like the problem isn't with the synchronisation itself,
which at worst will produce bogus packets, but something else in
your code that is leading to the deadlock.

Please enable lockdep and related debugging features to track down
the problem.
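
A minimal set of options would be something along these lines (the exact
option names differ a bit across kernel versions):

	CONFIG_PROVE_LOCKING=y
	CONFIG_DEBUG_SPINLOCK=y
	CONFIG_DEBUG_SPINLOCK_SLEEP=y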

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-07-05  6:53                     ` Herbert Xu
@ 2011-10-01  9:08                       ` Hamid Nassiby
  2011-10-04  7:57                         ` Steffen Klassert
  0 siblings, 1 reply; 17+ messages in thread
From: Hamid Nassiby @ 2011-10-01  9:08 UTC (permalink / raw)
  To: Herbert Xu, linux-crypto, Steffen Klassert

Hi all,

Referring to my previous posts on the crypto list about our hardware AES
accelerator project, I finally managed to deploy the device in IPSec
successfully. As I mentioned earlier, my driver registers itself in the kernel
as a blkcipher for cbc(aes) as follows:

static struct crypto_alg my_cbc_alg = {
	.cra_name		=	"cbc(aes)",
	.cra_driver_name	=	"cbc-aes-my",
	.cra_priority		=	400,
	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER |
					CRYPTO_ALG_NEED_FALLBACK,
	.cra_init		=	fallback_init_blk,
	.cra_exit		=	fallback_exit_blk,
	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
	.cra_ctxsize		=	sizeof(struct my_aes_op),
	.cra_alignmask		=	15,
	.cra_type		=	&crypto_blkcipher_type,
	.cra_module		=	THIS_MODULE,
	.cra_list		=	LIST_HEAD_INIT(my_cbc_alg.cra_list),
	.cra_u			=	{
		.blkcipher	=	{
			.min_keysize	=	AES_MIN_KEY_SIZE,
			.max_keysize	=	AES_MIN_KEY_SIZE,
			.setkey		=	my_setkey_blk,
			.encrypt	=	my_cbc_encrypt,
			.decrypt	=	my_cbc_decrypt,
			.ivsize		=	AES_IV_LENGTH,
		}
	}
};

And the my_cbc_encrypt function, as PSEUDO/real code (simplified for
presentation), is:

static int
my_cbc_encrypt(struct blkcipher_desc *desc,
	       struct scatterlist *dst, struct scatterlist *src,
	       unsigned int nbytes)
{
	SOME__common_preparation_and_initializations;

	spin_lock_irqsave(&mylock, myflags);
	send_request_to_device(&dev);	/* sends the request to the device;
					   after processing it, the device
					   writes the result to the
					   destination */
	while (!readl(complete_flag))
		;			/* wait for a flag in device register
					   space indicating completion */
	spin_unlock_irqrestore(&mylock, myflags);

	return 0;
}

With the above code I can successfully test an IPSec gateway equipped with our
hardware and get 200Mbps throughput using Iperf. Now I am facing another
problem. As I mentioned earlier, our hardware has 4 AES engines built in; with
the above code I only utilize one of them.
From this point, we want to go a step further and utilize more than one AES
engine of our device. The simplest solution appears to be to deploy
pcrypt/padata, made by Steffen Klassert. First I instantiate it in a dual-core
gateway:
	modprobe tcrypt alg="pcrypt(authenc(hmac(md5),cbc(aes)))" type=3
and test again. Running Iperf now gives me a very low throughput of about
20Mbps, while dmesg shows the following:

   BUG: workqueue leaked lock or atomic: kworker/0:1/0x00000001/10
       last function: padata_parallel_worker+0x0/0x80
   Pid: 10, comm: kworker/0:1 Not tainted 2.6.37 #1
   Call Trace:
    [<c03e2d7d>] ? printk+0x18/0x1b
    [<c014a2b7>] process_one_work+0x177/0x370
    [<c0199980>] ? padata_parallel_worker+0x0/0x80
    [<c014c467>] worker_thread+0x127/0x390
    [<c014c340>] ? worker_thread+0x0/0x390
    [<c014fd74>] kthread+0x74/0x80
    [<c014fd00>] ? kthread+0x0/0x80
    [<c01033f6>] kernel_thread_helper+0x6/0x10
   BUG: scheduling while atomic: kworker/0:1/10/0x00000002
   Modules linked in: pcrypt my_aes2 binfmt_misc bridge stp
bnep sco rfcomm l2cap crc16 bluetooth rfkill ppdev acpi_cpufreq mperf
cpufreq_stats cpufreq_conservative cpufreq_ondemand cpufreq_userspace
cpufreq_powersave freq_table pci_slot sbs container video output sbshc battery
iptable_filter ip_tables x_tables decnet ctr twofish_i586 twofish_generic
twofish_common camellia serpent blowfish cast5 aes_i586 aes_generic xcbc rmd160
sha512_generic sha256_generic crypto_null af_key ac lp snd_hda_codec_realtek
snd_hda_intel snd_hda_codec snd_pcm_oss evdev snd_mixer_oss snd_pcm psmouse
serio_raw snd_seq_dummy pcspkr parport_pc parport snd_seq_oss snd_seq_midi
snd_rawmidi snd_seq_midi_event option usb_wwan snd_seq usbserial snd_timer
snd_seq_device button processor iTCO_wdt iTCO_vendor_support snd intel_agp
soundcore intel_gtt snd_page_alloc agpgart shpchp pci_hotplug ext3 jbd mbcache
sr_mod cdrom sd_mod sg ata_generic pata_jmicron ata_piix pata_acpi libata floppy
r8169 mii
  scsi_mod uhci_hcd ehci_hcd usbcore thermal fan fuse
   Pid: 10, comm: kworker/0:1 Not tainted 2.6.37 #1
   Call Trace:
    [<c012d459>] __schedule_bug+0x59/0x70
    [<c03e3757>] schedule+0x6a7/0xa70
    [<c0105bf7>] ? show_trace_log_lvl+0x47/0x60
    [<c03e2be9>] ? dump_stack+0x6e/0x75
    [<c014a308>] ? process_one_work+0x1c8/0x370
    [<c0199980>] ? padata_parallel_worker+0x0/0x80
    [<c014c51f>] worker_thread+0x1df/0x390
    [<c014c340>] ? worker_thread+0x0/0x390
    [<c014fd74>] kthread+0x74/0x80
    [<c014fd00>] ? kthread+0x0/0x80
    [<c01033f6>] kernel_thread_helper+0x6/0x10

I must emphasize again that the goal of deploying pcrypt/padata is to have
more than one request present in our hardware (e.g., in a quad-CPU system
we'll have 4 encryption and 4 decryption requests sent into our hardware).
I also tried using pcrypt/padata in a single-CPU system, with one change in
the pcrypt_init_padata function of pcrypt.c: passing 4 as the max_active
parameter of alloc_workqueue. In fact I called alloc_workqueue as:

alloc_workqueue(name, WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 4);

instead of:

alloc_workqueue(name, WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1);

But this did not give me 4 concurrent encryption requests.
I know that one promising solution might be to choose the ablkcipher scheme
over blkcipher, but we need a quick solution and are pressed for time, so I
request your comments on my problem: can I solve it with pcrypt/padata through
some change in my current blkcipher driver's en/decrypt functions or in pcrypt
itself, or should I take another route?
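
For what it's worth, the asynchronous route would register an ablkcipher
instead of a blkcipher. The skeleton below is only a sketch of the
registration side (the handler names are made up, and the completion path,
where the IRQ handler invokes the request's callback, is omitted):

static struct crypto_alg my_cbc_async_alg = {
	.cra_name		=	"cbc(aes)",
	.cra_driver_name	=	"cbc-aes-my-async",
	.cra_priority		=	400,
	.cra_flags		=	CRYPTO_ALG_TYPE_ABLKCIPHER |
					CRYPTO_ALG_ASYNC,
	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
	.cra_ctxsize		=	sizeof(struct my_aes_op),
	.cra_type		=	&crypto_ablkcipher_type,
	.cra_module		=	THIS_MODULE,
	.cra_u			=	{
		.ablkcipher	=	{
			.min_keysize	=	AES_MIN_KEY_SIZE,
			.max_keysize	=	AES_MIN_KEY_SIZE,
			.ivsize		=	AES_IV_LENGTH,
			.setkey		=	my_setkey_async,
			/* queues the request and returns -EINPROGRESS */
			.encrypt	=	my_cbc_encrypt_async,
			.decrypt	=	my_cbc_decrypt_async,
		}
	}
};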

Please bear in mind that, given our limited time, solutions requiring only
minor changes to our current driver are strongly preferred.

Thanks in advance,

Hamid.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-10-01  9:08                       ` Hamid Nassiby
@ 2011-10-04  7:57                         ` Steffen Klassert
  2011-10-05 10:03                           ` Hamid Nassiby
  0 siblings, 1 reply; 17+ messages in thread
From: Steffen Klassert @ 2011-10-04  7:57 UTC (permalink / raw)
  To: Hamid Nassiby; +Cc: Herbert Xu, linux-crypto

On Sat, Oct 01, 2011 at 12:38:19PM +0330, Hamid Nassiby wrote:
> 
> And the my_cbc_encrypt function, as PSEUDO/real code (simplified for
> presentation), is:
> 
> static int
> my_cbc_encrypt(struct blkcipher_desc *desc,
> 	       struct scatterlist *dst, struct scatterlist *src,
> 	       unsigned int nbytes)
> {
> 	SOME__common_preparation_and_initializations;
> 
> 	spin_lock_irqsave(&mylock, myflags);
> 	send_request_to_device(&dev);	/* sends the request to the device;
> 					   after processing it, the device
> 					   writes the result to the
> 					   destination */
> 	while (!readl(complete_flag))
> 		;			/* wait for a flag in device register
> 					   space indicating completion */
> 	spin_unlock_irqrestore(&mylock, myflags);
> 
> 	return 0;
> }

As I told you already in the private mail, it does not make much sense to
parallelize the crypto layer and then hold a global lock during the crypto
operation. So if you really need this lock, you are much better off without
parallelization.

> 
> With the above code I can successfully test an IPSec gateway equipped with
> our hardware and get 200Mbps throughput using Iperf. Now I am facing another
> problem. As I mentioned earlier, our hardware has 4 AES engines built in;
> with the above code I only utilize one of them.
> From this point, we want to go a step further and utilize more than one AES
> engine of our device. The simplest solution appears to be to deploy
> pcrypt/padata, made by Steffen Klassert. First I instantiate it in a
> dual-core gateway:
> 	modprobe tcrypt alg="pcrypt(authenc(hmac(md5),cbc(aes)))" type=3
> and test again. Running Iperf now gives me a very low throughput of about
> 20Mbps, while dmesg shows the following:
> 
>    BUG: workqueue leaked lock or atomic: kworker/0:1/0x00000001/10
>        last function: padata_parallel_worker+0x0/0x80

This looks like the parallel worker exited in atomic context,
but I can't tell you much more as long as you don't show us your code.

> 
> I must emphasize again that the goal of deploying pcrypt/padata is to have
> more than one request present in our hardware (e.g., in a quad-CPU system
> we'll have 4 encryption and 4 decryption requests sent into our hardware).
> I also tried using pcrypt/padata in a single-CPU system, with one change in
> the pcrypt_init_padata function of pcrypt.c: passing 4 as the max_active
> parameter of alloc_workqueue. In fact I called alloc_workqueue as:
> 
> alloc_workqueue(name, WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 4);

This does not make sense. max_active has to be 1 as we have to care about the
order of the work items, so we don't want to have more than one work item
executing at the same time per CPU. And as we run the parallel workers with BHs
off, it is not even possible to execute more than one work item at the same
time per CPU.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-10-04  7:57                         ` Steffen Klassert
@ 2011-10-05 10:03                           ` Hamid Nassiby
  2011-10-11  9:42                             ` Steffen Klassert
  0 siblings, 1 reply; 17+ messages in thread
From: Hamid Nassiby @ 2011-10-05 10:03 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Herbert Xu, linux-crypto

On Tue, Oct 4, 2011 at 11:27 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
>
> On Sat, Oct 01, 2011 at 12:38:19PM +0330, Hamid Nassiby wrote:
> >
> > And the my_cbc_encrypt function, as PSEUDO/real code (simplified for
> > presentation), is:
> >
> > static int
> > my_cbc_encrypt(struct blkcipher_desc *desc,
> > 	       struct scatterlist *dst, struct scatterlist *src,
> > 	       unsigned int nbytes)
> > {
> > 	SOME__common_preparation_and_initializations;
> >
> > 	spin_lock_irqsave(&mylock, myflags);
> > 	send_request_to_device(&dev);	/* sends the request to the device;
> > 					   after processing it, the device
> > 					   writes the result to the
> > 					   destination */
> > 	while (!readl(complete_flag))
> > 		;			/* wait for a flag in device register
> > 					   space indicating completion */
> > 	spin_unlock_irqrestore(&mylock, myflags);
> >
> > 	return 0;
> > }
>
> As I told you already in the private mail, it does not make much sense to
> parallelize the crypto layer and then hold a global lock during the crypto
> operation. So if you really need this lock, you are much better off without
> parallelization.
>
Hi Steffen,
Thanks for your reply :).

It makes sense in two cases:
1. If the time to transmit a request to the device is much shorter than the
processing time spent in the device, and the device has more than one
processing engine.

2. It can also be advantageous when the device has only one processing engine
and we have multiple blkcipher requests pending at the device's entrance port,
because the delay between request entrances into the device will be shorter.
The overall advantage is that our IPSec throughput gets nearer to our device's
bulk encryption throughput. (It is interesting to note that with our current
driver and device configuration, if I test gateway throughput with traffic
belonging to two SAs traveling through the one link that connects them, I get
a rate of about 280Mbps (an 80Mbps increase compared with one SA's traffic),
while our device's bulk processing rate is about 400Mbps.)

Currently we want to take advantage of the latter case and then extend it.

>
>
>
> >
> > With the above code I can successfully test an IPSec gateway equipped with
> > our hardware and get 200Mbps throughput using Iperf. Now I am facing
> > another problem. As I mentioned earlier, our hardware has 4 AES engines
> > built in; with the above code I only utilize one of them.
> > From this point, we want to go a step further and utilize more than one
> > AES engine of our device. The simplest solution appears to be to deploy
> > pcrypt/padata, made by Steffen Klassert. First I instantiate it in a
> > dual-core gateway:
> >       modprobe tcrypt alg="pcrypt(authenc(hmac(md5),cbc(aes)))" type=3
> > and test again. Running Iperf now gives me a very low throughput of about
> > 20Mbps, while dmesg shows the following:
> >
> >    BUG: workqueue leaked lock or atomic: kworker/0:1/0x00000001/10
> >        last function: padata_parallel_worker+0x0/0x80
>
> This looks like the parallel worker exited in atomic context,
> but I can't tell you much more as long as you don't show us your code.

OK, I represented the code as PSEUDO, just to simplify it and concentrate the
problem's aspects ;) (but it is also possible that I've concentrated it in the
wrong way :D). This is the my_cbc_encrypt code and the functions it calls,
bottom-up:

int write_request(u8 *buff, unsigned int count)
{
	u32 tlp_size = 32;
	struct my_dma_desc *desc_table = (struct my_dma_desc *)global_bar[0];

	tlp_size = (count / 128) | (tlp_size << 16);
	memcpy(g_mydev->rdmaBuf_va, buff, count);
	wmb();

	writel(cpu_to_le32(tlp_size), &desc_table->wdmaperf);
	wmb();

	while ((readl(&desc_table->ddmacr) | 0xFFFF0000) != 0xFFFF0101)
		;	/* wait for transfer completion */
	return 0;
}

int my_transform(struct my_aes_op *op, int alg)
{
	int req_len, err;
	unsigned long tflag;
	u8 *req_buf = NULL;
	alg_operation operation;

	if (op->len == 0)
		return 0;
	operation = !(op->dir);

	create_request(alg, op->mode, operation, 0, op->key,
		       op->iv, op->src, op->len, &req_buf, &req_len);
	/* adds a header to the original request and copies it to req_buf */

	spin_lock_irqsave(&glock, tflag);

	err = write_request(req_buf, req_len);	/* req_buf is sent to the
				device; the device en/decrypts the request and
				writes the result to a fixed DMA-mapped
				address */
	if (err) {
		printk(KERN_EMERG "Error WriteRequest: errcode=%d\n", err);
		/* handle exception (never occurred) */
	}
	kfree(req_buf);
	req_buf = NULL;

	memcpy(op->dst, g_mydev->wdmaBuf_va, op->len);	/* copy the result
			from fixed coherent DMA-mapped memory to the
			destination */
	spin_unlock_irqrestore(&glock, tflag);

	return op->len;
}

static int
my_cbc_encrypt(struct blkcipher_desc *desc,
	       struct scatterlist *dst, struct scatterlist *src,
	       unsigned int nbytes)
{
	struct my_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
	struct blkcipher_walk walk;
	int err, ret;

	if (unlikely(op->keylen != AES_KEYSIZE_128))
		return fallback_blk_enc(desc, dst, src, nbytes);

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);
	op->iv = walk.iv;

	while ((nbytes = walk.nbytes)) {
		op->src = walk.src.virt.addr;
		op->dst = walk.dst.virt.addr;
		op->mode = AES_MODE_CBC;
		op->len = nbytes /* - (nbytes % AES_MIN_BLOCK_SIZE) */;
		op->dir = AES_DIR_ENCRYPT;
		ret = my_transform(op, 0);
		nbytes -= ret;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}

>
> >
> > I must emphasize again that the goal of deploying pcrypt/padata is to have
> > more than one request present in our hardware (e.g., in a quad-CPU system
> > we'll have 4 encryption and 4 decryption requests sent into our hardware).
> > I also tried using pcrypt/padata in a single-CPU system, with one change
> > in the pcrypt_init_padata function of pcrypt.c: passing 4 as the
> > max_active parameter of alloc_workqueue. In fact I called alloc_workqueue
> > as:
> >
> > alloc_workqueue(name, WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 4);
>
> This does not make sense. max_active has to be 1 as we have to care about the
> order of the work items, so we don't want to have more than one work item
> executing at the same time per CPU. And as we run the parallel workers with BHs
> off, it is not even possible to execute more than one work item at the same
> time per CPU.
>

Did you turn BHs off to prevent deadlocks between your workqueues and the
network softirqs?
If there is anything else that would help, I would be pleased to hear it.

Thanks.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-10-05 10:03                           ` Hamid Nassiby
@ 2011-10-11  9:42                             ` Steffen Klassert
  2011-10-15 11:26                               ` Hamid Nassiby
  0 siblings, 1 reply; 17+ messages in thread
From: Steffen Klassert @ 2011-10-11  9:42 UTC (permalink / raw)
  To: Hamid Nassiby; +Cc: Herbert Xu, linux-crypto

On Wed, Oct 05, 2011 at 01:33:33PM +0330, Hamid Nassiby wrote:
> 
> OK, I represented the code as PSEUDO, just to simplify it and concentrate
> the problem's aspects ;) (but it is also possible that I've concentrated it
> in the wrong way :D). This is the my_cbc_encrypt code and the functions it
> calls, bottom-up:
> 
> int write_request(u8 *buff, unsigned int count)
> {
> 	u32 tlp_size = 32;
> 	struct my_dma_desc *desc_table = (struct my_dma_desc *)global_bar[0];
> 
> 	tlp_size = (count / 128) | (tlp_size << 16);
> 	memcpy(g_mydev->rdmaBuf_va, buff, count);
> 	wmb();
> 
> 	writel(cpu_to_le32(tlp_size), &desc_table->wdmaperf);
> 	wmb();
> 
> 	while ((readl(&desc_table->ddmacr) | 0xFFFF0000) != 0xFFFF0101)
> 		;	/* wait for transfer completion */
> 	return 0;
> }
> 
> int my_transform(struct my_aes_op *op, int alg)
> {
> 	int req_len, err;
> 	unsigned long tflag;
> 	u8 *req_buf = NULL;
> 	alg_operation operation;
> 
> 	if (op->len == 0)
> 		return 0;
> 	operation = !(op->dir);
> 
> 	create_request(alg, op->mode, operation, 0, op->key,
> 		       op->iv, op->src, op->len, &req_buf, &req_len);
> 	/* adds a header to the original request and copies it to req_buf */
> 
> 	spin_lock_irqsave(&glock, tflag);
> 
> 	err = write_request(req_buf, req_len);	/* req_buf is sent to the
> 				device; the device en/decrypts the request and
> 				writes the result to a fixed DMA-mapped
> 				address */
> 	if (err) {
> 		printk(KERN_EMERG "Error WriteRequest: errcode=%d\n", err);
> 		/* handle exception (never occurred) */
> 	}
> 	kfree(req_buf);
> 	req_buf = NULL;
> 
> 	memcpy(op->dst, g_mydev->wdmaBuf_va, op->len);	/* copy the result
> 			from fixed coherent DMA-mapped memory to the
> 			destination */
> 	spin_unlock_irqrestore(&glock, tflag);
> 
> 	return op->len;
> }
> 
> static int
> my_cbc_encrypt(struct blkcipher_desc *desc,
> 	       struct scatterlist *dst, struct scatterlist *src,
> 	       unsigned int nbytes)
> {
> 	struct my_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
> 	struct blkcipher_walk walk;
> 	int err, ret;
> 
> 	if (unlikely(op->keylen != AES_KEYSIZE_128))
> 		return fallback_blk_enc(desc, dst, src, nbytes);
> 
> 	blkcipher_walk_init(&walk, dst, src, nbytes);
> 	err = blkcipher_walk_virt(desc, &walk);
> 	op->iv = walk.iv;
> 
> 	while ((nbytes = walk.nbytes)) {
> 		op->src = walk.src.virt.addr;
> 		op->dst = walk.dst.virt.addr;
> 		op->mode = AES_MODE_CBC;
> 		op->len = nbytes /* - (nbytes % AES_MIN_BLOCK_SIZE) */;
> 		op->dir = AES_DIR_ENCRYPT;
> 		ret = my_transform(op, 0);
> 		nbytes -= ret;
> 		err = blkcipher_walk_done(desc, &walk, nbytes);
> 	}
> 
> 	return err;
> }
> 

I can't tell much when looking at this code snippet. One guess would be
that someone (maybe you) has set the CRYPTO_TFM_REQ_MAY_SLEEP flag, as
blkcipher_walk_done calls crypto_yield(), which in turn might call
schedule() if this flag is set. pcrypt removes this flag explicitly.
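
If you want to rule that out quickly from the driver side, you could mask
the flag before starting the walk — just a sketch:

	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);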

> 
> Did you turn BHs off, to prevent deadlocks  between your workqueues and
> network's softirqs?
> If there is any other thing that will help, I am pleased to hear.
> 

Basically, the bottom halves are off to keep up with the network softirqs.
They run with much higher priority and would interrupt the parallel
workers frequently.
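
In essence, the parallel worker wraps the queued work like this (a trimmed
sketch of what padata_parallel_worker does, not the literal code):

	local_bh_disable();		/* softirqs cannot interleave with us */
	padata->parallel(padata);	/* e.g. runs the parallel crypto part */
	local_bh_enable();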

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-10-11  9:42                             ` Steffen Klassert
@ 2011-10-15 11:26                               ` Hamid Nassiby
  0 siblings, 0 replies; 17+ messages in thread
From: Hamid Nassiby @ 2011-10-15 11:26 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Herbert Xu, linux-crypto

On 10/11/11, Steffen Klassert <steffen.klassert@secunet.com> wrote:

>
> I can't tell much when looking at this code snippet. One guess would be
> that someone (maybe you) has set the CRYPTO_TFM_REQ_MAY_SLEEP flag, as
> blkcipher_walk_done calls crypto_yield(), which in turn might call
> schedule() if this flag is set. pcrypt removes this flag explicitly.
>

I've not set such a flag.

>
> Basically, the bottom halves are off to keep up with the network softirqs.
> They run with much higher priority and would interrupt the parallel
> workers frequently.
>

Do you mean that with BHs on, we would only see some performance degradation?

Thanks for your reply.
Any other ideas?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2011-01-26  7:09             ` Herbert Xu
  2011-01-26  7:50               ` Hamid Nassiby
@ 2013-04-25  3:45               ` Vakul Garg
  2013-04-25  9:31                 ` Herbert Xu
  1 sibling, 1 reply; 17+ messages in thread
From: Vakul Garg @ 2013-04-25  3:45 UTC (permalink / raw)
  To: linux-crypto

Herbert Xu <herbert@gondor.apana.org.au> writes:

> 
> On Wed, Jan 26, 2011 at 10:26:33AM +0330, Hamid Nassiby wrote:
> >
> > As you know, I posted my problem to the crypto list again and no one
> > answered. Now I want to emphasize one aspect of the problem as a concept
> > related to the IPSec protocol, independent of my problem's nature, and I
> > hope to get some guidelines this time. The question is the following:
> > If IPSec delivers IP packets to a hardware crypto accelerator in
> > sequential order (e.g., packets in order: 1, 2, 3, ..., 36, 37, 38, ...)
> > and the crypto accelerator possibly returns packets to IPSec out of that
> > order (e.g., packet 37 is returned before packet 36, so the order of
> > packets before entering the accelerator and after exiting it differs),
> > can this cause any problem?
> 
> We do not allow such reordering.  All crypto drivers must ensure
> ordering within a single tfm.  Between different tfms there is no
> ordering requirement.
> 
> Cheers,


Hello Herbert,

Does this mean that processing of all the crypto requests from a single tfm 
must be serialized even if they execute on multiple different cores?

Regards

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Fwd: crypto accelerator driver problems
  2013-04-25  3:45               ` Vakul Garg
@ 2013-04-25  9:31                 ` Herbert Xu
  0 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2013-04-25  9:31 UTC (permalink / raw)
  To: Vakul Garg; +Cc: linux-crypto

Vakul Garg <vakul@freescale.com> wrote:
> Herbert Xu <herbert@gondor.apana.org.au> writes:
> 
>> 
>> On Wed, Jan 26, 2011 at 10:26:33AM +0330, Hamid Nassiby wrote:
>> >
>> > As you know, I posted my problem to the crypto list again and no one
>> > answered. Now I want to emphasize one aspect of the problem as a concept
>> > related to the IPSec protocol, independent of my problem's nature, and I
>> > hope to get some guidelines this time. The question is the following:
>> > If IPSec delivers IP packets to a hardware crypto accelerator in
>> > sequential order (e.g., packets in order: 1, 2, 3, ..., 36, 37, 38, ...)
>> > and the crypto accelerator possibly returns packets to IPSec out of that
>> > order (e.g., packet 37 is returned before packet 36, so the order of
>> > packets before entering the accelerator and after exiting it differs),
>> > can this cause any problem?
>> 
>> We do not allow such reordering.  All crypto drivers must ensure
>> ordering within a single tfm.  Between different tfms there is no
>> ordering requirement.
>> 
>> Cheers,
> 
> 
> Hello Herbert,
> 
> Does this mean that processing of all the crypto requests from a single tfm 
> must be serialized even if they execute on multiple different cores?

Correct.  Conceptually a single tfm is like a thread; if one
wants parallelism then multiple tfms should be used.  Of course
there are exceptions such as pcrypt.
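
As a sketch (not from any particular user), two streams that may run in
parallel would each get their own tfm:

	struct crypto_blkcipher *a = crypto_alloc_blkcipher("cbc(aes)", 0, 0);
	struct crypto_blkcipher *b = crypto_alloc_blkcipher("cbc(aes)", 0, 0);
	/* requests issued on 'a' may be processed in parallel with requests
	 * issued on 'b'; within each tfm, ordering is preserved */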

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2013-04-25  9:31 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <AANLkTikdiSbQ_hAAfQC2P1hFiFAE8Wr5T+O2o1Yts6wH@mail.gmail.com>
     [not found] ` <AANLkTikSfb1W21NxQJ0JzMWX7sqg-2D6HAJpfXTNNAHR@mail.gmail.com>
2010-12-19 12:58   ` crypto accelerator driver problems Hamid Nassiby
2010-12-21 12:13     ` Fwd: " Hamid Nassiby
2010-12-30 21:19       ` Herbert Xu
2011-01-08  7:39         ` Hamid Nassiby
     [not found]           ` <AANLkTin8au=98mmfsaJjOSyJNibk3foZWihj6EGTGWK-@mail.gmail.com>
2011-01-26  7:09             ` Herbert Xu
2011-01-26  7:50               ` Hamid Nassiby
2011-01-26 23:33                 ` Herbert Xu
2011-07-05  6:45                   ` Hamid Nassiby
2011-07-05  6:53                     ` Herbert Xu
2011-10-01  9:08                       ` Hamid Nassiby
2011-10-04  7:57                         ` Steffen Klassert
2011-10-05 10:03                           ` Hamid Nassiby
2011-10-11  9:42                             ` Steffen Klassert
2011-10-15 11:26                               ` Hamid Nassiby
2013-04-25  3:45               ` Vakul Garg
2013-04-25  9:31                 ` Herbert Xu
2011-01-26  7:16           ` Hamid Nassiby
