crypto accelerator driver problems

* crypto accelerator driver problems
       [not found] ` <AANLkTikSfb1W21NxQJ0JzMWX7sqg-2D6HAJpfXTNNAHR@mail.gmail.com>
@ 2010-12-19 12:58   ` Hamid Nassiby
  2010-12-21 12:13     ` Fwd: " Hamid Nassiby
  0 siblings, 1 reply; 17+ messages in thread
From: Hamid Nassiby @ 2010-12-19 12:58 UTC (permalink / raw)
  To: linux-crypto

Hi All,

In a research project, we've developed a crypto accelerator based on Xilinx
Virtex5 FPGA family which is connected to PC through PCI-Express slot and is
used by IPSec to offload crypto processing from CPU. The accelerator only
provides AES and DES3_EDE algorithms and I am responsible for providing driver
of the stuff. I inspired much of driver work from geode_aes.c which is
located in "drivers/crypto" subdir of kernel source directory. Both algorithms
are registered as blkcipher providing cbc wrapper "cbc(aes)" just as one that is
registered in geode_aes. Now after months of work, the accelerator is ready to
work (Correctness of hardware operation is assured by direct crypto
test and not by IPSec) and it is time of driver to provide IPSec
access to accelerator. In first
try I could get  "ping" through the IPsec tunnel. One end of IPSec tunnel is
equipped by our accelerator and the other end is using kernel native IPSec and
built in AES and DES3_EDE algorithms. Now I am faced with 2 problems:

1. Ping will stop getting reply with packet sizes greater than 1426 Bytes
(ping dest_ip -s  1427). I guessed that it might be MTU problem, but reducing
mtu with "ifconfig eth1 mtu xxx" or
"echo 1 > /proc/sys/net/ipv4/ip_no_pmtu_disc"
 does not solve the problem. Also when I ping each of tunnel ends from
another end
simultaneously with "ping other_node_ip  -i 0.001", the kernel hangs
out completely.

2. Iperf problem. When I try to measure throughput of the IPSec gateway equipped
by our accelerator ( AES-MD5 ), using iperf in tcp mode, the kernel hangs such
that sometimes "Magic SysRq key" does not respond too! And so I could not trace
the problem anyway. Using iperf in udp mode works but I get "UDP bad cheksum" in
'dmesg' output of other end of tunnel (Native IPSec and built in kernel
algorithms).

Two gateways are connected by a cross cable and no router/switch is located
between them to cause mtu problems. In my test pcrypt is not used by now and
booting the kernel with nosmp (so no fear of thread contention) does not change
the situation.

So I request you to help me solve the problem. I bring some parts of driver
that is changed from geode_aes.c and might give useful information. If
it is required,
I'll post all driver text.
------------------------------ ----------------------------

static struct crypto_alg mydriver_cbc_alg = {
       .cra_name               =       "cbc(aes)",
       .cra_driver_name        =       "cbc-aes-mydriver",
       .cra_priority           =       400,
       .cra_flags                      =       CRYPTO_ALG_TYPE_BLKCIPHER |

CRYPTO_ALG_NEED_FALLBACK,
       .cra_init                       =       fallback_init_blk,
       .cra_exit                       =       fallback_exit_blk,
       .cra_blocksize          =       AES_MIN_BLOCK_SIZE,
       .cra_ctxsize            =       sizeof(struct mydriver_aes_op),
       .cra_alignmask          =       15,
       .cra_type                       =       &crypto_blkcipher_type,
       .cra_module                     =       THIS_MODULE,
       .cra_list                       =
LIST_HEAD_INIT(mydriver_cbc_alg.cra_list),
       .cra_u                          =       {
               .blkcipher      =       {
                       .min_keysize    =       AES_MIN_KEY_SIZE,
                       .max_keysize    =       AES_MIN_KEY_SIZE,
                       .setkey                 =       mydriver_setkey_blk,
                       .encrypt                =       mydriver_cbc_encrypt,
                       .decrypt                =       mydriver_cbc_decrypt,
                       .ivsize                 =       AES_IV_LENGTH,
               }
       }
};
//---------------
static int
mydriver_cbc_encrypt(struct blkcipher_desc *desc,
                 struct scatterlist *dst, struct scatterlist *src,
                 unsigned int nbytes)
{

       struct mydriver_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
       struct blkcipher_walk walk;
       int err, ret;

       if (unlikely(op->keylen != AES_KEYSIZE_128))
               return fallback_blk_enc(desc, dst, src, nbytes);

       blkcipher_walk_init(&walk, dst, src, nbytes);
       err = blkcipher_walk_virt(desc, &walk);
       op->iv = walk.iv;

       while((nbytes = walk.nbytes)) {

               op->src = walk.src.virt.addr,
               op->dst = walk.dst.virt.addr;
               op->mode = AES_MODE_CBC;
               op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
               op->dir = AES_DIR_ENCRYPT;
                       //ret = mydriver_aes_crypt(op);
               ret = mydriver_transform(op, 0);
               nbytes -= ret;
               err = blkcipher_walk_done(desc, &walk, nbytes);
       }

       return err;
}
/*--------- mydriver_transform which makes a buffer containing key, iv, data
with
some additional header that is required by our accelerator, writes the buffer
to accelerator by DMA and then reads response from hardware.*/

static inline int mydriver_transform(struct mydriver_aes_op *op, int alg)
{

               int  req_len, err;
               u8 *req_buf = NULL, *res_buf = NULL;
               alg_operation operation;
               u32 my_req_id;
               if (op->len == 0)
                       return 0;

               if ((op->dir == AES_DIR_ENCRYPT) ||(op->dir ==
DES3_DIR_ENCRYPT)){
                        operation = SH_ENCRYPT;
                        my_req_id = smp_processor_id();// This ID is
put into our packet and is checked by each thread when the hardware
response is ready to see if the packet is its?
               }
               else {
                       operation = SH_DECRYPT;
                       my_req_id = smp_processor_id() + 64;
//uniqueness of ID does not solve problem described in mail :( .
               }

               err = create_request(alg, op->mode, operation, htonl(my_req_id),
op->key, op->iv, op->src, op->len, &req_buf, &req_len);

               if (err){
                       printk(KERN_EMERG"mydriver_transform : Error
CreateReuest :
errcode = %d\n", err);
                       //goto error;
               }

               err = write_request(req_buf, req_len);
               if (err){
                       printk(KERN_EMERG"mydriver_transform : Error WriteReuest
:
errcode = %d\n", err);
                       //goto error;
               }
               kfree(req_buf);
               req_buf = NULL;

               err = read_response(&res_buf, /*local_hdr.Length*/my_req_id);

               memcpy(op->dst, (res_buf + sizeof(struct response_hdr)),
op->len);

               kfree(res_buf);
               res_buf = NULL;
               return op->len;
}
//-----------
/* create_request wich builds packet for mydriver_transform */
static inline int create_request(int alg, char mode, char enc_dec, u32
request_id,
                  char *key, char *iv, char *data, int datalen,
                  u8 **outbuf, int *outlen)
{
       int req_len, n_padding, keylen, blocklen, algid;
       struct request_hdr *p_hdr;
       char *ptr;

       if (alg == 0){ //AES Algorithm
               keylen = 16;
               blocklen = 16;
               algid = 4;
       } else if (alg == 1){ //DES3 Algorithm
               keylen = 24;
               blocklen = 8;
               algid = 3;
       }

       req_len = sizeof(struct request_hdr) + keylen;
       if (keylen != 0 && keylen % 16 == 0)
               req_len += 8; //For request packet to be 128bit aligned
       if (mode == SHAMS_CBC)
               req_len += blocklen; // for IV len

       n_padding = (blocklen - (datalen % blocklen)) % blocklen; //padding
data to be multiple of 128 bits.

       req_len += (n_padding + datalen);
       *outbuf = kmalloc(req_len, GFP_ATOMIC);
       p_hdr = (struct request_hdr *) *outbuf;
       *outlen = p_hdr->Length = req_len;

       p_hdr->request_id = request_id;
       p_hdr->AlgID_Mode_EncDec = (enc_dec << 15) | (mode << 12) | algid;
       // Filling key
       ptr = *outbuf + sizeof(struct request_hdr);
       memcpy(ptr, key, keylen);
       ptr += keylen;
       if (keylen != 0 && keylen % 16 == 0){
               memset(ptr, 0, 8);
               ptr += 8;
       }
       // Filling IV
       if (mode == SHAMS_CBC){
               memcpy(ptr, iv, blocklen);
               ptr += blocklen;
       }
       // Copy data
       memcpy(ptr, data, datalen);
       ptr += datalen;
       // Zeroing padd bits
       memset(ptr, 0, n_padding);

       return 0;

}
//--------------------------------
/* write_request that writes the provided buffer to device */

static inline int write_request(u8 *buff, unsigned int count)
{
unsigned long  iflags;
u32 tlp_count, tlp_size;
dma_addr_t dma_addr;
struct x5pcie_dma_desc *desc_table = (struct x5pcie_dma_desc *)global_bar[0];

/** DMA operations:*/
       dma_addr = pci_map_single(global_dev, buff, count, PCI_DMA_TODEVICE);
       if (0 == dma_addr) {
               printk(KERN_EMERG"XPCIe_Read: Map error.\n");
               return -1;
       }

// Do DMA transfer here....
       count = count /4;//
       for (tlp_size = 32; tlp_size > 0; tlp_size--)
               if ((count % tlp_size) == 0){
                       tlp_count = count / tlp_size;
                       break;
               }

       tlp_size = tlp_count | (tlp_size << 16);
       spin_lock_irqsave(&wlock, iflags);
       //down(&my_sem);
//      if (down_interruptible(&my_sem)){
//              printk(KERN_EMERG "\nwrite_request: Error Acquire Semaphore!!");
//              return -ERESTARTSYS;
//      }
       writel(cpu_to_le32(tlp_size),&desc_table->rdmatlpc);             // read
DMA TLP count:  TLPs to transfer
       writel(cpu_to_le32(dma_addr),&desc_table->rdmatlpa);  // physical bus
address of DMA able buffer
       wmb();
       writew(cpu_to_le16(0x0001),(global_bar[0]+6));                // read
dma start bit[16] to ddmacr
       wmb();
       while(readw((global_bar[0]+6)) != 0x0101);
       spin_unlock_irqrestore(&wlock, iflags);
       //up(&my_sem);
       // Unmap the DMA buffer so it is safe for normal access again.
       pci_unmap_single(global_dev, dma_addr, count, PCI_DMA_TODEVICE);

       /** End of dma section*/
       return 0;

}
//--------------
/* read_response that reads the en/decrypted buffer from device */

static inline int read_response(u8 **buff,  u16 my_req_id)
{
       dma_addr_t dma_addr;
       u16 count, tmp_req_id;
       unsigned long  iflags1;//, iflags2;
       u32 tlp_count, tlp_size;
       struct x5pcie_dma_desc *desc_table = (struct x5pcie_dma_desc
*)global_bar[0];

       for(;;){

               spin_lock_irqsave(&alock, iflags1);
               tmp_req_id = readw((global_bar[0] + 82 + (fifo_entry * 4)));
               spin_unlock_irqrestore(&alock, iflags1);
               if(my_req_id == tmp_req_id) // Is the provided packet mine?
                       break;

       }

                       count = readw(global_bar[0] + 80 + (fifo_entry
* 4));//What is the size of my packet?
                       printk(KERN_EMERG "read_response : my_req_id = %d has
count = %d\n", my_req_id, count);

                       *buff = kmalloc(count, GFP_ATOMIC);
                       dma_addr = pci_map_single(global_dev, *buff, count,
PCI_DMA_FROMDEVICE);
                       if (0 == dma_addr){
                               printk(KERN_EMERG"XPCIe_Read: Map error.\n");
                               return -1;
                       }

                       count = count /4;//
                       for (tlp_size = 32; tlp_size > 0; tlp_size--)
                               if ((count % tlp_size) == 0){
                                       tlp_count = count / tlp_size;
                                       break;
                               }

                       tlp_size = tlp_count | (tlp_size << 16);
       //              down(&my_sem);
//                      if (down_interruptible(&my_sem)){
//                              printk(KERN_EMERG "\nread_response: Error
Acquire Semaphore!!");
//                              return -ERESTARTSYS;
//                      }
                       writel(cpu_to_le32(tlp_size),&desc_table->wdmatlpc);
       // read DMA TLP count:  TLPs to transfer
                       writel(cpu_to_le32(dma_addr),&desc_table->wdmatlpa);  //
physical bus address of DMA able buffer
                       wmb();
                       writew(cpu_to_le16(0x0001),(global_bar[0]+4));
    // read dma start bit[16] to ddmacr
                       wmb();
                       while(readw(global_bar[0]+4) != 0x0101);

                       fifo_entry = (fifo_entry + 1) % 9; // 9 : Number of
registers holding request_id and len of FiFo's elements .
                       //spin_unlock_irqrestore(&rlock, iflags2);
                       //up(&my_sem);
                       pci_unmap_single(global_dev, dma_addr, count,
PCI_DMA_FROMDEVICE);

                       return count;

}

Thanks in advance,
Hamid.

^ permalink raw reply	[flat|nested] 17+ messages in thread