From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755603Ab1IGToI (ORCPT ); Wed, 7 Sep 2011 15:44:08 -0400 Received: from cdptpa-bc-oedgelb.mail.rr.com ([75.180.133.32]:44782 "EHLO cdptpa-bc-oedgelb.mail.rr.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752161Ab1IGToG (ORCPT ); Wed, 7 Sep 2011 15:44:06 -0400 Authentication-Results: cdptpa-bc-oedgelb.mail.rr.com smtp.user=rpearson@systemfabricworks.com; auth=pass (LOGIN) X-Authority-Analysis: v=1.1 cv=40Z/dbZBr1wgzPkGSf8y7qdCkiWp+M7NvixVUiz+qMg= c=1 sm=0 a=xO0yJxRua38A:10 a=ozIaqLvjkoIA:10 a=kj9zAlcOel0A:10 a=DCwX0kaxZCiV3mmbfDr8nQ==:17 a=Z4Rwk6OoAAAA:8 a=YORvzBCaAAAA:8 a=VwQbUJbxAAAA:8 a=EezHunaUZ7hFXId9htQA:9 a=h9OX2j0IclOK23BT_yAA:7 a=CjuIK1q_8ugA:10 a=jbrJJM5MRmoA:10 a=VV2__AUApEoA:10 a=DCwX0kaxZCiV3mmbfDr8nQ==:117 X-Cloudmark-Score: 0 X-Originating-IP: 67.79.195.91 From: "Bob Pearson" To: "'Joakim Tjernlund'" Cc: , , "'George Spelvin'" , References: <20110831213729.395283830@systemfabricworks.com> <4E5EB608.3000905@systemfabricworks.com> In-Reply-To: Subject: RE: [PATCH v6 08/10] crc32-add-slicing-by-8.diff Date: Wed, 7 Sep 2011 14:44:02 -0500 Message-ID: <03bc01cc6d96$7f3cba30$7db62e90$@systemfabricworks.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit X-Mailer: Microsoft Outlook 14.0 Thread-Index: AQJdND57YfohkECc5nAnrPUtKDDU5gJXC2PMAiY/az+T/NewsA== Content-Language: en-us Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org > -----Original Message----- > From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se] > Sent: Wednesday, September 07, 2011 2:31 AM > To: Bob Pearson > Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; George > Spelvin; linux-kernel@vger.kernel.org > Subject: Re: [PATCH v6 08/10] crc32-add-slicing-by-8.diff > > Bob Pearson wrote on 2011/09/01 > 00:30:32: > > > > add slicing-by-8 algorithm to the existing > > slicing-by-4 algorithm. This consists of: > > - extend largest BITS size from 32 to 64 > > - extend tables from tab[4][256] to up to tab[8][256] > > - Add code for inner loop. > > > > Signed-off-by: Bob Pearson > > > > --- > > lib/crc32.c | 40 ++++++++++++++++++++++++++++------------ > > lib/crc32defs.h | 29 +++++++++++++++++++++-------- > > lib/gen_crc32table.c | 43 +++++++++++++++++++++++++++---------------- > > 3 files changed, 76 insertions(+), 36 deletions(-) > > > > Index: for-next/lib/crc32.c > > > ========================================================== > ========= > > --- for-next.orig/lib/crc32.c > > +++ for-next/lib/crc32.c > > @@ -47,25 +47,28 @@ MODULE_LICENSE("GPL"); > > > > #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 > > > > +/* implements slicing-by-4 or slicing-by-8 algorithm */ > > static inline u32 > > crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > (*tab)[256]) > > { > > # ifdef __LITTLE_ENDIAN > > # define DO_CRC(x) (crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)) > > -# define DO_CRC4 crc = t3[(crc) & 255] ^ \ > > - t2[(crc >> 8) & 255] ^ \ > > - t1[(crc >> 16) & 255] ^ \ > > - t0[(crc >> 24) & 255] > > +# define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \ > > + t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255]) > > +# define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \ > > + t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255]) > > # else > > # define DO_CRC(x) (crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)) > > -# define DO_CRC4 crc = t0[(crc) & 255] ^ \ > > - t1[(crc >> 8) & 255] ^ \ > > - t2[(crc >> 16) & 255] ^ \ > > - t3[(crc >> 24) & 255] > > +# define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \ > > + t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255]) > > +# define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \ > > + t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255]) > > Don't like the new DO_CRC8 macro. You could get by with my earlier > suggestion: > # define DO_CRC4(crc, x0, x1, x2, x3) \ > x3[(crc) & 255] ^ \ > x2[(crc >> 8) & 255] ^ \ > x1[(crc >> 16) & 255] ^ \ > x0[(crc >> 24) & 255] > > Then the code becomes something like > if (bits == 64) { > crc = DO_CRC4(crc, t4, t5, t6, t7); > ++b; > crc ^= DO_CRC4(*b, t0, t1, t2, t3); > } else > crc = DO_CRC4(crc, t0, t1, t2, t3); OK by me. > > > # endif > > const u32 *b; > > - size_t rem_len; > > + size_t rem_len; > > const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; > > + const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; > > t4 to t7 is only used in 64 bit mode. Yes but with the CEC_XX_BITS->bits change this point is moot. > > BTW, it the 64 CRC bits on 32 bits BE arch bug fixed? I have a 64 bit sparc machine and a bunch of x86 variants. As far as I am able to tell these are working for all the different algorithms. If you have a 32 bit PPC machine please try to see if this is still a problem. > > > + u32 q; > > > > /* Align it */ > > if (unlikely((long)buf & 3 && len)) { > > @@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const > > DO_CRC(*buf++); > > } while ((--len) && ((long)buf)&3); > > } > > + > > +# if CRC_LE_BITS == 32 > > rem_len = len & 3; > > - /* load data 32 bits wide, xor data 32 bits wide. */ > > len = len >> 2; > > +# else > > + rem_len = len & 7; > > + len = len >> 3; > > I still fail to see why this is needed. You still do 32 bit loads so this > only makes the code uglier, harder to maintain and makes small unaligned crc > bufs > slower. You suggested and I changed the initial alignment code to pick the first available 4 byte boundary. After that, the inner loop consumes respectively 4 or 8 bytes of data per iteration for the 32/64 version of the algorithm. So the computation of len and rem_len *must* be made differently in the two cases. > > .... > > > Index: for-next/lib/gen_crc32table.c > > > ========================================================== > ========= > > --- for-next.orig/lib/gen_crc32table.c > > +++ for-next/lib/gen_crc32table.c > > @@ -1,23 +1,28 @@ > .. > > > > -static void output_table(uint32_t (*table)[256], int len, char *trans) > > +static void output_table(uint32_t (*table)[256], int rows, int len, char > *trans) > > This table is not always 256 entries. I suggested a cleaner impl. earlier. > Something like this: > > -static void output_table(uint32_t table[4][256], int len, char *trans) > +static void output_table(uint32_t table[], int len, char *trans) > { > - int i, j; > - > - for (j = 0 ; j < 4; j++) { > - printf("{"); > - for (i = 0; i < len - 1; i++) { > - if (i % ENTRIES_PER_LINE == 0) > - printf("\n"); > - printf("%s(0x%8.8xL), ", trans, table[j][i]); > - } > - printf("%s(0x%8.8xL)},\n", trans, table[j][len - 1]); > + int i; > + > + printf("{"); > + for (i = 0; i < len - 1; i++) { > + if (i % ENTRIES_PER_LINE == 0) > + printf("\n"); > + printf("%s(0x%8.8xL), ", trans, table[i]); > } > + printf("%s(0x%8.8xL)},\n", trans, table[len - 1]); > } > > int main(int argc, char** argv) > { > + int i; > + > printf("/* this file is generated - do not edit */\n\n"); > > if (CRC_LE_BITS > 1) { > crc32init_le(); > - printf("static const u32 crc32table_le[4][256] = {"); > - output_table(crc32table_le, LE_TABLE_SIZE, "tole"); > + printf("static const u32 crc32table_le[%d][%d] = {", > + LE_TABLE_ROWS, LE_TABLE_SIZE); > + for (i = 0 ; i < LE_TABLE_ROWS; i++) > + output_table(crc32table_le[i], LE_TABLE_SIZE, > "tole"); > printf("};\n"); > } > > if (CRC_BE_BITS > 1) { > crc32init_be(); > - printf("static const u32 crc32table_be[4][256] = {"); > - output_table(crc32table_be, BE_TABLE_SIZE, "tobe"); > + printf("static const u32 crc32table_be[%d][%d] = {", > + BE_TABLE_ROWS, BE_TABLE_SIZE); > + for (i = 0 ; i < BE_TABLE_ROWS; i++) > + output_table(crc32table_be[i], BE_TABLE_SIZE, > "tobe"); > printf("};\n"); > } >