Skip to content
Snippets Groups Projects
Commit f8719f5a authored by Mark Adler's avatar Mark Adler
Browse files

Speed up software CRC-32 computation by a factor of 1.5 to 3.

Use the interleaved method of Kadatch and Jenkins in order to make
use of pipelined instructions through multiple ALUs in a single
core. This also speeds up and simplifies the combination of CRCs,
and updates the functions to pre-calculate and use an operator for
CRC combination.
parent 41d86c73
No related branches found
No related tags found
No related merge requests found
Loading
Loading
@@ -367,8 +367,11 @@ else
try()
{
show $*
( $* ) >> configure.log 2>&1
got=`( $* ) 2>&1`
ret=$?
if test "$got" != ""; then
printf "%s\n" "$got" >> configure.log
fi
if test $ret -ne 0; then
echo "(exit code "$ret")" >> configure.log
fi
Loading
Loading
@@ -381,8 +384,11 @@ tryboth()
show $*
got=`( $* ) 2>&1`
ret=$?
printf %s "$got" >> configure.log
if test "$got" != ""; then
printf "%s\n" "$got" >> configure.log
fi
if test $ret -ne 0; then
echo "(exit code "$ret")" >> configure.log
return $ret
fi
test "$got" = ""
Loading
Loading
@@ -457,17 +463,11 @@ size_t dummy = 0;
EOF
if try $CC -c $CFLAGS $test.c; then
echo "Checking for size_t... Yes." | tee -a configure.log
need_sizet=0
else
echo "Checking for size_t... No." | tee -a configure.log
need_sizet=1
fi
echo >> configure.log
# find the size_t integer type, if needed
if test $need_sizet -eq 1; then
cat > $test.c <<EOF
# find a size_t integer type
# check for long long
cat > $test.c << EOF
long long dummy = 0;
EOF
if try $CC -c $CFLAGS $test.c; then
Loading
Loading
@@ -495,17 +495,13 @@ EOF
if try $CC $CFLAGS -o $test $test.c; then
sizet=`./$test`
echo "Checking for a pointer-size integer type..." $sizet"." | tee -a configure.log
CFLAGS="${CFLAGS} -DNO_SIZE_T=${sizet}"
SFLAGS="${SFLAGS} -DNO_SIZE_T=${sizet}"
else
echo "Failed to find a pointer-size integer type." | tee -a configure.log
leave 1
echo "Checking for a pointer-size integer type... not found." | tee -a configure.log
fi
fi
 
if test $need_sizet -eq 1; then
CFLAGS="${CFLAGS} -DNO_SIZE_T=${sizet}"
SFLAGS="${SFLAGS} -DNO_SIZE_T=${sizet}"
fi
echo >> configure.log
 
# check for large file support, and if none, check for fseeko()
Loading
Loading
@@ -849,7 +845,6 @@ echo SHAREDLIBV = $SHAREDLIBV >> configure.log
echo STATICLIB = $STATICLIB >> configure.log
echo TEST = $TEST >> configure.log
echo VER = $VER >> configure.log
echo Z_U4 = $Z_U4 >> configure.log
echo SRCDIR = $SRCDIR >> configure.log
echo exec_prefix = $exec_prefix >> configure.log
echo includedir = $includedir >> configure.log
Loading
Loading
Loading
Loading
@@ -2,11 +2,9 @@
* Copyright (C) 1995-2006, 2010, 2011, 2012, 2016, 2018 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
* CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
* tables for updating the shift register in one step with three exclusive-ors
* instead of four steps with four exclusive-ors. This results in about a
* factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
* This interleaved implementation of a CRC makes use of pipelined multiple
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
*/
 
/* @(#) $Id$ */
Loading
Loading
@@ -14,13 +12,12 @@
/*
Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
protection on the static variables used to control the first-use generation
of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
first call get_crc_table() to initialize the tables before allowing more than
one thread to use crc32().
 
DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h. A main()
routine is also produced, so that this one source file can be compiled to an
executable.
MAKECRCH can be #defined to write out crc32.h. A main() routine is also
produced, so that this one source file can be compiled to an executable.
*/
 
#ifdef MAKECRCH
Loading
Loading
@@ -30,161 +27,164 @@
# endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */
 
#include "zutil.h" /* for STDC and FAR definitions */
#include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
/*
A CRC of a message is computed on N braids of words in the message, where
each word consists of W bytes (4 or 8). If N is 3, for example, then three
running sparse CRCs are calculated respectively on each braid, at these
indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
This is done starting at a word boundary, and continues until as many blocks
of N * W bytes as are available have been processed. The results are combined
into a single CRC at the end. For this code, N must be in the range 1..6 and
W must be 4 or 8. The upper limit on N can be increased if desired by adding
more #if blocks, extending the patterns apparent in the code. In addition,
crc32.h would need to be regenerated, if the maximum N value is increased.
N and W are chosen empirically by benchmarking the execution time on a given
processor. The choices for N and W below were based on testing on Intel Kaby
Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
Octeon II processors. The Intel, AMD, and ARM processors were all fastest
with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
They were all tested with either gcc or clang, all using the -O3 optimization
level. Your mileage may vary.
*/
 
/* Definitions for doing the crc four data bytes at a time. */
#if !defined(NOBYFOUR) && defined(Z_U4)
# define BYFOUR
#endif
#ifdef BYFOUR
local unsigned long crc32_little OF((unsigned long,
const unsigned char FAR *, z_size_t));
local unsigned long crc32_big OF((unsigned long,
const unsigned char FAR *, z_size_t));
# define TBLS 8
/* Define N */
#ifdef Z_TESTN
# define N Z_TESTN
#else
# define TBLS 1
#endif /* BYFOUR */
# define N 5
#endif
#if N < 1 || N > 6
# error N must be in 1..6
#endif
 
/* Local functions for crc concatenation */
#define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */
local z_crc_t gf2_matrix_times OF((const z_crc_t *mat, z_crc_t vec));
local uLong crc32_combine_ OF((uLong crc1, uLong crc2, z_off64_t len2));
local void crc32_combine_gen_ OF((z_crc_t *op, z_off64_t len2));
/*
z_crc_t must be at least 32 bits. z_word_t must be at least as long as
z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
that bytes are eight bits.
*/
 
/* ========================================================================= */
/*
  Multiply the GF(2) matrix mat by the GF(2) vector vec and return the
  resulting vector. Each set bit of vec, starting from the least-significant
  bit, selects the corresponding entry of mat, and the selected entries are
  combined with exclusive-or. The walk over mat stops as soon as no set bits
  remain in vec.
 */
local z_crc_t gf2_matrix_times(mat, vec)
    const z_crc_t *mat;
    z_crc_t vec;
{
    z_crc_t acc = 0;

    for (; vec != 0; vec >>= 1, mat++)
        if (vec & 1)
            acc ^= *mat;
    return acc;
}
/*
Define W and the associated z_word_t type. If W is not defined, then a
braided calculation is not used, and the associated tables and code are not
compiled.
*/
#ifdef Z_TESTW
# if Z_TESTW-1 != -1
# define W Z_TESTW
# endif
#else
# ifdef MAKECRCH
# define W 8 /* required for MAKECRCH */
# else
# if defined(__x86_64__) || defined(__aarch64__)
# define W 8
# else
# define W 4
# endif
# endif
#endif
#ifdef W
# if W == 8 && defined(Z_U8)
typedef Z_U8 z_word_t;
# elif defined(Z_U4)
# undef W
# define W 4
typedef Z_U4 z_word_t;
# else
# undef W
# endif
#endif
/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
#ifdef W
local z_word_t byte_swap OF((z_word_t word));
local z_crc_t crc_word OF((z_word_t data));
local z_word_t crc_word_big OF((z_word_t data));
#endif /* W */
 
/* CRC polynomial. */
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
 
#ifdef DYNAMIC_CRC_TABLE
 
local volatile int crc_table_empty = 1;
local z_crc_t FAR crc_table[TBLS][256];
local z_crc_t FAR crc_comb[GF2_DIM][GF2_DIM];
local z_crc_t FAR crc_table[256];
local z_crc_t FAR x2n_table[32];
local void make_crc_table OF((void));
local void gf2_matrix_square OF((z_crc_t *square, const z_crc_t *mat));
#ifdef W
local z_word_t FAR crc_big_table[256];
local z_crc_t FAR crc_braid_table[W][256];
local z_word_t FAR crc_braid_big_table[W][256];
local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
#endif
#ifdef MAKECRCH
local void write_table OF((FILE *, const z_crc_t FAR *, int));
local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
local void write_table64 OF((FILE *, const z_word_t FAR *, int));
#endif /* MAKECRCH */
 
/* ========================================================================= */
/*
  Store in square[0..GF2_DIM-1] the square of the GF(2) matrix mat, i.e. for
  each of the GF2_DIM entries of mat, the product of mat with that entry.
 */
local void gf2_matrix_square(square, mat)
    z_crc_t *square;
    const z_crc_t *mat;
{
    int left;
    const z_crc_t *src = mat;
    z_crc_t *dst = square;

    for (left = GF2_DIM; left > 0; left--) {
        *dst = gf2_matrix_times(mat, *src);
        dst++;
        src++;
    }
}
/*
Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
 
Polynomials over GF(2) are represented in binary, one bit per coefficient,
with the lowest powers in the most significant bit. Then adding polynomials
with the lowest powers in the most significant bit. Then adding polynomials
is just exclusive-or, and multiplying a polynomial by x is a right shift by
one. If we call the above polynomial p, and represent a byte as the
one. If we call the above polynomial p, and represent a byte as the
polynomial q, also with the lowest power in the most significant bit (so the
byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
where a mod b means the remainder after dividing a by b.
 
This calculation is done using the shift-register method of multiplying and
taking the remainder. The register is initialized to zero, and for each
taking the remainder. The register is initialized to zero, and for each
incoming bit, x^32 is added mod p to the register if the bit is a one (where
x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
x (which is shifting right by one and adding x^32 mod p if the bit shifted
out is a one). We start with the highest power (least significant bit) of
q and repeat for all eight bits of q.
The first table is simply the CRC of all possible eight bit values. This is
all the information needed to generate CRCs on data a byte at a time for all
combinations of CRC register values and incoming bytes. The remaining tables
allow for word-at-a-time CRC calculation for both big-endian and little-
endian machines, where a word is four bytes.
*/
x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
(which is shifting right by one and adding x^32 mod p if the bit shifted out
is a one). We start with the highest power (least significant bit) of q and
repeat for all eight bits of q.
The table is simply the CRC of all possible eight bit values. This is all the
information needed to generate CRCs on data a byte at a time for all
combinations of CRC register values and incoming bytes.
*/
local void make_crc_table()
{
z_crc_t c;
int n, k;
z_crc_t poly; /* polynomial exclusive-or pattern */
/* terms of polynomial defining this crc (except x^32): */
z_crc_t p;
static volatile int first = 1; /* flag to limit concurrent making */
static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
 
/* See if another task is already doing this (not thread-safe, but better
than nothing -- significantly reduces duration of vulnerability in
case the advice about DYNAMIC_CRC_TABLE is ignored) */
if (first) {
unsigned i, j, n;
first = 0;
 
/* make exclusive-or pattern from polynomial (0xedb88320UL) */
poly = 0;
for (n = 0; n < (int)(sizeof(p)/sizeof(unsigned char)); n++)
poly |= (z_crc_t)1 << (31 - p[n]);
/* generate a crc for every 8-bit value */
for (n = 0; n < 256; n++) {
c = (z_crc_t)n;
for (k = 0; k < 8; k++)
c = c & 1 ? poly ^ (c >> 1) : c >> 1;
crc_table[0][n] = c;
}
#ifdef BYFOUR
/* generate crc for each value followed by one, two, and three zeros,
and then the byte reversal of those as well as the first table */
for (n = 0; n < 256; n++) {
c = crc_table[0][n];
crc_table[4][n] = ZSWAP32(c);
for (k = 1; k < 4; k++) {
c = crc_table[0][c & 0xff] ^ (c >> 8);
crc_table[k][n] = c;
crc_table[k + 4][n] = ZSWAP32(c);
}
}
#endif /* BYFOUR */
/* generate zero operators table for crc32_combine() */
/* generate the operator to apply a single zero bit to a CRC -- the
first row adds the polynomial if the low bit is a 1, and the
remaining rows shift the CRC right one bit */
k = GF2_DIM - 3;
crc_comb[k][0] = 0xedb88320UL; /* CRC-32 polynomial */
z_crc_t row = 1;
for (n = 1; n < GF2_DIM; n++) {
crc_comb[k][n] = row;
row <<= 1;
/* initialize the CRC of bytes tables */
for (i = 0; i < 256; i++) {
p = i;
for (j = 0; j < 8; j++)
p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
crc_table[i] = p;
#ifdef W
crc_big_table[i] = byte_swap(p);
#endif
}
 
/* generate operators that apply 2, 4, and 8 zeros to a CRC, putting
the last one, the operator for one zero byte, at the 0 position */
gf2_matrix_square(crc_comb[k + 1], crc_comb[k]);
gf2_matrix_square(crc_comb[k + 2], crc_comb[k + 1]);
gf2_matrix_square(crc_comb[0], crc_comb[k + 2]);
/* generate operators for applying 2^n zero bytes to a CRC, filling out
the remainder of the table -- the operators repeat after GF2_DIM
values of n, so the table only needs GF2_DIM entries, regardless of
the size of the length being processed */
for (n = 1; n < k; n++)
gf2_matrix_square(crc_comb[n], crc_comb[n - 1]);
/* initialize the x^2^n mod p(x) table */
p = (z_crc_t)1 << 30; /* x^1 */
x2n_table[0] = p;
for (n = 1; n < 32; n++)
x2n_table[n] = p = multmodp(p, p);
#ifdef W
/* initialize the braiding tables -- needs x2n_table[] */
braid(crc_braid_table, crc_braid_big_table, N, W);
#endif
 
/* mark tables as complete, in case someone else is waiting */
crc_table_empty = 0;
Loading
Loading
@@ -196,42 +196,145 @@ local void make_crc_table()
}
#ifdef MAKECRCH
{
/*
The crc32.h header file contains tables for both 32-bit and 64-bit
z_word_t's, and so requires a 64-bit type be available. In that case,
z_word_t must be defined to be 64-bits. This code then also generates
and writes out the tables for the case that z_word_t is 32 bits.
*/
#if !defined(W) || W != 8
# error Need a 64-bit integer type in order to generate crc32.h.
#endif
FILE *out;
int k, n;
z_crc_t ltl[8][256];
z_word_t big[8][256];
 
out = fopen("crc32.h", "w");
if (out == NULL) return;
 
/* write out CRC table to crc32.h */
fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
fprintf(out, "local const z_crc_t FAR ");
fprintf(out, "crc_table[%d][256] =\n{\n {\n", TBLS);
write_table(out, crc_table[0], 256);
# ifdef BYFOUR
fprintf(out, "#ifdef BYFOUR\n");
for (k = 1; k < 8; k++) {
fprintf(out, " },\n {\n");
write_table(out, crc_table[k], 256);
}
fprintf(out, "#endif\n");
# endif /* BYFOUR */
fprintf(out, " }\n};\n");
/* write out zero operator table to crc32.h */
fprintf(out, "\nlocal const z_crc_t FAR ");
fprintf(out, "crc_comb[%d][%d] =\n{\n {\n", GF2_DIM, GF2_DIM);
write_table(out, crc_comb[0], GF2_DIM);
for (k = 1; k < GF2_DIM; k++) {
fprintf(out, " },\n {\n");
write_table(out, crc_comb[k], GF2_DIM);
/* write out little-endian CRC table to crc32.h */
fprintf(out,
"/* crc32.h -- tables for rapid CRC calculation\n"
" * Generated automatically by crc32.c\n */\n"
"\n"
"local const z_crc_t FAR crc_table[] = {\n"
" ");
write_table(out, crc_table, 256);
fprintf(out,
"};\n");
/* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
fprintf(out,
"\n"
"#ifdef W\n"
"\n"
"#if W == 8\n"
"\n"
"local const z_word_t FAR crc_big_table[] = {\n"
" ");
write_table64(out, crc_big_table, 256);
fprintf(out,
"};\n");
/* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
fprintf(out,
"\n"
"#else /* W == 4 */\n"
"\n"
"local const z_word_t FAR crc_big_table[] = {\n"
" ");
write_table32hi(out, crc_big_table, 256);
fprintf(out,
"};\n"
"\n"
"#endif\n");
/* write out braid tables for each value of N */
for (n = 1; n <= 6; n++) {
fprintf(out,
"\n"
"#if N == %d\n", n);
/* compute braid tables for this N and 64-bit z_word_t */
braid(ltl, big, n, 8);
/* write out braid tables for 64-bit z_word_t to crc32.h */
fprintf(out,
"\n"
"#if W == 8\n"
"\n"
"local const z_crc_t FAR crc_braid_table[][256] = {\n");
for (k = 0; k < 8; k++) {
fprintf(out, " {");
write_table(out, ltl[k], 256);
fprintf(out, "}%s", k < 7 ? ",\n" : "");
}
fprintf(out,
"};\n"
"\n"
"local const z_word_t FAR crc_braid_big_table[][256] = {\n");
for (k = 0; k < 8; k++) {
fprintf(out, " {");
write_table64(out, big[k], 256);
fprintf(out, "}%s", k < 7 ? ",\n" : "");
}
fprintf(out,
"};\n");
/* compute braid tables for this N and 32-bit z_word_t */
braid(ltl, big, n, 4);
/* write out braid tables for 32-bit z_word_t to crc32.h */
fprintf(out,
"\n"
"#else /* W == 4 */\n"
"\n"
"local const z_crc_t FAR crc_braid_table[][256] = {\n");
for (k = 0; k < 4; k++) {
fprintf(out, " {");
write_table(out, ltl[k], 256);
fprintf(out, "}%s", k < 3 ? ",\n" : "");
}
fprintf(out,
"};\n"
"\n"
"local const z_word_t FAR crc_braid_big_table[][256] = {\n");
for (k = 0; k < 4; k++) {
fprintf(out, " {");
write_table32hi(out, big[k], 256);
fprintf(out, "}%s", k < 3 ? ",\n" : "");
}
fprintf(out,
"};\n"
"\n"
"#endif\n"
"\n"
"#endif\n");
}
fprintf(out, " }\n};\n");
fprintf(out,
"\n"
"#endif\n");
/* write out zeros operator table to crc32.h */
fprintf(out,
"\n"
"local const z_crc_t FAR x2n_table[] = {\n"
" ");
write_table(out, x2n_table, 32);
fprintf(out,
"};\n");
fclose(out);
}
#endif /* MAKECRCH */
}
 
#ifdef MAKECRCH
/*
Write the 32-bit values in table[0..k-1] to out, five per line in
hexadecimal separated by commas.
*/
local void write_table(out, table, k)
FILE *out;
const z_crc_t FAR *table;
Loading
Loading
@@ -240,26 +343,194 @@ local void write_table(out, table, k)
int n;
 
for (n = 0; n < k; n++)
fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ",
fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
(unsigned long)(table[n]),
n == k - 1 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
}
/*
  Write the high 32 bits of each value in table[0..k-1] to out in hexadecimal,
  separated by commas, with a line break after every fifth value. No
  separator is written after the last value.
 */
local void write_table32hi(out, table, k)
    FILE *out;
    const z_word_t FAR *table;
    int k;
{
    int n;

    for (n = 0; n < k; n++) {
        const char *lead = "";
        const char *trail = ", ";

        /* every line after the first starts with the leading pad */
        if (n != 0 && n % 5 == 0)
            lead = " ";

        /* nothing follows the last value; otherwise end the line after the
           fifth value on it */
        if (n == k - 1)
            trail = "";
        else if (n % 5 == 4)
            trail = ",\n";

        fprintf(out, "%s0x%08lx%s", lead,
                (unsigned long)(table[n] >> 32), trail);
    }
}
 
/*
  Write the 64-bit values in table[0..k-1] to out in hexadecimal, separated by
  commas, with a line break after every third value. No separator is written
  after the last value. This assumes that if there is a 64-bit type, then
  there is also a long long integer type that is at least 64 bits. If not,
  the type cast and format string can be adjusted accordingly.
 */
local void write_table64(out, table, k)
    FILE *out;
    const z_word_t FAR *table;
    int k;
{
    int n;

    for (n = 0; n < k; n++) {
        const char *lead = "";
        const char *trail = ", ";

        /* every line after the first starts with the leading pad */
        if (n != 0 && n % 3 == 0)
            lead = " ";

        /* nothing follows the last value; otherwise end the line after the
           third value on it */
        if (n == k - 1)
            trail = "";
        else if (n % 3 == 2)
            trail = ",\n";

        fprintf(out, "%s0x%016llx%s", lead,
                (unsigned long long)(table[n]), trail);
    }
}
/*
  Entry point of the table-generator executable, compiled only when MAKECRCH
  is defined. It calls make_crc_table(), whose MAKECRCH section writes the
  generated tables to crc32.h, and then exits with status 0. The exit status
  is 0 even if crc32.h could not be opened, since make_crc_table() returns
  nothing.
 */
int main()
{
make_crc_table();
return 0;
}
#endif /* MAKECRCH */
 
#ifdef W
/*
  Fill in the little-endian (ltl) and big-endian (big) braid tables for n
  braids and a word size of w bytes. Each array must have room for w blocks of
  256 elements. Block k of ltl holds, for every byte value, that byte
  (positioned in the top eight bits of a z_crc_t) multiplied modulo p(x) by
  the power of x returned by x2nmodp((n * w + 3 - k) << 3, 0). The big table
  holds the same values passed through byte_swap(), with the blocks stored in
  the opposite order. Requires that x2n_table[] has been initialized.
 */
local void braid(ltl, big, n, w)
    z_crc_t ltl[][256];
    z_word_t big[][256];
    int n;
    int w;
{
    int blk;
    z_crc_t byte, shift, prod;

    for (blk = 0; blk < w; blk++) {
        z_crc_t *lo = ltl[blk];
        z_word_t *hi = big[w - 1 - blk];

        shift = x2nmodp((n * w + 3 - blk) << 3, 0);

        /* the entry for a zero byte is zero, so it is stored directly and
           multmodp() is only ever called with a nonzero first argument */
        lo[0] = 0;
        hi[0] = 0;
        for (byte = 1; byte < 256; byte++) {
            prod = multmodp(byte << 24, shift);
            lo[byte] = prod;
            hi[byte] = byte_swap(prod);
        }
    }
}
#endif
#else /* !DYNAMIC_CRC_TABLE */
/* ========================================================================
* Tables of CRC-32s of all single-byte values, made by make_crc_table(),
* and tables of zero operator matrices for crc32_combine().
* Tables for byte-wise and braided CRC-32 calculations, and a table of powers
* of x for combining CRC-32s, all made by make_crc_table().
*/
#include "crc32.h"
#endif /* DYNAMIC_CRC_TABLE */
 
/* ========================================================================
* Routines used for CRC calculation. Some are also required for the table
* generation above.
*/
/*
  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
  reflected. The coefficient of x^0 is held in bit 31 and higher powers are in
  lower bits, so a is scanned from bit 31 downward while b is multiplied by x
  (shifted right one bit and reduced with POLY) at each step. The scan ends as
  soon as no lower set bits remain in a. If a is zero, the result is zero.
 */
local z_crc_t multmodp(a, b)
    z_crc_t a;
    z_crc_t b;
{
    z_crc_t m, p;

    /* The zero polynomial times anything is zero. Without this check the
       loop below would never see a set bit in a, the mask m would shift down
       to zero, and the loop would never terminate. */
    if (a == 0)
        return 0;

    m = (z_crc_t)1 << 31;
    p = 0;
    for (;;) {
        if (a & m) {
            p ^= b;
            /* done once the remaining, lower bits of a are all zero */
            if ((a & (m - 1)) == 0)
                break;
        }
        m >>= 1;
        /* b = b * x mod p(x) */
        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    }
    return p;
}
/*
  Return x^(n * 2^k) modulo p(x). Bit i of n selects the factor
  x2n_table[(k + i) & 31], and x2n_table[j] holds x^(2^j) modulo p(x), so the
  result is x raised to the sum of 2^(k+i) over the set bits of n. The table
  index wraps because x2n_table[] has only 32 entries. A zero or negative n
  returns x^0, which multmodp() treats as the identity. Requires that
  x2n_table[] has been initialized.
 */
local z_crc_t x2nmodp(n, k)
    z_off64_t n;
    unsigned k;
{
    z_crc_t p;

    p = (z_crc_t)1 << 31;           /* x^0 == 1 */

    /* n is a signed offset type and may be negative. Right-shifting a
       negative value is implementation-defined, and with an arithmetic shift
       it never reaches zero, so iterate only while n is positive. */
    while (n > 0) {
        if (n & 1)
            p = multmodp(x2n_table[k & 31], p);
        n >>= 1;
        k++;
    }
    return p;
}
#ifdef W
/*
  Reverse the order of the bytes in a z_word_t to convert between little and
  big endian. This assumes that z_word_t is either 32 bits (W == 4) or 64 bits
  (W == 8). The swap is done in stages: adjacent bytes, then adjacent 16-bit
  pairs, and for 64-bit words, the two 32-bit halves. The intent is that an
  optimizing compiler reduces this to a single machine byte-swap instruction,
  if one is available.
 */
local z_word_t byte_swap(word)
    z_word_t word;
{
#if W == 8
    word = ((word & 0xff00ff00ff00ff00) >> 8) |
           ((word & 0x00ff00ff00ff00ff) << 8);
    word = ((word & 0xffff0000ffff0000) >> 16) |
           ((word & 0x0000ffff0000ffff) << 16);
    return ((word & 0xffffffff00000000) >> 32) |
           ((word & 0x00000000ffffffff) << 32);
#else   /* W == 4 */
    word = ((word & 0xff00ff00) >> 8) |
           ((word & 0x00ff00ff) << 8);
    return ((word & 0xffff0000) >> 16) |
           ((word & 0x0000ffff) << 16);
#endif
}
/*
  Return the CRC of the W bytes held in the z_word_t data, consuming the
  least-significant byte of the word first, with no pre- or post-conditioning
  applied. This is used to combine the CRCs of each braid.
 */
local z_crc_t crc_word(data)
    z_word_t data;
{
    int left = W;

    /* one table step per byte of the word */
    while (left-- > 0)
        data = crc_table[data & 0xff] ^ (data >> 8);
    return (z_crc_t)data;
}
/*
  Counterpart of crc_word() that uses crc_big_table[]: the word is consumed
  from its most-significant byte downward, shifting left eight bits per step,
  for W steps. The result is returned as a z_word_t with no pre- or
  post-conditioning applied.
 */
local z_word_t crc_word_big(data)
    z_word_t data;
{
    int left;

    for (left = W; left != 0; left--) {
        /* the top byte of the word selects the table entry */
        z_word_t top = (data >> ((W - 1) << 3)) & 0xff;

        data = crc_big_table[top] ^ (data << 8);
    }
    return data;
}
#endif /* W */
/* =========================================================================
* This function can be used by asm versions of crc32()
*/
Loading
Loading
@@ -272,169 +543,349 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
return (const z_crc_t FAR *)crc_table;
}
 
/* ========================================================================= */
#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
{
if (buf == Z_NULL) return 0UL;
/* Return initial CRC, if requested. */
if (buf == Z_NULL) return 0;
 
#ifdef DYNAMIC_CRC_TABLE
if (crc_table_empty)
make_crc_table();
#endif /* DYNAMIC_CRC_TABLE */
 
#ifdef BYFOUR
if (sizeof(void *) == sizeof(z_size_t)) {
z_crc_t endian;
/* Pre-condition the CRC */
crc ^= 0xffffffff;
#ifdef W
/* If provided enough bytes, do a braided CRC calculation. */
if (len >= N * W + W - 1) {
z_size_t blks;
z_word_t const *words;
unsigned endian;
int k;
/* Compute the CRC up to a z_word_t boundary. */
while (len && ((z_size_t)buf & (W - 1)) != 0) {
len--;
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
}
/* Compute the CRC on as many N z_word_t blocks as are available. */
blks = len / (N * W);
len -= blks * N * W;
words = (z_word_t const *)buf;
 
/* Do endian check at execution time instead of compile time, since ARM
processors can change the endianness at execution time. If the
compiler knows what the endianness will be, it can optimize out the
check and the unused branch. */
endian = 1;
if (*((unsigned char *)(&endian)))
return crc32_little(crc, buf, len);
else
return crc32_big(crc, buf, len);
}
#endif /* BYFOUR */
crc = crc ^ 0xffffffffUL;
while (len >= 8) {
DO8;
len -= 8;
}
if (len) do {
DO1;
} while (--len);
return crc ^ 0xffffffffUL;
}
if (*(unsigned char *)&endian) {
/* Little endian. */
z_crc_t crc0;
z_word_t word0;
#if N > 1
z_crc_t crc1;
z_word_t word1;
#if N > 2
z_crc_t crc2;
z_word_t word2;
#if N > 3
z_crc_t crc3;
z_word_t word3;
#if N > 4
z_crc_t crc4;
z_word_t word4;
#if N > 5
z_crc_t crc5;
z_word_t word5;
#endif
#endif
#endif
#endif
#endif
 
/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
uInt len;
{
return crc32_z(crc, buf, len);
}
/* Initialize the CRC for each braid. */
crc0 = crc;
#if N > 1
crc1 = 0;
#if N > 2
crc2 = 0;
#if N > 3
crc3 = 0;
#if N > 4
crc4 = 0;
#if N > 5
crc5 = 0;
#endif
#endif
#endif
#endif
#endif
 
#ifdef BYFOUR
/*
Process the first blks-1 blocks, computing the CRCs on each braid
independently.
*/
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
#if N > 1
word1 = crc1 ^ words[1];
#if N > 2
word2 = crc2 ^ words[2];
#if N > 3
word3 = crc3 ^ words[3];
#if N > 4
word4 = crc4 ^ words[4];
#if N > 5
word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
words += N;
/* Compute and update the CRC for each word. The loop should
get unrolled. */
crc0 = crc_braid_table[0][word0 & 0xff];
#if N > 1
crc1 = crc_braid_table[0][word1 & 0xff];
#if N > 2
crc2 = crc_braid_table[0][word2 & 0xff];
#if N > 3
crc3 = crc_braid_table[0][word3 & 0xff];
#if N > 4
crc4 = crc_braid_table[0][word4 & 0xff];
#if N > 5
crc5 = crc_braid_table[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
for (k = 1; k < W; k++) {
crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
}
}
 
/*
This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
integer pointer type. This violates the strict aliasing rule, where a
compiler can assume, for optimization purposes, that two pointers to
fundamentally different types won't ever point to the same memory. This can
manifest as a problem only if one of the pointers is written to. This code
only reads from those pointers. So long as this code remains isolated in
this compilation unit, there won't be a problem. For this reason, this code
should not be copied and pasted into a compilation unit in which other code
writes to the buffer that is passed to these routines.
*/
/*
Process the last block, combining the CRCs of the N braids at the
same time.
*/
crc = crc_word(crc0 ^ words[0]);
#if N > 1
crc = crc_word(crc1 ^ words[1] ^ crc);
#if N > 2
crc = crc_word(crc2 ^ words[2] ^ crc);
#if N > 3
crc = crc_word(crc3 ^ words[3] ^ crc);
#if N > 4
crc = crc_word(crc4 ^ words[4] ^ crc);
#if N > 5
crc = crc_word(crc5 ^ words[5] ^ crc);
#endif
#endif
#endif
#endif
#endif
words += N;
}
else {
/* Big endian. */
z_word_t crc0, word0, comb;
#if N > 1
z_word_t crc1, word1;
#if N > 2
z_word_t crc2, word2;
#if N > 3
z_word_t crc3, word3;
#if N > 4
z_word_t crc4, word4;
#if N > 5
z_word_t crc5, word5;
#endif
#endif
#endif
#endif
#endif
 
/* ========================================================================= */
#define DOLIT4 c ^= *buf4++; \
c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
/* Initialize the CRC for each braid. */
crc0 = byte_swap(crc);
#if N > 1
crc1 = 0;
#if N > 2
crc2 = 0;
#if N > 3
crc3 = 0;
#if N > 4
crc4 = 0;
#if N > 5
crc5 = 0;
#endif
#endif
#endif
#endif
#endif
 
/* ========================================================================= */
local unsigned long crc32_little(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
{
register z_crc_t c;
register const z_crc_t FAR *buf4;
/*
Process the first blks-1 blocks, computing the CRCs on each braid
independently.
*/
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
#if N > 1
word1 = crc1 ^ words[1];
#if N > 2
word2 = crc2 ^ words[2];
#if N > 3
word3 = crc3 ^ words[3];
#if N > 4
word4 = crc4 ^ words[4];
#if N > 5
word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
words += N;
/* Compute and update the CRC for each word. The loop should
get unrolled. */
crc0 = crc_braid_big_table[0][word0 & 0xff];
#if N > 1
crc1 = crc_braid_big_table[0][word1 & 0xff];
#if N > 2
crc2 = crc_braid_big_table[0][word2 & 0xff];
#if N > 3
crc3 = crc_braid_big_table[0][word3 & 0xff];
#if N > 4
crc4 = crc_braid_big_table[0][word4 & 0xff];
#if N > 5
crc5 = crc_braid_big_table[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
for (k = 1; k < W; k++) {
crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
}
}
 
c = (z_crc_t)crc;
c = ~c;
while (len && ((z_size_t)buf & 3)) {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
len--;
/*
Process the last block, combining the CRCs of the N braids at the
same time.
*/
comb = crc_word_big(crc0 ^ words[0]);
#if N > 1
comb = crc_word_big(crc1 ^ words[1] ^ comb);
#if N > 2
comb = crc_word_big(crc2 ^ words[2] ^ comb);
#if N > 3
comb = crc_word_big(crc3 ^ words[3] ^ comb);
#if N > 4
comb = crc_word_big(crc4 ^ words[4] ^ comb);
#if N > 5
comb = crc_word_big(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
words += N;
crc = byte_swap(comb);
}
/*
Update the pointer to the remaining bytes to process.
*/
buf = (unsigned char const *)words;
}
 
buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
while (len >= 32) {
DOLIT32;
len -= 32;
#endif /* W */
/* Complete the computation of the CRC on any remaining bytes. */
while (len >= 8) {
len -= 8;
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
}
while (len >= 4) {
DOLIT4;
len -= 4;
while (len) {
len--;
crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
}
buf = (const unsigned char FAR *)buf4;
 
if (len) do {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
} while (--len);
c = ~c;
return (unsigned long)c;
/* Return the CRC, post-conditioned. */
return crc ^ 0xffffffff;
}
 
/* ========================================================================= */
#define DOBIG4 c ^= *buf4++; \
c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
/* ========================================================================= */
local unsigned long crc32_big(crc, buf, len)
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
uInt len;
{
register z_crc_t c;
register const z_crc_t FAR *buf4;
c = ZSWAP32((z_crc_t)crc);
c = ~c;
while (len && ((z_size_t)buf & 3)) {
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
len--;
}
buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
while (len >= 32) {
DOBIG32;
len -= 32;
}
while (len >= 4) {
DOBIG4;
len -= 4;
}
buf = (const unsigned char FAR *)buf4;
if (len) do {
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
} while (--len);
c = ~c;
return (unsigned long)(ZSWAP32(c));
return crc32_z(crc, buf, len);
}
 
#endif /* BYFOUR */
/* ========================================================================= */
local uLong crc32_combine_(crc1, crc2, len2)
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
uLong crc1;
uLong crc2;
z_off64_t len2;
{
int n;
#ifdef DYNAMIC_CRC_TABLE
if (crc_table_empty)
make_crc_table();
#endif /* DYNAMIC_CRC_TABLE */
if (len2 > 0)
/* operator for 2^n zeros repeats every GF2_DIM n values */
for (n = 0; len2; n = (n + 1) % GF2_DIM, len2 >>= 1)
if (len2 & 1)
crc1 = gf2_matrix_times(crc_comb[n], crc1);
return crc1 ^ crc2;
return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
}
 
/* ========================================================================= */
Loading
Loading
@@ -443,87 +894,32 @@ uLong ZEXPORT crc32_combine(crc1, crc2, len2)
uLong crc2;
z_off_t len2;
{
return crc32_combine_(crc1, crc2, len2);
}
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
uLong crc1;
uLong crc2;
z_off64_t len2;
{
return crc32_combine_(crc1, crc2, len2);
return crc32_combine64(crc1, crc2, len2);
}
 
/* ========================================================================= */
local void crc32_combine_gen_(op, len2)
z_crc_t *op;
uLong ZEXPORT crc32_combine_gen64(len2)
z_off64_t len2;
{
z_crc_t row;
int j;
unsigned i;
#ifdef DYNAMIC_CRC_TABLE
if (crc_table_empty)
make_crc_table();
#endif /* DYNAMIC_CRC_TABLE */
/* if len2 is zero or negative, return the identity matrix */
if (len2 <= 0) {
row = 1;
for (j = 0; j < GF2_DIM; j++) {
op[j] = row;
row <<= 1;
}
return;
}
/* at least one bit in len2 is set -- find it, and copy the operator
corresponding to that position into op */
i = 0;
for (;;) {
if (len2 & 1) {
for (j = 0; j < GF2_DIM; j++)
op[j] = crc_comb[i][j];
break;
}
len2 >>= 1;
i = (i + 1) % GF2_DIM;
}
/* for each remaining bit set in len2 (if any), multiply op by the operator
corresponding to that position */
for (;;) {
len2 >>= 1;
i = (i + 1) % GF2_DIM;
if (len2 == 0)
break;
if (len2 & 1)
for (j = 0; j < GF2_DIM; j++)
op[j] = gf2_matrix_times(crc_comb[i], op[j]);
}
return x2nmodp(len2, 3);
}
 
/* ========================================================================= */
void ZEXPORT crc32_combine_gen(op, len2)
z_crc_t *op;
uLong ZEXPORT crc32_combine_gen(len2)
z_off_t len2;
{
crc32_combine_gen_(op, len2);
}
void ZEXPORT crc32_combine_gen64(op, len2)
z_crc_t *op;
z_off64_t len2;
{
crc32_combine_gen_(op, len2);
return crc32_combine_gen64(len2);
}
 
/* ========================================================================= */
uLong crc32_combine_op(crc1, crc2, op)
uLong crc1;
uLong crc2;
const z_crc_t *op;
uLong op;
{
return gf2_matrix_times(op, crc1) ^ crc2;
return multmodp(op, crc1) ^ crc2;
}
Source diff could not be displayed: it is too large. Options to address this: view the blob.
File added
Loading
Loading
@@ -1724,7 +1724,7 @@ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
negative, the result has no meaning or utility.
*/
 
ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
/*
Update a running CRC-32 with the bytes buf[0..len-1] and return the
updated CRC-32. If buf is Z_NULL, this function returns the required
Loading
Loading
@@ -1758,19 +1758,17 @@ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
*/
 
/*
ZEXTERN void ZEXPORT crc32_combine_gen OF((z_crc_t op[32], z_off_t len2));
ZEXTERN uLong ZEXPORT crc32_combine_gen OF((z_off_t len2));
 
Generate the operator op corresponding to length len2, to be used with
crc32_combine_op(). op must have room for 32 z_crc_t values. (32 is the
number of bits in the CRC.)
Return the operator corresponding to length len2, to be used with
crc32_combine_op().
*/
 
ZEXTERN uLong ZEXPORT crc32_combine_op OF((uLong crc1, uLong crc2,
const z_crc_t *op));
ZEXTERN uLong ZEXPORT crc32_combine_op OF((uLong crc1, uLong crc2, uLong op));
/*
Give the same result as crc32_combine(), using op in place of len2. op is
generated from len2 by crc32_combine_gen(). This will be faster than
crc32_combine() if the generated op is used many times.
crc32_combine() if the generated op is used more than once.
*/
 
 
Loading
Loading
@@ -1860,7 +1858,7 @@ ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */
ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t));
ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t));
ZEXTERN void ZEXPORT crc32_combine_gen64 OF((z_crc_t *op, z_off64_t));
ZEXTERN uLong ZEXPORT crc32_combine_gen64 OF((z_off64_t));
#endif
 
#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
Loading
Loading
@@ -1888,7 +1886,7 @@ ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */
ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile));
ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
ZEXTERN void ZEXPORT crc32_combine_gen64 OF((z_crc_t *op, z_off_t));
ZEXTERN uLong ZEXPORT crc32_combine_gen64 OF((z_off_t));
# endif
#else
ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *));
Loading
Loading
@@ -1897,14 +1895,14 @@ ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */
ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile));
ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
ZEXTERN void ZEXPORT crc32_combine_gen OF((z_crc_t *op, z_off_t));
ZEXTERN uLong ZEXPORT crc32_combine_gen OF((z_off_t));
#endif
 
#else /* Z_SOLO */
 
ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
ZEXTERN void ZEXPORT crc32_combine_gen OF((z_crc_t *op, z_off_t));
ZEXTERN uLong ZEXPORT crc32_combine_gen OF((z_off_t));
 
#endif /* !Z_SOLO */
 
Loading
Loading
Loading
Loading
@@ -42,6 +42,17 @@ typedef unsigned short ush;
typedef ush FAR ushf;
typedef unsigned long ulg;
 
/* Pick Z_U8: the first of unsigned long, unsigned long long, unsigned whose
   maximum value is 0xffffffffffffffff. The search is skipped when Z_U8 is
   already defined (so a build can supply its own), when Z_SOLO is defined, or
   when STDC is not defined. If no candidate matches, Z_U8 is left undefined. */
#if !defined(Z_U8) && !defined(Z_SOLO) && defined(STDC)
# include <limits.h>
/* unsigned long is tried first */
# if (ULONG_MAX == 0xffffffffffffffffULL)
# define Z_U8 unsigned long
/* NOTE(review): if <limits.h> does not define ULLONG_MAX, the preprocessor
   evaluates it as 0 and this branch is simply not taken -- confirm for the
   oldest compilers supported */
# elif (ULLONG_MAX == 0xffffffffffffffffULL)
# define Z_U8 unsigned long long
# elif (UINT_MAX == 0xffffffffffffffffULL)
# define Z_U8 unsigned
# endif
#endif
extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
/* (size given to avoid silly warnings with Visual C++) */
 
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment