chiark - git - ian - stressapptest/blob - src/adler32memcpy.cc

   1 // Copyright 2008 Google Inc. All Rights Reserved.
   2
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "adler32memcpy.h"
  16
  17 // We are using (a modified form of) adler-32 checksum algorithm instead
  18 // of CRC since adler-32 is faster than CRC.
  19 // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// This form of Adler is slightly modified: instead of treating the data in
// units of bytes, 32-bit data is taken as a unit and two 64-bit
// checksums are done (we could have one checksum but two checksums
// make the code run faster).
  24
  25 // Adler-32 implementation:
  26 //   Data is treated as 1-byte numbers and,
  27 //   there are two 16-bit numbers a and b
  28 //   Initialize a with 1 and b with 0.
  29 //   for each data unit 'd'
  30 //      a += d
  31 //      b += a
//   checksum = (b << 16) | a
//   a and b are each reduced modulo 65521, so neither sum overflows 16 bits.
  34 //
  35 // Adler-64+64 implementation:
  36 //   (applied in this code)
  37 //   Data is treated as 32-bit numbers and whole data is separated into two
  38 //   streams, and hence the two checksums a1, a2, b1 and b2.
  39 //   Initialize a1 and a2 with 1, b1 and b2 with 0
  40 //   add first dataunit to a1
  41 //   add a1 to b1
  42 //   add second dataunit to a1
  43 //   add a1 to b1
  44 //   add third dataunit to a2
  45 //   add a2 to b2
  46 //   add fourth dataunit to a2
  47 //   add a2 to b2
  48 //   ...
  49 //   repeat the sequence back for next 4 dataunits
  50 //
  51 //   variable A = XMM6 and variable B = XMM7.
  52 //   (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
  53
  54 // Assumptions
  55 // 1. size_in_bytes is a multiple of 16.
  56 // 2. srcmem and dstmem are 16 byte aligned.
  57 // 3. size_in_bytes is less than 2^19 bytes.
  58
  59 // Assumption 3 ensures that there is no overflow when numbers are being
  60 // added (we can remove this assumption by doing modulus with a prime
  61 // number when it is just about to overflow but that would be a very costly
  62 // exercise)
  63
  64 // Returns true if the checksums are equal.
  65 bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
  66   return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
  67            (b1_ == other.b1_) && (b2_ == other.b2_) );
  68 }
  69
  70 // Returns string representation of the Adler checksum.
  71 string AdlerChecksum::ToHexString() const {
  72   char buffer[128];
  73   snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
  74   return string(buffer);
  75 }
  76
  77 // Sets components of the Adler checksum.
  78 void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
  79   a1_ = a1;
  80   a2_ = a2;
  81   b1_ = b1;
  82   b2_ = b2;
  83 }
  84
  85 // Calculates Adler checksum for supplied data.
  86 bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
  87                             AdlerChecksum *checksum) {
  88   // Use this data wrapper to access memory with 64bit read/write.
  89   datacast_t data;
  90   unsigned int count = size_in_bytes / sizeof(data);
  91
  92   if (count > (1U) << 19) {
  93     // Size is too large, must be strictly less than 512 KB.
  94     return false;
  95   }
  96
  97   uint64 a1 = 1;
  98   uint64 a2 = 1;
  99   uint64 b1 = 0;
 100   uint64 b2 = 0;
 101
 102   unsigned int i = 0;
 103   while (i < count) {
 104     // Process 64 bits at a time.
 105     data.l64 = data64[i];
 106     a1 = a1 + data.l32.l;
 107     b1 = b1 + a1;
 108     a1 = a1 + data.l32.h;
 109     b1 = b1 + a1;
 110     i++;
 111
 112     data.l64 = data64[i];
 113     a2 = a2 + data.l32.l;
 114     b2 = b2 + a2;
 115     a2 = a2 + data.l32.h;
 116     b2 = b2 + a2;
 117     i++;
 118   }
 119   checksum->Set(a1, a2, b1, b2);
 120   return true;
 121 }
 122
 123 // C implementation of Adler memory copy.
 124 bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
 125                   unsigned int size_in_bytes, AdlerChecksum *checksum) {
 126   // Use this data wrapper to access memory with 64bit read/write.
 127   datacast_t data;
 128   unsigned int count = size_in_bytes / sizeof(data);
 129
 130   if (count > ((1U) << 19)) {
 131     // Size is too large, must be strictly less than 512 KB.
 132     return false;
 133   }
 134
 135   uint64 a1 = 1;
 136   uint64 a2 = 1;
 137   uint64 b1 = 0;
 138   uint64 b2 = 0;
 139
 140   unsigned int i = 0;
 141   while (i < count) {
 142     // Process 64 bits at a time.
 143     data.l64 = srcmem64[i];
 144     a1 = a1 + data.l32.l;
 145     b1 = b1 + a1;
 146     a1 = a1 + data.l32.h;
 147     b1 = b1 + a1;
 148     dstmem64[i] = data.l64;
 149     i++;
 150
 151     data.l64 = srcmem64[i];
 152     a2 = a2 + data.l32.l;
 153     b2 = b2 + a2;
 154     a2 = a2 + data.l32.h;
 155     b2 = b2 + a2;
 156     dstmem64[i] = data.l64;
 157     i++;
 158   }
 159   checksum->Set(a1, a2, b1, b2);
 160   return true;
 161 }
 162
 163 // C implementation of Adler memory copy with some float point ops,
 164 // attempting to warm up the CPU.
 165 bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
 166                       unsigned int size_in_bytes, AdlerChecksum *checksum) {
 167   // Use this data wrapper to access memory with 64bit read/write.
 168   datacast_t data;
 169   unsigned int count = size_in_bytes / sizeof(data);
 170
 171   if (count > ((1U) << 19)) {
 172     // Size is too large, must be strictly less than 512 KB.
 173     return false;
 174   }
 175
 176   uint64 a1 = 1;
 177   uint64 a2 = 1;
 178   uint64 b1 = 0;
 179   uint64 b2 = 0;
 180
 181   double a = 2.0 * static_cast<double>(srcmem64[0]);
 182   double b = 5.0 * static_cast<double>(srcmem64[0]);
 183   double c = 7.0 * static_cast<double>(srcmem64[0]);
 184   double d = 9.0 * static_cast<double>(srcmem64[0]);
 185
 186   unsigned int i = 0;
 187   while (i < count) {
 188     // Process 64 bits at a time.
 189     data.l64 = srcmem64[i];
 190     a1 = a1 + data.l32.l;
 191     b1 = b1 + a1;
 192     a1 = a1 + data.l32.h;
 193     b1 = b1 + a1;
 194     dstmem64[i] = data.l64;
 195     i++;
 196
 197     // Warm cpu up.
 198     a = a * b;
 199     b = b + c;
 200
 201     data.l64 = srcmem64[i];
 202     a2 = a2 + data.l32.l;
 203     b2 = b2 + a2;
 204     a2 = a2 + data.l32.h;
 205     b2 = b2 + a2;
 206     dstmem64[i] = data.l64;
 207     i++;
 208
 209     // Warm cpu up.
 210     c = c * d;
 211     d = d + d;
 212   }
 213
 214   // Warm cpu up.
 215   d = a + b + c + d;
 216   if (d == 1.0) {
 217     // Reference the result so that it can't be discarded by the compiler.
 218     printf("Log: This will probably never happen.\n");
 219   }
 220
 221   checksum->Set(a1, a2, b1, b2);
 222   return true;
 223 }
 224
 225 // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
 226 bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
 227                     unsigned int size_in_bytes, AdlerChecksum *checksum) {
 228 // Use assembly implementation where supported.
 229 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
 230
 231 // Pull a bit of tricky preprocessing to make the inline asm both
 232 // 32 bit and 64 bit.
 233 #ifdef STRESSAPPTEST_CPU_I686  // Instead of coding both, x86...
 234 #define rAX "%%eax"
 235 #define rCX "%%ecx"
 236 #define rDX "%%edx"
 237 #define rBX "%%ebx"
 238 #define rSP "%%esp"
 239 #define rBP "%%ebp"
 240 #define rSI "%%esi"
 241 #define rDI "%%edi"
 242 #endif
 243
 244 #ifdef STRESSAPPTEST_CPU_X86_64  // ...and x64, we use rXX macros.
 245 #define rAX "%%rax"
 246 #define rCX "%%rcx"
 247 #define rDX "%%rdx"
 248 #define rBX "%%rbx"
 249 #define rSP "%%rsp"
 250 #define rBP "%%rbp"
 251 #define rSI "%%rsi"
 252 #define rDI "%%rdi"
 253 #endif
 254
 255   // Elements 0 to 3 are used for holding checksum terms a1, a2,
 256   // b1, b2 respectively. These elements are filled by asm code.
 257   // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
 258   // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
 259   // setting word index 0 and word index 2 to zero).
 260   // Element 6 and 7 are used for setting a1 and a2 to 1.
 261   volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
 262       {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
 263
 264   if ((size_in_bytes >> 19) > 0) {
 265     // Size is too large. Must be less than 2^19 bytes = 512 KB.
 266     return false;
 267   }
 268
 269   // Number of 32-bit words which are not added to a1/a2 in the main loop.
 270   uint32 remaining_words = (size_in_bytes % 48) / 4;
 271
 272   // Since we are moving 48 bytes at a time number of iterations = total size/48
 273   // is value of counter.
 274   uint32 num_of_48_byte_units = size_in_bytes / 48;
 275
 276   asm volatile (
 277       // Source address is in ESI (extended source index)
 278       // destination is in EDI (extended destination index)
 279       // and counter is already in ECX (extended counter
 280       // index).
 281       "cmp  $0, " rCX ";"   // Compare counter to zero.
 282       "jz END;"
 283
 284       // XMM6 is initialized with 1 and XMM7 with 0.
 285       "prefetchnta  0(" rSI ");"
 286       "prefetchnta 64(" rSI ");"
 287       "movdqu   48(" rAX "), %%xmm6;"
 288       "xorps      %%xmm7, %%xmm7;"
 289
 290       // Start of the loop which copies 48 bytes from source to dst each time.
 291       "TOP:\n"
 292
 293       // Make 6 moves each of 16 bytes from srcmem to XMM registers.
 294       // We are using 2 words out of 4 words in each XMM register,
 295       // word index 0 and word index 2
 296       "movdqa   0(" rSI "), %%xmm0;"
 297       "movdqu   4(" rSI "), %%xmm1;"  // Be careful to use unaligned move here.
 298       "movdqa  16(" rSI "), %%xmm2;"
 299       "movdqu  20(" rSI "), %%xmm3;"
 300       "movdqa  32(" rSI "), %%xmm4;"
 301       "movdqu  36(" rSI "), %%xmm5;"
 302
 303       // Move 3 * 16 bytes from XMM registers to dstmem.
 304       // Note: this copy must be performed before pinsrw instructions since
 305       // they will modify the XMM registers.
 306       "movntdq %%xmm0,  0(" rDI ");"
 307       "movntdq %%xmm2, 16(" rDI ");"
 308       "movntdq %%xmm4, 32(" rDI ");"
 309
 310       // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
 311       "andps 32(" rAX "), %%xmm0;"
 312       "andps 32(" rAX "), %%xmm1;"
 313       "andps 32(" rAX "), %%xmm2;"
 314       "andps 32(" rAX "), %%xmm3;"
 315       "andps 32(" rAX "), %%xmm4;"
 316       "andps 32(" rAX "), %%xmm5;"
 317
 318       // Add XMM0 to XMM6 and then add XMM6 to XMM7.
 319       // Repeat this for XMM1, ..., XMM5.
 320       // Overflow(for XMM7) can occur only if there are more
 321       // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
 322       // if size_in_bytes > 2^19 than overflow occurs.
 323       "paddq %%xmm0, %%xmm6;"
 324       "paddq %%xmm6, %%xmm7;"
 325       "paddq %%xmm1, %%xmm6;"
 326       "paddq %%xmm6, %%xmm7;"
 327       "paddq %%xmm2, %%xmm6;"
 328       "paddq %%xmm6, %%xmm7;"
 329       "paddq %%xmm3, %%xmm6;"
 330       "paddq %%xmm6, %%xmm7;"
 331       "paddq %%xmm4, %%xmm6;"
 332       "paddq %%xmm6, %%xmm7;"
 333       "paddq %%xmm5, %%xmm6;"
 334       "paddq %%xmm6, %%xmm7;"
 335
 336       // Increment ESI and EDI by 48 bytes and decrement counter by 1.
 337       "add $48, " rSI ";"
 338       "add $48, " rDI ";"
 339       "prefetchnta  0(" rSI ");"
 340       "prefetchnta 64(" rSI ");"
 341       "dec " rCX ";"
 342       "jnz TOP;"
 343
 344       // Now only remaining_words 32-bit words are left.
 345       // make a loop, add first two words to a1 and next two to a2 (just like
 346       // above loop, the only extra thing we are doing is rechecking
 347       // rDX (=remaining_words) everytime we add a number to a1/a2.
 348       "REM_IS_STILL_NOT_ZERO:\n"
 349       // Unless remaining_words becomes less than 4 words(16 bytes)
 350       // there is not much issue and remaining_words will always
 351       // be a multiple of four by assumption.
 352       "cmp $4, " rDX ";"
 353       // In case for some weird reasons if remaining_words becomes
 354       // less than 4 but not zero then also break the code and go off to END.
 355       "jl END;"
 356       // Otherwise just go on and copy data in chunks of 4-words at a time till
 357       // whole data (<48 bytes) is copied.
 358       "movdqa  0(" rSI "), %%xmm0;"    // Copy next 4-words to XMM0 and to XMM1.
 359
 360       "movdqa  0(" rSI "), %%xmm5;"    // Accomplish movdqu 4(%rSI) without
 361       "pshufd $0x39, %%xmm5, %%xmm1;"  // indexing off memory boundary.
 362
 363       "movntdq %%xmm0,  0(" rDI ");"   // Copy 4-words to destination.
 364       "andps  32(" rAX "), %%xmm0;"
 365       "andps  32(" rAX "), %%xmm1;"
 366       "paddq     %%xmm0, %%xmm6;"
 367       "paddq     %%xmm6, %%xmm7;"
 368       "paddq     %%xmm1, %%xmm6;"
 369       "paddq     %%xmm6, %%xmm7;"
 370       "add $16, " rSI ";"
 371       "add $16, " rDI ";"
 372       "sub $4, " rDX ";"
 373       // Decrement %rDX by 4 since %rDX is number of 32-bit
 374       // words left after considering all 48-byte units.
 375       "jmp REM_IS_STILL_NOT_ZERO;"
 376
 377       "END:\n"
 378       // Report checksum values A and B (both right now are two concatenated
 379       // 64 bit numbers and have to be converted to 64 bit numbers)
 380       // seems like Adler128 (since size of each part is 4 byte rather than
 381       // 1 byte).
 382       "movdqa %%xmm6,   0(" rAX ");"
 383       "movdqa %%xmm7,  16(" rAX ");"
 384       "sfence;"
 385
 386       // No output registers.
 387       :
 388       // Input registers.
 389       : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
 390         "c" (num_of_48_byte_units), "d" (remaining_words)
 391   );  // asm.
 392
 393   if (checksum != NULL) {
 394     checksum->Set(checksum_arr[0], checksum_arr[1],
 395                   checksum_arr[2], checksum_arr[3]);
 396   }
 397
 398   // Everything went fine, so return true (this does not mean
 399   // that there is no problem with memory this just mean that data was copied
 400   // from src to dst and checksum was calculated successfully).
 401   return true;
 402 #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
 403   // Elements 0 to 3 are used for holding checksum terms a1, a2,
 404   // b1, b2 respectively. These elements are filled by asm code.
 405   // Checksum is seeded with the null checksum.
 406   volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
 407       {1, 1, 0, 0};
 408
 409   if ((size_in_bytes >> 19) > 0) {
 410     // Size is too large. Must be less than 2^19 bytes = 512 KB.
 411     return false;
 412   }
 413
 414   // Since we are moving 64 bytes at a time number of iterations = total size/64
 415   uint32 blocks = size_in_bytes / 64;
 416
 417   uint64 *dst = dstmem64;
 418   uint64 *src = srcmem64;
 419
 420   #define src_r "r3"
 421   #define dst_r "r4"
 422   #define blocks_r "r5"
 423   #define crc_r "r6"
 424
 425   asm volatile (
 426       "mov " src_r ", %[src];           \n"
 427       "mov " dst_r ", %[dst];           \n"
 428       "mov " crc_r ", %[crc];           \n"
 429       "mov " blocks_r ", %[blocks];     \n"
 430
 431       // Loop over block count.
 432       "cmp " blocks_r ", #0;    \n"   // Compare counter to zero.
 433       "ble END;                 \n"
 434
 435
 436       // Preload upcoming cacheline.
 437       "pld [" src_r ", #0x0];   \n"
 438       "pld [" src_r ", #0x20];  \n"
 439
 440       // Init checksum
 441       "vldm " crc_r ", {q0};            \n"
 442       "vmov.i32 q1, #0;                 \n"
 443
 444       // Start of the loop which copies 48 bytes from source to dst each time.
 445       "TOP:                     \n"
 446
 447       // Make 3 moves each of 16 bytes from srcmem to qX registers.
 448       // We are using 2 words out of 4 words in each qX register,
 449       // word index 0 and word index 2. We'll swizzle them in a bit.
 450       // Copy it.
 451       "vldm " src_r "!, {q8, q9, q10, q11};     \n"
 452       "vstm " dst_r "!, {q8, q9, q10, q11};     \n"
 453
 454       // Arrange it.
 455       "vmov.i64 q12, #0;        \n"
 456       "vmov.i64 q13, #0;        \n"
 457       "vmov.i64 q14, #0;        \n"
 458       "vmov.i64 q15, #0;        \n"
 459       // This exchenges words 1,3 in the filled registers with
 460       // words 0,2 in the empty registers.
 461       "vtrn.32 q8, q12;         \n"
 462       "vtrn.32 q9, q13;         \n"
 463       "vtrn.32 q10, q14;        \n"
 464       "vtrn.32 q11, q15;        \n"
 465
 466       // Sum into q0, then into q1.
 467       // Repeat this for q8 - q13.
 468       // Overflow can occur only if there are more
 469       // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
 470       // if size_in_bytes > 2^19 than overflow occurs.
 471       "vadd.i64 q0, q0, q8;     \n"
 472       "vadd.i64 q1, q1, q0;     \n"
 473       "vadd.i64 q0, q0, q12;    \n"
 474       "vadd.i64 q1, q1, q0;     \n"
 475       "vadd.i64 q0, q0, q9;     \n"
 476       "vadd.i64 q1, q1, q0;     \n"
 477       "vadd.i64 q0, q0, q13;    \n"
 478       "vadd.i64 q1, q1, q0;     \n"
 479
 480       "vadd.i64 q0, q0, q10;    \n"
 481       "vadd.i64 q1, q1, q0;     \n"
 482       "vadd.i64 q0, q0, q14;    \n"
 483       "vadd.i64 q1, q1, q0;     \n"
 484       "vadd.i64 q0, q0, q11;    \n"
 485       "vadd.i64 q1, q1, q0;     \n"
 486       "vadd.i64 q0, q0, q15;    \n"
 487       "vadd.i64 q1, q1, q0;     \n"
 488
 489       // Increment counter and loop.
 490       "sub " blocks_r ", " blocks_r ", #1;      \n"
 491       "cmp " blocks_r ", #0;    \n"   // Compare counter to zero.
 492       "bgt TOP; \n"
 493
 494
 495       "END:\n"
 496       // Report checksum values A and B (both right now are two concatenated
 497       // 64 bit numbers and have to be converted to 64 bit numbers)
 498       // seems like Adler128 (since size of each part is 4 byte rather than
 499       // 1 byte).
 500       "vstm " crc_r ", {q0, q1};        \n"
 501
 502       // Output registers.
 503       :
 504       // Input registers.
 505       : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
 506       : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
 507   );  // asm.
 508
 509   if (checksum != NULL) {
 510     checksum->Set(checksum_arr[0], checksum_arr[1],
 511                   checksum_arr[2], checksum_arr[3]);
 512   }
 513
 514   // Everything went fine, so return true (this does not mean
 515   // that there is no problem with memory this just mean that data was copied
 516   // from src to dst and checksum was calculated successfully).
 517   return true;
 518 #else
 519   #warning "No vector copy defined for this architecture."
 520   // Fall back to C implementation for anything else.
 521   return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
 522 #endif
 523 }