1 // Copyright 2008 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "adler32memcpy.h"
17 // We are using (a modified form of) adler-32 checksum algorithm instead
18 // of CRC since adler-32 is faster than CRC.
19 // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
20 // This form of adler is bit modified, instead of treating the data in
21 // units of bytes, 32-bit data is taken as a unit and two 64-bit
22 // checksums are done (we could have one checksum but two checksums
23 // make the code run faster).
25 // Adler-32 implementation:
26 // Data is treated as 1-byte numbers and,
27 // there are two 16-bit numbers a and b
28 // Initialize a with 1 and b with 0.
29 // for each data unit 'd'
32 // checksum = a<<16 + b
33 // This sum should never overflow.
35 // Adler-64+64 implementation:
36 // (applied in this code)
37 // Data is treated as 32-bit numbers and whole data is separated into two
38 // streams, and hence the two checksums a1, a2, b1 and b2.
39 // Initialize a1 and a2 with 1, b1 and b2 with 0
40 // add first dataunit to a1
42 // add second dataunit to a1
44 // add third dataunit to a2
46 // add fourth dataunit to a2
49 // repeat the sequence back for next 4 dataunits
51 // variable A = XMM6 and variable B = XMM7.
52 // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
55 // 1. size_in_bytes is a multiple of 16.
56 // 2. srcmem and dstmem are 16 byte aligned.
57 // 3. size_in_bytes is less than 2^19 bytes.
59 // Assumption 3 ensures that there is no overflow when numbers are being
60 // added (we can remove this assumption by doing modulus with a prime
61 // number when it is just about to overflow but that would be a very costly
64 // Returns true if the checksums are equal.
65 bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
66 return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
67 (b1_ == other.b1_) && (b2_ == other.b2_) );
70 // Returns string representation of the Adler checksum.
71 string AdlerChecksum::ToHexString() const {
73 snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_);
74 return string(buffer);
77 // Sets components of the Adler checksum.
78 void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
85 // Calculates Adler checksum for supplied data.
86 bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
87 AdlerChecksum *checksum) {
88 // Use this data wrapper to access memory with 64bit read/write.
90 unsigned int count = size_in_bytes / sizeof(data);
92 if (count > (1U) << 19) {
93 // Size is too large, must be strictly less than 512 KB.
104 // Process 64 bits at a time.
105 data.l64 = data64[i];
106 a1 = a1 + data.l32.l;
108 a1 = a1 + data.l32.h;
112 data.l64 = data64[i];
113 a2 = a2 + data.l32.l;
115 a2 = a2 + data.l32.h;
119 checksum->Set(a1, a2, b1, b2);
123 // C implementation of Adler memory copy.
124 bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
125 unsigned int size_in_bytes, AdlerChecksum *checksum) {
126 // Use this data wrapper to access memory with 64bit read/write.
128 unsigned int count = size_in_bytes / sizeof(data);
130 if (count > ((1U) << 19)) {
131 // Size is too large, must be strictly less than 512 KB.
142 // Process 64 bits at a time.
143 data.l64 = srcmem64[i];
144 a1 = a1 + data.l32.l;
146 a1 = a1 + data.l32.h;
148 dstmem64[i] = data.l64;
151 data.l64 = srcmem64[i];
152 a2 = a2 + data.l32.l;
154 a2 = a2 + data.l32.h;
156 dstmem64[i] = data.l64;
159 checksum->Set(a1, a2, b1, b2);
163 // C implementation of Adler memory copy with some float point ops,
164 // attempting to warm up the CPU.
165 bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
166 unsigned int size_in_bytes, AdlerChecksum *checksum) {
167 // Use this data wrapper to access memory with 64bit read/write.
169 unsigned int count = size_in_bytes / sizeof(data);
171 if (count > ((1U) << 19)) {
172 // Size is too large, must be strictly less than 512 KB.
181 double a = 2.0 * static_cast<double>(srcmem64[0]);
182 double b = 5.0 * static_cast<double>(srcmem64[0]);
183 double c = 7.0 * static_cast<double>(srcmem64[0]);
184 double d = 9.0 * static_cast<double>(srcmem64[0]);
188 // Process 64 bits at a time.
189 data.l64 = srcmem64[i];
190 a1 = a1 + data.l32.l;
192 a1 = a1 + data.l32.h;
194 dstmem64[i] = data.l64;
201 data.l64 = srcmem64[i];
202 a2 = a2 + data.l32.l;
204 a2 = a2 + data.l32.h;
206 dstmem64[i] = data.l64;
217 // Reference the result so that it can't be discarded by the compiler.
218 printf("Log: This will probably never happen.\n");
221 checksum->Set(a1, a2, b1, b2);
225 // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
226 bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
227 unsigned int size_in_bytes, AdlerChecksum *checksum) {
228 // Use assembly implementation only with 64bit compilation.
229 #ifndef STRESSAPPTEST_CPU_X86_64
230 // Fall back to C implementation for 32bit compilation.
231 return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
233 // Elements 0 to 3 are used for holding checksum terms a1, a2,
234 // b1, b2 respectively. These elements are filled by asm code.
235 // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
236 // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
237 // setting word index 0 and word index 2 to zero).
238 // Element 6 and 7 are used for setting a1 and a2 to 1.
239 volatile uint64 checksum_arr[] = {0, 0, 0, 0,
240 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
242 if ((size_in_bytes >> 19) > 0) {
243 // Size is too large. Must be less than 2^19 bytes = 512 KB.
247 // Number of 32-bit words which are not added to a1/a2 in the main loop.
248 uint64 remaining_words = (size_in_bytes % 48) / 4;
250 // Since we are moving 48 bytes at a time number of iterations = total size/48
251 // is value of counter.
252 uint64 num_of_48_byte_units = size_in_bytes / 48;
255 // Source address is in ESI (extended source index)
256 // destination is in EDI (extended destination index)
257 // and counter is already in ECX (extended counter index).
258 "cmp $0, %%ecx;" // Compare counter to zero.
261 // XMM6 is initialized with 1 and XMM7 with 0.
262 "prefetchnta 0(%%rsi);"
263 "prefetchnta 64(%%rsi);"
264 "movdqu 48(%%rax), %%xmm6;"
265 "xorps %%xmm7, %%xmm7;"
267 // Start of the loop which copies 48 bytes from source to dst each time.
270 // Make 6 moves each of 16 bytes from srcmem to XMM registers.
271 // We are using 2 words out of 4 words in each XMM register,
272 // word index 0 and word index 2)
273 "movdqa 0(%%rsi), %%xmm0;"
274 "movdqu 4(%%rsi), %%xmm1;" // Be careful to use unaligned move here.
275 "movdqa 16(%%rsi), %%xmm2;"
276 "movdqu 20(%%rsi), %%xmm3;"
277 "movdqa 32(%%rsi), %%xmm4;"
278 "movdqu 36(%%rsi), %%xmm5;"
280 // Move 3 * 16 bytes from XMM registers to dstmem.
281 // Note: this copy must be performed before pinsrw instructions since
282 // they will modify the XMM registers.
283 "movntdq %%xmm0, 0(%%rdi);"
284 "movntdq %%xmm2, 16(%%rdi);"
285 "movntdq %%xmm4, 32(%%rdi);"
287 // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
288 "andps 32(%%rax), %%xmm0;"
289 "andps 32(%%rax), %%xmm1;"
290 "andps 32(%%rax), %%xmm2;"
291 "andps 32(%%rax), %%xmm3;"
292 "andps 32(%%rax), %%xmm4;"
293 "andps 32(%%rax), %%xmm5;"
295 // Add XMM0 to XMM6 and then add XMM6 to XMM7.
296 // Repeat this for XMM1, ..., XMM5.
297 // Overflow(for XMM7) can occur only if there are more
298 // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
299 // if size_in_bytes > 2^19 than overflow occurs.
300 "paddq %%xmm0, %%xmm6;"
301 "paddq %%xmm6, %%xmm7;"
302 "paddq %%xmm1, %%xmm6;"
303 "paddq %%xmm6, %%xmm7;"
304 "paddq %%xmm2, %%xmm6;"
305 "paddq %%xmm6, %%xmm7;"
306 "paddq %%xmm3, %%xmm6;"
307 "paddq %%xmm6, %%xmm7;"
308 "paddq %%xmm4, %%xmm6;"
309 "paddq %%xmm6, %%xmm7;"
310 "paddq %%xmm5, %%xmm6;"
311 "paddq %%xmm6, %%xmm7;"
313 // Increment ESI and EDI by 48 bytes and decrement counter by 1.
316 "prefetchnta 0(%%rsi);"
317 "prefetchnta 64(%%rsi);"
321 // Now only remaining_words 32-bit words are left.
322 // make a loop, add first two words to a1 and next two to a2 (just like
323 // above loop, the only extra thing we are doing is rechecking
324 // %rdx (=remaining_words) everytime we add a number to a1/a2.
325 "REM_IS_STILL_NOT_ZERO:\n"
326 // Unless remaining_words becomes less than 4 words(16 bytes)
327 // there is not much issue and remaining_words will always
328 // be a multiple of four by assumption.
330 // In case for some weird reasons if remaining_words becomes
331 // less than 4 but not zero then also break the code and go off to END.
333 // Otherwise just go on and copy data in chunks of 4-words at a time till
334 // whole data (<48 bytes) is copied.
335 "movdqa 0(%%rsi), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1.
337 "movdqa 0(%%rsi), %%xmm5;" // Accomplish movdqu 4(%%rsi) without
338 "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary.
340 "movntdq %%xmm0, 0(%%rdi);" // Copy 4-words to destination.
341 "andps 32(%%rax), %%xmm0;"
342 "andps 32(%%rax), %%xmm1;"
343 "paddq %%xmm0, %%xmm6;"
344 "paddq %%xmm6, %%xmm7;"
345 "paddq %%xmm1, %%xmm6;"
346 "paddq %%xmm6, %%xmm7;"
350 // Decrement %%rdx by 4 since %%rdx is number of 32-bit
351 // words left after considering all 48-byte units.
352 "jmp REM_IS_STILL_NOT_ZERO;"
355 // Report checksum values A and B (both right now are two concatenated
356 // 64 bit numbers and have to be converted to 64 bit numbers)
357 // seems like Adler128 (since size of each part is 4 byte rather than
359 "movdqa %%xmm6, 0(%%rax);"
360 "movdqa %%xmm7, 16(%%rax);"
363 // No output registers.
366 : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
367 "c" (num_of_48_byte_units), "d" (remaining_words)
370 if (checksum != NULL) {
371 checksum->Set(checksum_arr[0], checksum_arr[1],
372 checksum_arr[2], checksum_arr[3]);
375 // Everything went fine, so return true (this does not mean
376 // that there is no problem with memory this just mean that data was copied
377 // from src to dst and checksum was calculated successfully).