From 3c1c63e2c8620aeb552aba19374c7af134bb63fd Mon Sep 17 00:00:00 2001 From: "nick.j.sanders" Date: Tue, 11 Feb 2014 05:57:33 +0000 Subject: [PATCH] Add NEON checksum and some bugfixes * Add NEON copy and checksum for "-W" on ARM * Fix timer overflow for long runs under 32 bit. * Fix assert on checksum failure without miscompare. * Improve checksum error printout. --- src/adler32memcpy.cc | 119 ++++++++++++++++++++++++++++++++++++++++++- src/os.cc | 11 ++-- src/os.h | 8 +-- src/sat.cc | 2 +- src/sattypes.h | 2 + src/worker.cc | 10 ++-- src/worker.h | 8 +-- 7 files changed, 141 insertions(+), 19 deletions(-) diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc index 69324f7..47c6262 100644 --- a/src/adler32memcpy.cc +++ b/src/adler32memcpy.cc @@ -70,7 +70,7 @@ bool AdlerChecksum::Equals(const AdlerChecksum &other) const { // Returns string representation of the Adler checksum. string AdlerChecksum::ToHexString() const { char buffer[128]; - snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_); + snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_); return string(buffer); } @@ -395,11 +395,128 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, checksum_arr[2], checksum_arr[3]); } + // Everything went fine, so return true (this does not mean + // that there is no problem with memory this just mean that data was copied + // from src to dst and checksum was calculated successfully). + return true; +#elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__) + // Elements 0 to 3 are used for holding checksum terms a1, a2, + // b1, b2 respectively. These elements are filled by asm code. + // Checksum is seeded with the null checksum. + volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = + {1, 1, 0, 0}; + + if ((size_in_bytes >> 19) > 0) { + // Size is too large. Must be less than 2^19 bytes = 512 KB. 
+ return false; + } + + // Since we are moving 64 bytes at a time number of iterations = total size/64 + uint32 blocks = size_in_bytes / 64; + + uint64 *dst = dstmem64; + uint64 *src = srcmem64; + + #define src_r "r3" + #define dst_r "r4" + #define blocks_r "r5" + #define crc_r "r6" + + asm volatile ( + "mov "src_r", %[src]; \n" + "mov "dst_r", %[dst]; \n" + "mov "crc_r", %[crc]; \n" + "mov "blocks_r", %[blocks]; \n" + + // Loop over block count. + "cmp "blocks_r", #0; \n" // Compare counter to zero. + "ble END; \n" + + + // Preload upcoming cacheline. + "pld ["src_r", #0x0]; \n" + "pld ["src_r", #0x20]; \n" + + // Init checksum + "vldm "crc_r", {q0}; \n" + "vmov.i32 q1, #0; \n" + + // Start of the loop which copies 64 bytes from source to dst each time. + "TOP: \n" + + // Make 4 moves each of 16 bytes from srcmem to qX registers. + // We are using 2 words out of 4 words in each qX register, + // word index 0 and word index 2. We'll swizzle them in a bit. + // Copy it. + "vldm "src_r"!, {q8, q9, q10, q11}; \n" + "vstm "dst_r"!, {q8, q9, q10, q11}; \n" + + // Arrange it. + "vmov.i64 q12, #0; \n" + "vmov.i64 q13, #0; \n" + "vmov.i64 q14, #0; \n" + "vmov.i64 q15, #0; \n" + // This exchanges words 1,3 in the filled registers with + // words 0,2 in the empty registers. + "vtrn.32 q8, q12; \n" + "vtrn.32 q9, q13; \n" + "vtrn.32 q10, q14; \n" + "vtrn.32 q11, q15; \n" + + // Sum into q0, then into q1. + // Repeat this for q8 - q15. + // Overflow can occur only if there are more + // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so + // if size_in_bytes > 2^19 then overflow occurs. 
+ "vadd.i64 q0, q0, q8; \n" + "vadd.i64 q1, q1, q0; \n" + "vadd.i64 q0, q0, q12; \n" + "vadd.i64 q1, q1, q0; \n" + "vadd.i64 q0, q0, q9; \n" + "vadd.i64 q1, q1, q0; \n" + "vadd.i64 q0, q0, q13; \n" + "vadd.i64 q1, q1, q0; \n" + + "vadd.i64 q0, q0, q10; \n" + "vadd.i64 q1, q1, q0; \n" + "vadd.i64 q0, q0, q14; \n" + "vadd.i64 q1, q1, q0; \n" + "vadd.i64 q0, q0, q11; \n" + "vadd.i64 q1, q1, q0; \n" + "vadd.i64 q0, q0, q15; \n" + "vadd.i64 q1, q1, q0; \n" + + // Increment counter and loop. + "sub "blocks_r", "blocks_r", #1; \n" + "cmp "blocks_r", #0; \n" // Compare counter to zero. + "bgt TOP; \n" + + + "END:\n" + // Report checksum values A and B (both right now are two concatenated + // 64 bit numbers and have to be converted to 64 bit numbers) + // seems like Adler128 (since size of each part is 4 byte rather than + // 1 byte). + "vstm "crc_r", {q0, q1}; \n" + + // Output registers. + : + // Input registers. + : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr) + : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15" + ); // asm. + + if (checksum != NULL) { + checksum->Set(checksum_arr[0], checksum_arr[1], + checksum_arr[2], checksum_arr[3]); + } + // Everything went fine, so return true (this does not mean // that there is no problem with memory this just mean that data was copied // from src to dst and checksum was calculated successfully). return true; #else + #warning "No vector copy defined for this architecture." // Fall back to C implementation for anything else. 
return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); #endif diff --git a/src/os.cc b/src/os.cc index 6358398..7c4e3d1 100644 --- a/src/os.cc +++ b/src/os.cc @@ -79,7 +79,7 @@ OsLayer::OsLayer() { address_mode_ = sizeof(pvoid) * 8; has_clflush_ = false; - has_sse2_ = false; + has_vector_ = false; use_flush_page_cache_ = false; @@ -183,15 +183,18 @@ void OsLayer::GetFeatures() { unsigned int eax = 1, ebx, ecx, edx; cpuid(&eax, &ebx, &ecx, &edx); has_clflush_ = (edx >> 19) & 1; - has_sse2_ = (edx >> 26) & 1; + has_vector_ = (edx >> 26) & 1; // SSE2 caps bit. logprintf(9, "Log: has clflush: %s, has sse2: %s\n", has_clflush_ ? "true" : "false", - has_sse2_ ? "true" : "false"); + has_vector_ ? "true" : "false"); #elif defined(STRESSAPPTEST_CPU_PPC) // All PPC implementations have cache flush instructions. has_clflush_ = true; #elif defined(STRESSAPPTEST_CPU_ARMV7A) + // TODO(nsanders): add detect from /proc/cpuinfo or /proc/self/auxv. + // For now assume neon and don't run -W if you don't have it. + has_vector_ = true; // NEON. #warning "Unsupported CPU type ARMV7A: unable to determine feature set." #else #warning "Unsupported CPU type: unable to determine feature set." @@ -253,7 +256,7 @@ void OsLayer::Flush(void *vaddr) { bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem, unsigned int size_in_bytes, AdlerChecksum *checksum) { - if (has_sse2_) { + if (has_vector_) { return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum); } else { return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum); diff --git a/src/os.h b/src/os.h index 13660d8..2272e4d 100644 --- a/src/os.h +++ b/src/os.h @@ -17,6 +17,7 @@ #define STRESSAPPTEST_OS_H_ #include +#include #include #include @@ -153,7 +154,6 @@ class OsLayer { asm volatile("clflush (%0)" : : "r" (vaddr)); asm volatile("mfence"); #elif defined(STRESSAPPTEST_CPU_ARMV7A) - #warning "Unsupported CPU type ARMV7A: Using syscall to cache flush." // ARMv7a cachelines are 8 words (32 bytes). 
syscall(__ARM_NR_cacheflush, vaddr, reinterpret_cast(vaddr) + 32, 0); #else @@ -267,10 +267,10 @@ class OsLayer { __asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h)); tsc = data.l64; #elif defined(STRESSAPPTEST_CPU_ARMV7A) - #warning "Unsupported CPU type ARMV7A: your build may not function correctly" + #warning "Unsupported CPU type ARMV7A: your timer may not function correctly" tsc = 0; #else - #warning "Unsupported CPU type: your build may not function correctly" + #warning "Unsupported CPU type: your timer may not function correctly" tsc = 0; #endif return (tsc); @@ -381,7 +381,7 @@ class OsLayer { int num_nodes_; // Number of nodes in the system. int num_cpus_per_node_; // Number of cpus per node in the system. int address_mode_; // Are we running 32 or 64 bit? - bool has_sse2_; // Do we have sse2 instructions? + bool has_vector_; // Do we have sse2/neon instructions? bool has_clflush_; // Do we have clflush instructions? bool use_flush_page_cache_; // Do we need to flush the page cache? diff --git a/src/sat.cc b/src/sat.cc index 57fd4fe..56c6b66 100644 --- a/src/sat.cc +++ b/src/sat.cc @@ -1614,7 +1614,7 @@ void Sat::AnalysisAllStats() { map_it != workers_map_.end(); ++map_it) { for (WorkerVector::const_iterator it = map_it->second->begin(); it != map_it->second->end(); ++it) { - thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000; + thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000.; total_data += (*it)->GetMemoryCopiedData(); total_data += (*it)->GetDeviceCopiedData(); if (thread_runtime_sec > max_runtime_sec) { diff --git a/src/sattypes.h b/src/sattypes.h index e51db31..79bb47d 100644 --- a/src/sattypes.h +++ b/src/sattypes.h @@ -225,6 +225,8 @@ inline void cpuid( #endif // defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686) #elif defined(STRESSAPPTEST_CPU_PPC) return; +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + return; #else #warning "Unsupported CPU type." 
#endif diff --git a/src/worker.cc b/src/worker.cc index dcffd4e..0864661 100644 --- a/src/worker.cc +++ b/src/worker.cc @@ -1359,10 +1359,10 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe, blocksize, currentblock * blocksize, 0); if (errorcount == 0) { - logprintf(0, "Log: CrcWarmCopyPage CRC mismatch %s != %s, " + logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != actual: %s, " "but no miscompares found. Retrying with fresh data.\n", - crc.ToHexString().c_str(), - expectedcrc->ToHexString().c_str()); + expectedcrc->ToHexString().c_str(), + crc.ToHexString().c_str() ); if (!tag_mode_) { // Copy the data originally read from this region back again. // This data should have any corruption read originally while @@ -1382,7 +1382,7 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe, expectedcrc->ToHexString().c_str()); struct ErrorRecord er; er.actual = sourcemem[0]; - er.expected = 0x0; + er.expected = 0xbad; er.vaddr = sourcemem; ProcessError(&er, 0, "Hardware Error"); } @@ -1954,7 +1954,7 @@ bool FileThread::Work() { // Load patterns into page records. page_recs_ = new struct PageRec[sat_->disk_pages()]; for (int i = 0; i < sat_->disk_pages(); i++) { - page_recs_[i].pattern = new struct Pattern(); + page_recs_[i].pattern = new class Pattern(); } // Loop until done. diff --git a/src/worker.h b/src/worker.h index 6f9fde7..091d96b 100644 --- a/src/worker.h +++ b/src/worker.h @@ -240,7 +240,7 @@ class WorkerThread { int64 ReadThreadTimer() { struct timeval end_time_; gettimeofday(&end_time_, NULL); - return (end_time_.tv_sec - start_time_.tv_sec)*1000000 + + return (end_time_.tv_sec - start_time_.tv_sec)*1000000ULL + (end_time_.tv_usec - start_time_.tv_usec); } // Stops per-WorkerThread timer and records thread run duration. @@ -264,10 +264,10 @@ class WorkerThread { // Calculate worker thread specific bandwidth. 
virtual float GetMemoryBandwidth() {return GetMemoryCopiedData() / ( - runduration_usec_ * 1.0 / 1000000);} + runduration_usec_ * 1.0 / 1000000.);} virtual float GetDeviceBandwidth() {return GetDeviceCopiedData() / ( - runduration_usec_ * 1.0 / 1000000);} + runduration_usec_ * 1.0 / 1000000.);} void set_cpu_mask(cpu_set_t *mask) { memcpy(&cpu_mask_, mask, sizeof(*mask)); @@ -421,7 +421,7 @@ class FileThread : public WorkerThread { // Record of where these pages were sourced from, and what // potentially broken components they passed through. struct PageRec { - struct Pattern *pattern; // This is the data it should contain. + class Pattern *pattern; // This is the data it should contain. void *src; // This is the memory location the data was sourced from. void *dst; // This is where it ended up. }; -- 2.30.2