Add NEON checksum and some bugfixes

author nick.j.sanders <nick.j.sanders@gmail.com>

Tue, 11 Feb 2014 05:57:33 +0000 (05:57 +0000)

committer nick.j.sanders <nick.j.sanders@gmail.com>

Tue, 11 Feb 2014 05:57:33 +0000 (05:57 +0000)
author nick.j.sanders <nick.j.sanders@gmail.com>
Tue, 11 Feb 2014 05:57:33 +0000 (05:57 +0000)
committer nick.j.sanders <nick.j.sanders@gmail.com>
Tue, 11 Feb 2014 05:57:33 +0000 (05:57 +0000)
diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc

index 69324f7b9a104baef20bb842f5921245aca79489..47c62628f512ea0d09db5554a92a0ba8f4cd479d 100644 (file)
--- a/src/adler32memcpy.cc
+++ b/src/adler32memcpy.cc
@@ -70,7 +70,7 @@ bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
  // Returns string representation of the Adler checksum.
  string AdlerChecksum::ToHexString() const {
    char buffer[128];
-  snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_);
+  snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
    return string(buffer);
  }
  
@@ -395,11 +395,128 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
                    checksum_arr[2], checksum_arr[3]);
    }
  
+  // Everything went fine, so return true (this does not mean
+  // that there is no problem with memory; it just means that data was copied
+  // from src to dst and checksum was calculated successfully).
+  return true;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
+  // Elements 0 to 3 are used for holding checksum terms a1, a2,
+  // b1, b2 respectively. These elements are filled by asm code.
+  // Checksum is seeded with the null checksum.
+  volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
+      {1, 1, 0, 0};
+
+  if ((size_in_bytes >> 19) > 0) {
+    // Size is too large. Must be less than 2^19 bytes = 512 KB.
+    return false;
+  }
+
+  // Since we are moving 64 bytes at a time number of iterations = total size/64
+  uint32 blocks = size_in_bytes / 64;
+
+  uint64 *dst = dstmem64;
+  uint64 *src = srcmem64;
+
+  #define src_r "r3"
+  #define dst_r "r4"
+  #define blocks_r "r5"
+  #define crc_r "r6"
+
+  asm volatile (
+      "mov "src_r", %[src];            \n"
+      "mov "dst_r", %[dst];            \n"
+      "mov "crc_r", %[crc];            \n"
+      "mov "blocks_r", %[blocks];      \n"
+
+      // Loop over block count.
+      "cmp "blocks_r", #0;     \n"   // Compare counter to zero.
+      "ble END;                        \n"
+
+
+      // Preload upcoming cacheline.
+      "pld ["src_r", #0x0];    \n"
+      "pld ["src_r", #0x20];   \n"
+
+      // Init checksum
+      "vldm "crc_r", {q0};             \n"
+      "vmov.i32 q1, #0;                        \n"
+
+      // Start of the loop which copies 64 bytes from source to dst each time.
+      "TOP:                    \n"
+
+      // Load 4 qX registers of 16 bytes each (64 bytes) from srcmem and
+      // store them to dstmem. For the checksum we use 2 words out of 4
+      // in each qX register, word index 0 and word index 2; the transpose
+      // below moves words 1 and 3 into separate registers.
+      "vldm "src_r"!, {q8, q9, q10, q11};      \n"
+      "vstm "dst_r"!, {q8, q9, q10, q11};      \n"
+
+      // Arrange it.
+      "vmov.i64 q12, #0;       \n"
+      "vmov.i64 q13, #0;       \n"
+      "vmov.i64 q14, #0;       \n"
+      "vmov.i64 q15, #0;       \n"
+      // This exchanges words 1,3 in the filled registers with
+      // words 0,2 in the empty registers.
+      "vtrn.32 q8, q12;                \n"
+      "vtrn.32 q9, q13;                \n"
+      "vtrn.32 q10, q14;       \n"
+      "vtrn.32 q11, q15;       \n"
+
+      // Sum into q0, then into q1.
+      // Repeat this for q8 - q15.
+      // Overflow can occur only if there are more
+      // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
+      // if size_in_bytes > 2^19 then overflow occurs.
+      "vadd.i64 q0, q0, q8;    \n"
+      "vadd.i64 q1, q1, q0;    \n"
+      "vadd.i64 q0, q0, q12;   \n"
+      "vadd.i64 q1, q1, q0;    \n"
+      "vadd.i64 q0, q0, q9;    \n"
+      "vadd.i64 q1, q1, q0;    \n"
+      "vadd.i64 q0, q0, q13;   \n"
+      "vadd.i64 q1, q1, q0;    \n"
+
+      "vadd.i64 q0, q0, q10;   \n"
+      "vadd.i64 q1, q1, q0;    \n"
+      "vadd.i64 q0, q0, q14;   \n"
+      "vadd.i64 q1, q1, q0;    \n"
+      "vadd.i64 q0, q0, q11;   \n"
+      "vadd.i64 q1, q1, q0;    \n"
+      "vadd.i64 q0, q0, q15;   \n"
+      "vadd.i64 q1, q1, q0;    \n"
+
+      // Decrement counter and loop.
+      "sub "blocks_r", "blocks_r", #1; \n"
+      "cmp "blocks_r", #0;     \n"   // Compare counter to zero.
+      "bgt TOP;        \n"
+
+
+      "END:\n"
+      // Report checksum values A and B (both right now are two concatenated
+      // 64 bit numbers and have to be converted to 64 bit numbers)
+      // seems like Adler128 (since size of each part is 4 byte rather than
+      // 1 byte).
+      "vstm "crc_r", {q0, q1}; \n"
+
+      // Output registers.
+      :
+      // Input registers.
+      : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
+      : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
+  );  // asm.
+
+  if (checksum != NULL) {
+    checksum->Set(checksum_arr[0], checksum_arr[1],
+                  checksum_arr[2], checksum_arr[3]);
+  }
+
    // Everything went fine, so return true (this does not mean
    // that there is no problem with memory this just mean that data was copied
    // from src to dst and checksum was calculated successfully).
    return true;
  #else
+  #warning "No vector copy defined for this architecture."
    // Fall back to C implementation for anything else.
    return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
  #endif
diff --git a/src/os.cc b/src/os.cc

index 63583984984f050b92c6e8c3ff935d9a8be39a0c..7c4e3d132c5149f1f1ca611577e61e4cab4a1f0f 100644 (file)
--- a/src/os.cc
+++ b/src/os.cc
@@ -79,7 +79,7 @@ OsLayer::OsLayer() {
    address_mode_ = sizeof(pvoid) * 8;
  
    has_clflush_ = false;
-  has_sse2_ = false;
+  has_vector_ = false;
  
    use_flush_page_cache_ = false;
  
@@ -183,15 +183,18 @@ void OsLayer::GetFeatures() {
    unsigned int eax = 1, ebx, ecx, edx;
    cpuid(&eax, &ebx, &ecx, &edx);
    has_clflush_ = (edx >> 19) & 1;
-  has_sse2_ = (edx >> 26) & 1;
+  has_vector_ = (edx >> 26) & 1;  // SSE2 caps bit.
  
    logprintf(9, "Log: has clflush: %s, has sse2: %s\n",
              has_clflush_ ? "true" : "false",
-            has_sse2_ ? "true" : "false");
+            has_vector_ ? "true" : "false");
  #elif defined(STRESSAPPTEST_CPU_PPC)
    // All PPC implementations have cache flush instructions.
    has_clflush_ = true;
  #elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  // TODO(nsanders): add detect from /proc/cpuinfo or /proc/self/auxv.
+  // For now assume neon and don't run -W if you don't have it.
+  has_vector_ = true; // NEON.
  #warning "Unsupported CPU type ARMV7A: unable to determine feature set."
  #else
  #warning "Unsupported CPU type: unable to determine feature set."
@@ -253,7 +256,7 @@ void OsLayer::Flush(void *vaddr) {
  bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
                                unsigned int size_in_bytes,
                                AdlerChecksum *checksum) {
-  if (has_sse2_) {
+  if (has_vector_) {
      return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum);
    } else {
      return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);
diff --git a/src/os.h b/src/os.h

index 13660d8e7ead63b7ea16ff1ba8e9c84de08d41f7..2272e4d8d9c4d0eeb23513bae44db07e74427e38 100644 (file)
--- a/src/os.h
+++ b/src/os.h
@@ -17,6 +17,7 @@
  #define STRESSAPPTEST_OS_H_
  
  #include <dirent.h>
+#include <unistd.h>
  #include <sys/syscall.h>
  
  #include <string>
@@ -153,7 +154,6 @@ class OsLayer {
      asm volatile("clflush (%0)" : : "r" (vaddr));
      asm volatile("mfence");
  #elif defined(STRESSAPPTEST_CPU_ARMV7A)
-    #warning "Unsupported CPU type ARMV7A: Using syscall to cache flush."
      // ARMv7a cachelines are 8 words (32 bytes).
      syscall(__ARM_NR_cacheflush, vaddr, reinterpret_cast<char*>(vaddr) + 32, 0);
  #else
@@ -267,10 +267,10 @@ class OsLayer {
      __asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h));
      tsc = data.l64;
  #elif defined(STRESSAPPTEST_CPU_ARMV7A)
-  #warning "Unsupported CPU type ARMV7A: your build may not function correctly"
+    #warning "Unsupported CPU type ARMV7A: your timer may not function correctly"
      tsc = 0;
  #else
-  #warning "Unsupported CPU type: your build may not function correctly"
+    #warning "Unsupported CPU type: your timer may not function correctly"
      tsc = 0;
  #endif
      return (tsc);
@@ -381,7 +381,7 @@ class OsLayer {
    int   num_nodes_;              // Number of nodes in the system.
    int   num_cpus_per_node_;      // Number of cpus per node in the system.
    int   address_mode_;           // Are we running 32 or 64 bit?
-  bool  has_sse2_;               // Do we have sse2 instructions?
+  bool  has_vector_;             // Do we have sse2/neon instructions?
    bool  has_clflush_;            // Do we have clflush instructions?
    bool  use_flush_page_cache_;   // Do we need to flush the page cache?
  
diff --git a/src/sat.cc b/src/sat.cc

index 57fd4fe02fac5b520f90c5e0cb5ce4718527e111..56c6b6695162c533a3508f559cf4a39f2b736af8 100644 (file)
--- a/src/sat.cc
+++ b/src/sat.cc
@@ -1614,7 +1614,7 @@ void Sat::AnalysisAllStats() {
         map_it != workers_map_.end(); ++map_it) {
      for (WorkerVector::const_iterator it = map_it->second->begin();
           it != map_it->second->end(); ++it) {
-      thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000;
+      thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000.;
        total_data += (*it)->GetMemoryCopiedData();
        total_data += (*it)->GetDeviceCopiedData();
        if (thread_runtime_sec > max_runtime_sec) {
diff --git a/src/sattypes.h b/src/sattypes.h

index e51db318b44b9fe7b1be0cc41cc5a223b568517f..79bb47dc8071dc182b4538d2b52648fa2c600650 100644 (file)
--- a/src/sattypes.h
+++ b/src/sattypes.h
@@ -225,6 +225,8 @@ inline void cpuid(
  #endif  // defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686)
  #elif defined(STRESSAPPTEST_CPU_PPC)
    return;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  return;
  #else
  #warning "Unsupported CPU type."
  #endif
diff --git a/src/worker.cc b/src/worker.cc

index dcffd4e77893df93d0aaa60d167e7c093ea0b272..0864661f2892f663482e39bf1306718c4f523083 100644 (file)
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -1359,10 +1359,10 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
                                     blocksize,
                                     currentblock * blocksize, 0);
        if (errorcount == 0) {
-        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch %s != %s, "
+        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != actual: %s, "
                       "but no miscompares found. Retrying with fresh data.\n",
-                  crc.ToHexString().c_str(),
-                  expectedcrc->ToHexString().c_str());
+                  expectedcrc->ToHexString().c_str(),
+                  crc.ToHexString().c_str() );
          if (!tag_mode_) {
            // Copy the data originally read from this region back again.
            // This data should have any corruption read originally while
@@ -1382,7 +1382,7 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
                        expectedcrc->ToHexString().c_str());
              struct ErrorRecord er;
              er.actual = sourcemem[0];
-            er.expected = 0x0;
+            er.expected = 0xbad;
              er.vaddr = sourcemem;
              ProcessError(&er, 0, "Hardware Error");
            }
@@ -1954,7 +1954,7 @@ bool FileThread::Work() {
    // Load patterns into page records.
    page_recs_ = new struct PageRec[sat_->disk_pages()];
    for (int i = 0; i < sat_->disk_pages(); i++) {
-    page_recs_[i].pattern = new struct Pattern();
+    page_recs_[i].pattern = new class Pattern();
    }
  
    // Loop until done.
diff --git a/src/worker.h b/src/worker.h

index 6f9fde7d1f9ffd6b2c9796b851946d1c963cbfab..091d96b6557533aa000d205c369e2d4d1e603e4f 100644 (file)
--- a/src/worker.h
+++ b/src/worker.h
@@ -240,7 +240,7 @@ class WorkerThread {
    int64 ReadThreadTimer() {
      struct timeval end_time_;
      gettimeofday(&end_time_, NULL);
-    return (end_time_.tv_sec - start_time_.tv_sec)*1000000 +
+    return (end_time_.tv_sec - start_time_.tv_sec)*1000000ULL +
        (end_time_.tv_usec - start_time_.tv_usec);
    }
    // Stops per-WorkerThread timer and records thread run duration.
@@ -264,10 +264,10 @@ class WorkerThread {
    // Calculate worker thread specific bandwidth.
    virtual float GetMemoryBandwidth()
      {return GetMemoryCopiedData() / (
-        runduration_usec_ * 1.0 / 1000000);}
+        runduration_usec_ * 1.0 / 1000000.);}
    virtual float GetDeviceBandwidth()
      {return GetDeviceCopiedData() / (
-        runduration_usec_ * 1.0 / 1000000);}
+        runduration_usec_ * 1.0 / 1000000.);}
  
    void set_cpu_mask(cpu_set_t *mask) {
      memcpy(&cpu_mask_, mask, sizeof(*mask));
@@ -421,7 +421,7 @@ class FileThread : public WorkerThread {
    // Record of where these pages were sourced from, and what
    // potentially broken components they passed through.
    struct PageRec {
-     struct Pattern *pattern;  // This is the data it should contain.
+     class Pattern *pattern;  // This is the data it should contain.
       void *src;  // This is the memory location the data was sourced from.
       void *dst;  // This is where it ended up.
    };
author	nick.j.sanders <nick.j.sanders@gmail.com>
	Tue, 11 Feb 2014 05:57:33 +0000 (05:57 +0000)
committer	nick.j.sanders <nick.j.sanders@gmail.com>
	Tue, 11 Feb 2014 05:57:33 +0000 (05:57 +0000)
src/adler32memcpy.cc		patch \| blob \| history
src/os.cc		patch \| blob \| history
src/os.h		patch \| blob \| history
src/sat.cc		patch \| blob \| history
src/sattypes.h		patch \| blob \| history
src/worker.cc		patch \| blob \| history
src/worker.h		patch \| blob \| history