chiark - git - ian - stressapptest/blob - src/worker.cc

   1 // Copyright 2006 Google Inc. All Rights Reserved.
   2
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 // worker.cc : individual tasks that can be run in combination to
  16 // stress the system
  17
  18 #include <errno.h>
  19 #include <pthread.h>
  20 #include <sched.h>
  21 #include <signal.h>
  22 #include <stdlib.h>
  23 #include <stdio.h>
  24 #include <stdint.h>
  25 #include <string.h>
  26 #include <time.h>
  27 #include <unistd.h>
  28
  29 #include <sys/select.h>
  30 #include <sys/stat.h>
  31 #include <sys/types.h>
  32 #include <sys/times.h>
  33
  34 // These are necessary, but on by default
  35 // #define __USE_GNU
  36 // #define __USE_LARGEFILE64
  37 #include <fcntl.h>
  38 #include <sys/socket.h>
  39 #include <netdb.h>
  40 #include <arpa/inet.h>
  41 #include <linux/unistd.h>  // for gettid
  42
  43 // For size of block device
  44 #include <sys/ioctl.h>
  45 #include <linux/fs.h>
  46 // For asynchronous I/O
  47 #ifdef HAVE_LIBAIO_H
  48 #include <libaio.h>
  49 #endif
  50
  51 #include <sys/syscall.h>
  52
  53 #include <set>
  54 #include <string>
  55
  56 // This file must work with autoconf on its public version,
  57 // so these includes are correct.
  58 #include "error_diag.h"  // NOLINT
  59 #include "os.h"          // NOLINT
  60 #include "pattern.h"     // NOLINT
  61 #include "queue.h"       // NOLINT
  62 #include "sat.h"         // NOLINT
  63 #include "sattypes.h"    // NOLINT
  64 #include "worker.h"      // NOLINT
  65
// Syscalls
// Why ubuntu, do you hate gettid so bad?
// Fall back to a hard-coded syscall number when the system headers do not
// provide __NR_gettid.
#if !defined(__NR_gettid)
  #define __NR_gettid             224
#endif

// gettid() is issued directly through syscall() with the number above.
#define gettid() syscall(__NR_gettid)
// When CPU_SETSIZE is not defined, generate sched_getaffinity() and
// sched_setaffinity() through the _syscall3 macro.
#if !defined(CPU_SETSIZE)
_syscall3(int, sched_getaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
_syscall3(int, sched_setaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
#endif
  79
  80 namespace {
  // Portable wrapper around strerror_r(), which exists in two incompatible
  // flavours: the XSI one returns an int status (0 on success, -1 on error)
  // and the GNU one returns a char* to the message, which may or may not
  // point into "buf".
  //   Args:
  //     err: errno value to describe.
  //     buf: caller-supplied buffer that receives the NUL-terminated message.
  //     len: size of "buf" in bytes.
  //
  //   Returns true if "buf" holds a message, false otherwise.
  bool sat_strerror(int err, char *buf, int len) {
    // Nothing can be written safely into a missing or zero-sized buffer.
    if (buf == NULL || len <= 0)
      return false;

    buf[0] = 0;
    char *errmsg = reinterpret_cast<char*>(strerror_r(err, buf, len));
    // Keep the full pointer width. Squeezing the result into an "int" could
    // turn a valid GNU-style pointer whose low 32 bits happen to be 0 or
    // 0xffffffff into a bogus XSI-style status code.
    const intptr_t retval = reinterpret_cast<intptr_t>(errmsg);
    if (retval == 0)
      return true;
    if (retval == -1)
      return false;
    // GNU flavour: the message may live in a static string, so copy it over.
    if (errmsg != buf) {
      strncpy(buf, errmsg, len);
      buf[len - 1] = 0;
    }
    return true;
  }
  97
  98
  99   inline uint64 addr_to_tag(void *address) {
 100     return reinterpret_cast<uint64>(address);
 101   }
 102 }  // namespace
 103
#if !defined(O_DIRECT)
// Sometimes this isn't available.
// Disregard if it's not defined: a value of 0 contributes no bits when it is
// combined with other flags.
  #define O_DIRECT            0
#endif
 109
// A struct to hold captured errors, for later reporting.
// The checking code fills in "actual", "expected" and "vaddr" when it records
// a miscompare; the ProcessError() routines then fill in "reread",
// "vbyteaddr" and "paddr" while reporting it.
struct ErrorRecord {
  uint64 actual;  // This is the actual value read.
  uint64 reread;  // This is the value read again from vaddr after a flush.
  uint64 expected;  // This is what it should have been.
  uint64 *vaddr;  // This is where it was (or wasn't).
  char *vbyteaddr;  // This is byte specific where the data was (or wasn't).
  uint64 paddr;  // This is the bus address, if available.
  uint64 *tagvaddr;  // This holds the tag value if this data was tagged.
  uint64 tagpaddr;  // This holds the physical address corresponding to the tag.
};
 121
 122 // This is a helper function to create new threads with pthreads.
 123 static void *ThreadSpawnerGeneric(void *ptr) {
 124   WorkerThread *worker = static_cast<WorkerThread*>(ptr);
 125   worker->StartRoutine();
 126   return NULL;
 127 }
 128
 129 void WorkerStatus::Initialize() {
 130   sat_assert(0 == pthread_mutex_init(&num_workers_mutex_, NULL));
 131   sat_assert(0 == pthread_rwlock_init(&status_rwlock_, NULL));
 132 #ifdef HAVE_PTHREAD_BARRIERS
 133   sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL,
 134                                        num_workers_ + 1));
 135 #endif
 136 }
 137
 138 void WorkerStatus::Destroy() {
 139   sat_assert(0 == pthread_mutex_destroy(&num_workers_mutex_));
 140   sat_assert(0 == pthread_rwlock_destroy(&status_rwlock_));
 141 #ifdef HAVE_PTHREAD_BARRIERS
 142   sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
 143 #endif
 144 }
 145
 146 void WorkerStatus::PauseWorkers() {
 147   if (SetStatus(PAUSE) != PAUSE)
 148     WaitOnPauseBarrier();
 149 }
 150
 151 void WorkerStatus::ResumeWorkers() {
 152   if (SetStatus(RUN) == PAUSE)
 153     WaitOnPauseBarrier();
 154 }
 155
 156 void WorkerStatus::StopWorkers() {
 157   if (SetStatus(STOP) == PAUSE)
 158     WaitOnPauseBarrier();
 159 }
 160
 161 bool WorkerStatus::ContinueRunning(bool *paused) {
 162   // This loop is an optimization.  We use it to immediately re-check the status
 163   // after resuming from a pause, instead of returning and waiting for the next
 164   // call to this function.
 165   if (paused) {
 166     *paused = false;
 167   }
 168   for (;;) {
 169     switch (GetStatus()) {
 170       case RUN:
 171         return true;
 172       case PAUSE:
 173         // Wait for the other workers to call this function so that
 174         // PauseWorkers() can return.
 175         WaitOnPauseBarrier();
 176         // Wait for ResumeWorkers() to be called.
 177         WaitOnPauseBarrier();
 178         // Indicate that a pause occurred.
 179         if (paused) {
 180           *paused = true;
 181         }
 182         break;
 183       case STOP:
 184         return false;
 185     }
 186   }
 187 }
 188
 189 bool WorkerStatus::ContinueRunningNoPause() {
 190   return (GetStatus() != STOP);
 191 }
 192
 193 void WorkerStatus::RemoveSelf() {
 194   // Acquire a read lock on status_rwlock_ while (status_ != PAUSE).
 195   for (;;) {
 196     AcquireStatusReadLock();
 197     if (status_ != PAUSE)
 198       break;
 199     // We need to obey PauseWorkers() just like ContinueRunning() would, so that
 200     // the other threads won't wait on pause_barrier_ forever.
 201     ReleaseStatusLock();
 202     // Wait for the other workers to call this function so that PauseWorkers()
 203     // can return.
 204     WaitOnPauseBarrier();
 205     // Wait for ResumeWorkers() to be called.
 206     WaitOnPauseBarrier();
 207   }
 208
 209   // This lock would be unnecessary if we held a write lock instead of a read
 210   // lock on status_rwlock_, but that would also force all threads calling
 211   // ContinueRunning() to wait on this one.  Using a separate lock avoids that.
 212   AcquireNumWorkersLock();
 213   // Decrement num_workers_ and reinitialize pause_barrier_, which we know isn't
 214   // in use because (status != PAUSE).
 215 #ifdef HAVE_PTHREAD_BARRIERS
 216   sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
 217   sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL, num_workers_));
 218 #endif
 219   --num_workers_;
 220   ReleaseNumWorkersLock();
 221
 222   // Release status_rwlock_.
 223   ReleaseStatusLock();
 224 }
 225
 226
 227 // Parent thread class.
 228 WorkerThread::WorkerThread() {
 229   status_ = false;
 230   pages_copied_ = 0;
 231   errorcount_ = 0;
 232   runduration_usec_ = 1;
 233   priority_ = Normal;
 234   worker_status_ = NULL;
 235   thread_spawner_ = &ThreadSpawnerGeneric;
 236   tag_mode_ = false;
 237 }
 238
 239 WorkerThread::~WorkerThread() {}
 240
 241 // Constructors. Just init some default values.
 242 FillThread::FillThread() {
 243   num_pages_to_fill_ = 0;
 244 }
 245
 246 // Initialize file name to empty.
 247 FileThread::FileThread() {
 248   filename_ = "";
 249   devicename_ = "";
 250   pass_ = 0;
 251   page_io_ = true;
 252   crc_page_ = -1;
 253   local_page_ = NULL;
 254 }
 255
 256 // If file thread used bounce buffer in memory, account for the extra
 257 // copy for memory bandwidth calculation.
 258 float FileThread::GetMemoryCopiedData() {
 259   if (!os_->normal_mem())
 260     return GetCopiedData();
 261   else
 262     return 0;
 263 }
 264
 265 // Initialize target hostname to be invalid.
 266 NetworkThread::NetworkThread() {
 267   snprintf(ipaddr_, sizeof(ipaddr_), "Unknown");
 268   sock_ = 0;
 269 }
 270
 271 // Initialize?
 272 NetworkSlaveThread::NetworkSlaveThread() {
 273 }
 274
 275 // Initialize?
 276 NetworkListenThread::NetworkListenThread() {
 277 }
 278
 279 // Init member variables.
 280 void WorkerThread::InitThread(int thread_num_init,
 281                               class Sat *sat_init,
 282                               class OsLayer *os_init,
 283                               class PatternList *patternlist_init,
 284                               WorkerStatus *worker_status) {
 285   sat_assert(worker_status);
 286   worker_status->AddWorkers(1);
 287
 288   thread_num_ = thread_num_init;
 289   sat_ = sat_init;
 290   os_ = os_init;
 291   patternlist_ = patternlist_init;
 292   worker_status_ = worker_status;
 293
 294   AvailableCpus(&cpu_mask_);
 295   tag_ = 0xffffffff;
 296
 297   tag_mode_ = sat_->tag_mode();
 298 }
 299
 300
 301 // Use pthreads to prioritize a system thread.
 302 bool WorkerThread::InitPriority() {
 303   // This doesn't affect performance that much, and may not be too safe.
 304
 305   bool ret = BindToCpus(&cpu_mask_);
 306   if (!ret)
 307     logprintf(11, "Log: Bind to %s failed.\n",
 308               cpuset_format(&cpu_mask_).c_str());
 309
 310   logprintf(11, "Log: Thread %d running on core ID %d mask %s (%s).\n",
 311             thread_num_, sched_getcpu(),
 312             CurrentCpusFormat().c_str(),
 313             cpuset_format(&cpu_mask_).c_str());
 314 #if 0
 315   if (priority_ == High) {
 316     sched_param param;
 317     param.sched_priority = 1;
 318     // Set the priority; others are unchanged.
 319     logprintf(0, "Log: Changing priority to SCHED_FIFO %d\n",
 320               param.sched_priority);
 321     if (sched_setscheduler(0, SCHED_FIFO, &param)) {
 322       char buf[256];
 323       sat_strerror(errno, buf, sizeof(buf));
 324       logprintf(0, "Process Error: sched_setscheduler "
 325                    "failed - error %d %s\n",
 326                 errno, buf);
 327     }
 328   }
 329 #endif
 330   return true;
 331 }
 332
 333 // Use pthreads to create a system thread.
 334 int WorkerThread::SpawnThread() {
 335   // Create the new thread.
 336   int result = pthread_create(&thread_, NULL, thread_spawner_, this);
 337   if (result) {
 338     char buf[256];
 339     sat_strerror(result, buf, sizeof(buf));
 340     logprintf(0, "Process Error: pthread_create "
 341                   "failed - error %d %s\n", result,
 342               buf);
 343     status_ = false;
 344     return false;
 345   }
 346
 347   // 0 is pthreads success.
 348   return true;
 349 }
 350
 351 // Kill the worker thread with SIGINT.
 352 bool WorkerThread::KillThread() {
 353   return (pthread_kill(thread_, SIGINT) == 0);
 354 }
 355
 356 // Block until thread has exited.
 357 bool WorkerThread::JoinThread() {
 358   int result = pthread_join(thread_, NULL);
 359
 360   if (result) {
 361     logprintf(0, "Process Error: pthread_join failed - error %d\n", result);
 362     status_ = false;
 363   }
 364
 365   // 0 is pthreads success.
 366   return (!result);
 367 }
 368
 369
// Body of the worker's system thread (called from ThreadSpawnerGeneric).
// Binds/prioritizes the thread, runs Work() bracketed by the thread timer,
// and finally removes the thread from the shared worker status.
void WorkerThread::StartRoutine() {
  InitPriority();
  StartThreadTimer();
  Work();
  StopThreadTimer();
  // Must come last: this thread no longer takes part in pause handling
  // once it has removed itself.
  worker_status_->RemoveSelf();
}
 377
 378
 379 // Thread work loop. Execute until marked finished.
 380 bool WorkerThread::Work() {
 381   do {
 382     logprintf(9, "Log: ...\n");
 383     // Sleep for 1 second.
 384     sat_sleep(1);
 385   } while (IsReadyToRun());
 386
 387   return false;
 388 }
 389
 390
 391 // Returns CPU mask of CPUs available to this process,
 392 // Conceptually, each bit represents a logical CPU, ie:
 393 //   mask = 3  (11b):   cpu0, 1
 394 //   mask = 13 (1101b): cpu0, 2, 3
 395 bool WorkerThread::AvailableCpus(cpu_set_t *cpuset) {
 396   CPU_ZERO(cpuset);
 397 #ifdef HAVE_SCHED_GETAFFINITY
 398   return sched_getaffinity(getppid(), sizeof(*cpuset), cpuset) == 0;
 399 #else
 400   return 0;
 401 #endif
 402 }
 403
 404
 405 // Returns CPU mask of CPUs this thread is bound to,
 406 // Conceptually, each bit represents a logical CPU, ie:
 407 //   mask = 3  (11b):   cpu0, 1
 408 //   mask = 13 (1101b): cpu0, 2, 3
 409 bool WorkerThread::CurrentCpus(cpu_set_t *cpuset) {
 410   CPU_ZERO(cpuset);
 411 #ifdef HAVE_SCHED_GETAFFINITY
 412   return sched_getaffinity(0, sizeof(*cpuset), cpuset) == 0;
 413 #else
 414   return 0;
 415 #endif
 416 }
 417
 418
 419 // Bind worker thread to specified CPU(s)
 420 //   Args:
 421 //     thread_mask: cpu_set_t representing CPUs, ie
 422 //                  mask = 1  (01b):   cpu0
 423 //                  mask = 3  (11b):   cpu0, 1
 424 //                  mask = 13 (1101b): cpu0, 2, 3
 425 //
 426 //   Returns true on success, false otherwise.
 427 bool WorkerThread::BindToCpus(const cpu_set_t *thread_mask) {
 428   cpu_set_t process_mask;
 429   AvailableCpus(&process_mask);
 430   if (cpuset_isequal(thread_mask, &process_mask))
 431     return true;
 432
 433   logprintf(11, "Log: available CPU mask - %s\n",
 434             cpuset_format(&process_mask).c_str());
 435   if (!cpuset_issubset(thread_mask, &process_mask)) {
 436     // Invalid cpu_mask, ie cpu not allocated to this process or doesn't exist.
 437     logprintf(0, "Log: requested CPUs %s not a subset of available %s\n",
 438               cpuset_format(thread_mask).c_str(),
 439               cpuset_format(&process_mask).c_str());
 440     return false;
 441   }
 442 #ifdef HAVE_SCHED_GETAFFINITY
 443   return (sched_setaffinity(gettid(), sizeof(*thread_mask), thread_mask) == 0);
 444 #else
 445   return 0;
 446 #endif
 447 }
 448
 449
 450 // A worker thread can yield itself to give up CPU until it's scheduled again.
 451 //   Returns true on success, false on error.
 452 bool WorkerThread::YieldSelf() {
 453   return (sched_yield() == 0);
 454 }
 455
 456
 457 // Fill this page with its pattern.
 458 bool WorkerThread::FillPage(struct page_entry *pe) {
 459   // Error check arguments.
 460   if (pe == 0) {
 461     logprintf(0, "Process Error: Fill Page entry null\n");
 462     return 0;
 463   }
 464
 465   // Mask is the bitmask of indexes used by the pattern.
 466   // It is the pattern size -1. Size is always a power of 2.
 467   uint64 *memwords = static_cast<uint64*>(pe->addr);
 468   int length = sat_->page_length();
 469
 470   if (tag_mode_) {
 471     // Select tag or data as appropriate.
 472     for (int i = 0; i < length / wordsize_; i++) {
 473       datacast_t data;
 474
 475       if ((i & 0x7) == 0) {
 476         data.l64 = addr_to_tag(&memwords[i]);
 477       } else {
 478         data.l32.l = pe->pattern->pattern(i << 1);
 479         data.l32.h = pe->pattern->pattern((i << 1) + 1);
 480       }
 481       memwords[i] = data.l64;
 482     }
 483   } else {
 484     // Just fill in untagged data directly.
 485     for (int i = 0; i < length / wordsize_; i++) {
 486       datacast_t data;
 487
 488       data.l32.l = pe->pattern->pattern(i << 1);
 489       data.l32.h = pe->pattern->pattern((i << 1) + 1);
 490       memwords[i] = data.l64;
 491     }
 492   }
 493
 494   return 1;
 495 }
 496
 497
 498 // Tell the thread how many pages to fill.
 499 void FillThread::SetFillPages(int64 num_pages_to_fill_init) {
 500   num_pages_to_fill_ = num_pages_to_fill_init;
 501 }
 502
 503 // Fill this page with a random pattern.
 504 bool FillThread::FillPageRandom(struct page_entry *pe) {
 505   // Error check arguments.
 506   if (pe == 0) {
 507     logprintf(0, "Process Error: Fill Page entry null\n");
 508     return 0;
 509   }
 510   if ((patternlist_ == 0) || (patternlist_->Size() == 0)) {
 511     logprintf(0, "Process Error: No data patterns available\n");
 512     return 0;
 513   }
 514
 515   // Choose a random pattern for this block.
 516   pe->pattern = patternlist_->GetRandomPattern();
 517   if (pe->pattern == 0) {
 518     logprintf(0, "Process Error: Null data pattern\n");
 519     return 0;
 520   }
 521
 522   // Actually fill the page.
 523   return FillPage(pe);
 524 }
 525
 526
 527 // Memory fill work loop. Execute until alloted pages filled.
 528 bool FillThread::Work() {
 529   bool result = true;
 530
 531   logprintf(9, "Log: Starting fill thread %d\n", thread_num_);
 532
 533   // We want to fill num_pages_to_fill pages, and
 534   // stop when we've filled that many.
 535   // We also want to capture early break
 536   struct page_entry pe;
 537   int64 loops = 0;
 538   while (IsReadyToRun() && (loops < num_pages_to_fill_)) {
 539     result = result && sat_->GetEmpty(&pe);
 540     if (!result) {
 541       logprintf(0, "Process Error: fill_thread failed to pop pages, "
 542                 "bailing\n");
 543       break;
 544     }
 545
 546     // Fill the page with pattern
 547     result = result && FillPageRandom(&pe);
 548     if (!result) break;
 549
 550     // Put the page back on the queue.
 551     result = result && sat_->PutValid(&pe);
 552     if (!result) {
 553       logprintf(0, "Process Error: fill_thread failed to push pages, "
 554                 "bailing\n");
 555       break;
 556     }
 557     loops++;
 558   }
 559
 560   // Fill in thread status.
 561   pages_copied_ = loops;
 562   status_ = result;
 563   logprintf(9, "Log: Completed %d: Fill thread. Status %d, %d pages filled\n",
 564             thread_num_, status_, pages_copied_);
 565   return result;
 566 }
 567
 568
 569 // Print error information about a data miscompare.
 570 void WorkerThread::ProcessError(struct ErrorRecord *error,
 571                                 int priority,
 572                                 const char *message) {
 573   char dimm_string[256] = "";
 574
 575   int core_id = sched_getcpu();
 576
 577   // Determine if this is a write or read error.
 578   os_->Flush(error->vaddr);
 579   error->reread = *(error->vaddr);
 580
 581   char *good = reinterpret_cast<char*>(&(error->expected));
 582   char *bad = reinterpret_cast<char*>(&(error->actual));
 583
 584   sat_assert(error->expected != error->actual);
 585   unsigned int offset = 0;
 586   for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
 587     if (good[offset] != bad[offset])
 588       break;
 589   }
 590
 591   error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;
 592
 593   // Find physical address if possible.
 594   error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
 595
 596   // Pretty print DIMM mapping if available.
 597   os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
 598
 599   // Report parseable error.
 600   if (priority < 5) {
 601     // Run miscompare error through diagnoser for logging and reporting.
 602     os_->error_diagnoser_->AddMiscompareError(dimm_string,
 603                                               reinterpret_cast<uint64>
 604                                               (error->vaddr), 1);
 605
 606     logprintf(priority,
 607               "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
 608               "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
 609               message,
 610               core_id,
 611               CurrentCpusFormat().c_str(),
 612               error->vaddr,
 613               error->paddr,
 614               dimm_string,
 615               error->actual,
 616               error->reread,
 617               error->expected);
 618   }
 619
 620
 621   // Overwrite incorrect data with correct data to prevent
 622   // future miscompares when this data is reused.
 623   *(error->vaddr) = error->expected;
 624   os_->Flush(error->vaddr);
 625 }
 626
 627
 628
 629 // Print error information about a data miscompare.
 630 void FileThread::ProcessError(struct ErrorRecord *error,
 631                               int priority,
 632                               const char *message) {
 633   char dimm_string[256] = "";
 634
 635   // Determine if this is a write or read error.
 636   os_->Flush(error->vaddr);
 637   error->reread = *(error->vaddr);
 638
 639   char *good = reinterpret_cast<char*>(&(error->expected));
 640   char *bad = reinterpret_cast<char*>(&(error->actual));
 641
 642   sat_assert(error->expected != error->actual);
 643   unsigned int offset = 0;
 644   for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
 645     if (good[offset] != bad[offset])
 646       break;
 647   }
 648
 649   error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;
 650
 651   // Find physical address if possible.
 652   error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
 653
 654   // Pretty print DIMM mapping if available.
 655   os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
 656
 657   // If crc_page_ is valid, ie checking content read back from file,
 658   // track src/dst memory addresses. Otherwise catagorize as general
 659   // mememory miscompare for CRC checking everywhere else.
 660   if (crc_page_ != -1) {
 661     int miscompare_byteoffset = static_cast<char*>(error->vbyteaddr) -
 662                                 static_cast<char*>(page_recs_[crc_page_].dst);
 663     os_->error_diagnoser_->AddHDDMiscompareError(devicename_,
 664                                                  crc_page_,
 665                                                  miscompare_byteoffset,
 666                                                  page_recs_[crc_page_].src,
 667                                                  page_recs_[crc_page_].dst);
 668   } else {
 669     os_->error_diagnoser_->AddMiscompareError(dimm_string,
 670                                               reinterpret_cast<uint64>
 671                                               (error->vaddr), 1);
 672   }
 673
 674   logprintf(priority,
 675             "%s: miscompare on %s at %p(0x%llx:%s): read:0x%016llx, "
 676             "reread:0x%016llx expected:0x%016llx\n",
 677             message,
 678             devicename_.c_str(),
 679             error->vaddr,
 680             error->paddr,
 681             dimm_string,
 682             error->actual,
 683             error->reread,
 684             error->expected);
 685
 686   // Overwrite incorrect data with correct data to prevent
 687   // future miscompares when this data is reused.
 688   *(error->vaddr) = error->expected;
 689   os_->Flush(error->vaddr);
 690 }
 691
 692
 693 // Do a word by word result check of a region.
 694 // Print errors on mismatches.
 695 int WorkerThread::CheckRegion(void *addr,
 696                               class Pattern *pattern,
 697                               int64 length,
 698                               int offset,
 699                               int64 pattern_offset) {
 700   uint64 *memblock = static_cast<uint64*>(addr);
 701   const int kErrorLimit = 128;
 702   int errors = 0;
 703   int overflowerrors = 0;  // Count of overflowed errors.
 704   bool page_error = false;
 705   string errormessage("Hardware Error");
 706   struct ErrorRecord
 707     recorded[kErrorLimit];  // Queued errors for later printing.
 708
 709   // For each word in the data region.
 710   for (int i = 0; i < length / wordsize_; i++) {
 711     uint64 actual = memblock[i];
 712     uint64 expected;
 713
 714     // Determine the value that should be there.
 715     datacast_t data;
 716     int index = 2 * i + pattern_offset;
 717     data.l32.l = pattern->pattern(index);
 718     data.l32.h = pattern->pattern(index + 1);
 719     expected = data.l64;
 720     // Check tags if necessary.
 721     if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
 722       expected = addr_to_tag(&memblock[i]);
 723     }
 724
 725
 726     // If the value is incorrect, save an error record for later printing.
 727     if (actual != expected) {
 728       if (errors < kErrorLimit) {
 729         recorded[errors].actual = actual;
 730         recorded[errors].expected = expected;
 731         recorded[errors].vaddr = &memblock[i];
 732         errors++;
 733       } else {
 734         page_error = true;
 735         // If we have overflowed the error queue, just print the errors now.
 736         logprintf(10, "Log: Error record overflow, too many miscompares!\n");
 737         errormessage = "Page Error";
 738         break;
 739       }
 740     }
 741   }
 742
 743   // Find if this is a whole block corruption.
 744   if (page_error && !tag_mode_) {
 745     int patsize = patternlist_->Size();
 746     for (int pat = 0; pat < patsize; pat++) {
 747       class Pattern *altpattern = patternlist_->GetPattern(pat);
 748       const int kGood = 0;
 749       const int kBad = 1;
 750       const int kGoodAgain = 2;
 751       const int kNoMatch = 3;
 752       int state = kGood;
 753       unsigned int badstart = 0;
 754       unsigned int badend = 0;
 755
 756       // Don't match against ourself!
 757       if (pattern == altpattern)
 758         continue;
 759
 760       for (int i = 0; i < length / wordsize_; i++) {
 761         uint64 actual = memblock[i];
 762         datacast_t expected;
 763         datacast_t possible;
 764
 765         // Determine the value that should be there.
 766         int index = 2 * i + pattern_offset;
 767
 768         expected.l32.l = pattern->pattern(index);
 769         expected.l32.h = pattern->pattern(index + 1);
 770
 771         possible.l32.l = pattern->pattern(index);
 772         possible.l32.h = pattern->pattern(index + 1);
 773
 774         if (state == kGood) {
 775           if (actual == expected.l64) {
 776             continue;
 777           } else if (actual == possible.l64) {
 778             badstart = i;
 779             badend = i;
 780             state = kBad;
 781             continue;
 782           } else {
 783             state = kNoMatch;
 784             break;
 785           }
 786         } else if (state == kBad) {
 787           if (actual == possible.l64) {
 788             badend = i;
 789             continue;
 790           } else if (actual == expected.l64) {
 791             state = kGoodAgain;
 792             continue;
 793           } else {
 794             state = kNoMatch;
 795             break;
 796           }
 797         } else if (state == kGoodAgain) {
 798           if (actual == expected.l64) {
 799             continue;
 800           } else {
 801             state = kNoMatch;
 802             break;
 803           }
 804         }
 805       }
 806
 807       if ((state == kGoodAgain) || (state == kBad)) {
 808         unsigned int blockerrors = badend - badstart + 1;
 809         errormessage = "Block Error";
 810         // It's okay for the 1st entry to be corrected multiple times,
 811         // it will simply be reported twice. Once here and once below
 812         // when processing the error queue.
 813         ProcessError(&recorded[0], 0, errormessage.c_str());
 814         logprintf(0, "Block Error: (%p) pattern %s instead of %s, "
 815                   "%d bytes from offset 0x%x to 0x%x\n",
 816                   &memblock[badstart],
 817                   altpattern->name(), pattern->name(),
 818                   blockerrors * wordsize_,
 819                   offset + badstart * wordsize_,
 820                   offset + badend * wordsize_);
 821       }
 822     }
 823   }
 824
 825
 826   // Process error queue after all errors have been recorded.
 827   for (int err = 0; err < errors; err++) {
 828     int priority = 5;
 829     if (errorcount_ + err < 30)
 830       priority = 0;  // Bump up the priority for the first few errors.
 831     ProcessError(&recorded[err], priority, errormessage.c_str());
 832   }
 833
 834   if (page_error) {
 835     // For each word in the data region.
 836     for (int i = 0; i < length / wordsize_; i++) {
 837       uint64 actual = memblock[i];
 838       uint64 expected;
 839       datacast_t data;
 840       // Determine the value that should be there.
 841       int index = 2 * i + pattern_offset;
 842
 843       data.l32.l = pattern->pattern(index);
 844       data.l32.h = pattern->pattern(index + 1);
 845       expected = data.l64;
 846
 847       // Check tags if necessary.
 848       if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
 849         expected = addr_to_tag(&memblock[i]);
 850       }
 851
 852       // If the value is incorrect, save an error record for later printing.
 853       if (actual != expected) {
 854         // If we have overflowed the error queue, print the errors now.
 855         struct ErrorRecord er;
 856         er.actual = actual;
 857         er.expected = expected;
 858         er.vaddr = &memblock[i];
 859
 860         // Do the error printout. This will take a long time and
 861         // likely change the machine state.
 862         ProcessError(&er, 12, errormessage.c_str());
 863         overflowerrors++;
 864       }
 865     }
 866   }
 867
 868   // Keep track of observed errors.
 869   errorcount_ += errors + overflowerrors;
 870   return errors + overflowerrors;
 871 }
 872
// Returns the amount of data this thread has copied: pages copied times the
// page length, divided by kMegabyte.
// NOTE(review): if all three operands are integer types the division
// truncates before the conversion to float - confirm against the
// declarations.
float WorkerThread::GetCopiedData() {
  return pages_copied_ * sat_->page_length() / kMegabyte;
}
 876
 877 // Calculate the CRC of a region.
 878 // Result check if the CRC mismatches.
 879 int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
 880   const int blocksize = 4096;
 881   const int blockwords = blocksize / wordsize_;
 882   int errors = 0;
 883
 884   const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
 885   uint64 *memblock = static_cast<uint64*>(srcpe->addr);
 886   int blocks = sat_->page_length() / blocksize;
 887   for (int currentblock = 0; currentblock < blocks; currentblock++) {
 888     uint64 *memslice = memblock + currentblock * blockwords;
 889
 890     AdlerChecksum crc;
 891     if (tag_mode_) {
 892       AdlerAddrCrcC(memslice, blocksize, &crc, srcpe);
 893     } else {
 894       CalculateAdlerChecksum(memslice, blocksize, &crc);
 895     }
 896
 897     // If the CRC does not match, we'd better look closer.
 898     if (!crc.Equals(*expectedcrc)) {
 899       logprintf(11, "Log: CrcCheckPage Falling through to slow compare, "
 900                 "CRC mismatch %s != %s\n",
 901                 crc.ToHexString().c_str(),
 902                 expectedcrc->ToHexString().c_str());
 903       int errorcount = CheckRegion(memslice,
 904                                    srcpe->pattern,
 905                                    blocksize,
 906                                    currentblock * blocksize, 0);
 907       if (errorcount == 0) {
 908         logprintf(0, "Log: CrcCheckPage CRC mismatch %s != %s, "
 909                      "but no miscompares found.\n",
 910                   crc.ToHexString().c_str(),
 911                   expectedcrc->ToHexString().c_str());
 912       }
 913       errors += errorcount;
 914     }
 915   }
 916
 917   // For odd length transfers, we should never hit this.
 918   int leftovers = sat_->page_length() % blocksize;
 919   if (leftovers) {
 920     uint64 *memslice = memblock + blocks * blockwords;
 921     errors += CheckRegion(memslice,
 922                           srcpe->pattern,
 923                           leftovers,
 924                           blocks * blocksize, 0);
 925   }
 926   return errors;
 927 }
 928
 929
 930 // Print error information about a data miscompare.
 931 void WorkerThread::ProcessTagError(struct ErrorRecord *error,
 932                                    int priority,
 933                                    const char *message) {
 934   char dimm_string[256] = "";
 935   char tag_dimm_string[256] = "";
 936   bool read_error = false;
 937
 938   int core_id = sched_getcpu();
 939
 940   // Determine if this is a write or read error.
 941   os_->Flush(error->vaddr);
 942   error->reread = *(error->vaddr);
 943
 944   // Distinguish read and write errors.
 945   if (error->actual != error->reread) {
 946     read_error = true;
 947   }
 948
 949   sat_assert(error->expected != error->actual);
 950
 951   error->vbyteaddr = reinterpret_cast<char*>(error->vaddr);
 952
 953   // Find physical address if possible.
 954   error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
 955   error->tagpaddr = os_->VirtualToPhysical(error->tagvaddr);
 956
 957   // Pretty print DIMM mapping if available.
 958   os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
 959   // Pretty print DIMM mapping if available.
 960   os_->FindDimm(error->tagpaddr, tag_dimm_string, sizeof(tag_dimm_string));
 961
 962   // Report parseable error.
 963   if (priority < 5) {
 964     logprintf(priority,
 965               "%s: Tag from %p(0x%llx:%s) (%s) "
 966               "miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
 967               "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
 968               message,
 969               error->tagvaddr, error->tagpaddr,
 970               tag_dimm_string,
 971               read_error ? "read error" : "write error",
 972               core_id,
 973               CurrentCpusFormat().c_str(),
 974               error->vaddr,
 975               error->paddr,
 976               dimm_string,
 977               error->actual,
 978               error->reread,
 979               error->expected);
 980   }
 981
 982   errorcount_ += 1;
 983
 984   // Overwrite incorrect data with correct data to prevent
 985   // future miscompares when this data is reused.
 986   *(error->vaddr) = error->expected;
 987   os_->Flush(error->vaddr);
 988 }
 989
 990
 991 // Print out and log a tag error.
 992 bool WorkerThread::ReportTagError(
 993     uint64 *mem64,
 994     uint64 actual,
 995     uint64 tag) {
 996   struct ErrorRecord er;
 997   er.actual = actual;
 998
 999   er.expected = tag;
1000   er.vaddr = mem64;
1001
1002   // Generate vaddr from tag.
1003   er.tagvaddr = reinterpret_cast<uint64*>(actual);
1004
1005   ProcessTagError(&er, 0, "Hardware Error");
1006   return true;
1007 }
1008
1009 // C implementation of Adler memory copy, with memory tagging.
1010 bool WorkerThread::AdlerAddrMemcpyC(uint64 *dstmem64,
1011                                     uint64 *srcmem64,
1012                                     unsigned int size_in_bytes,
1013                                     AdlerChecksum *checksum,
1014                                     struct page_entry *pe) {
1015   // Use this data wrapper to access memory with 64bit read/write.
1016   datacast_t data;
1017   datacast_t dstdata;
1018   unsigned int count = size_in_bytes / sizeof(data);
1019
1020   if (count > ((1U) << 19)) {
1021     // Size is too large, must be strictly less than 512 KB.
1022     return false;
1023   }
1024
1025   uint64 a1 = 1;
1026   uint64 a2 = 1;
1027   uint64 b1 = 0;
1028   uint64 b2 = 0;
1029
1030   class Pattern *pattern = pe->pattern;
1031
1032   unsigned int i = 0;
1033   while (i < count) {
1034     // Process 64 bits at a time.
1035     if ((i & 0x7) == 0) {
1036       data.l64 = srcmem64[i];
1037       dstdata.l64 = dstmem64[i];
1038       uint64 src_tag = addr_to_tag(&srcmem64[i]);
1039       uint64 dst_tag = addr_to_tag(&dstmem64[i]);
1040       // Detect if tags have been corrupted.
1041       if (data.l64 != src_tag)
1042         ReportTagError(&srcmem64[i], data.l64, src_tag);
1043       if (dstdata.l64 != dst_tag)
1044         ReportTagError(&dstmem64[i], dstdata.l64, dst_tag);
1045
1046       data.l32.l = pattern->pattern(i << 1);
1047       data.l32.h = pattern->pattern((i << 1) + 1);
1048       a1 = a1 + data.l32.l;
1049       b1 = b1 + a1;
1050       a1 = a1 + data.l32.h;
1051       b1 = b1 + a1;
1052
1053       data.l64  = dst_tag;
1054       dstmem64[i] = data.l64;
1055
1056     } else {
1057       data.l64 = srcmem64[i];
1058       a1 = a1 + data.l32.l;
1059       b1 = b1 + a1;
1060       a1 = a1 + data.l32.h;
1061       b1 = b1 + a1;
1062       dstmem64[i] = data.l64;
1063     }
1064     i++;
1065
1066     data.l64 = srcmem64[i];
1067     a2 = a2 + data.l32.l;
1068     b2 = b2 + a2;
1069     a2 = a2 + data.l32.h;
1070     b2 = b2 + a2;
1071     dstmem64[i] = data.l64;
1072     i++;
1073   }
1074   checksum->Set(a1, a2, b1, b2);
1075   return true;
1076 }
1077
1078 // x86_64 SSE2 assembly implementation of Adler memory copy, with address
1079 // tagging added as a second step. This is useful for debugging failures
1080 // that only occur when SSE / nontemporal writes are used.
1081 bool WorkerThread::AdlerAddrMemcpyWarm(uint64 *dstmem64,
1082                                        uint64 *srcmem64,
1083                                        unsigned int size_in_bytes,
1084                                        AdlerChecksum *checksum,
1085                                        struct page_entry *pe) {
1086   // Do ASM copy, ignore checksum.
1087   AdlerChecksum ignored_checksum;
1088   os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum);
1089
1090   // Force cache flush of both the source and destination addresses.
1091   //  length - length of block to flush in cachelines.
1092   //  mem_increment - number of dstmem/srcmem values per cacheline.
1093   int length = size_in_bytes / kCacheLineSize;
1094   int mem_increment = kCacheLineSize / sizeof(*dstmem64);
1095   OsLayer::FastFlushSync();
1096   for (int i = 0; i < length; ++i) {
1097     OsLayer::FastFlushHint(dstmem64 + (i * mem_increment));
1098     OsLayer::FastFlushHint(srcmem64 + (i * mem_increment));
1099   }
1100   OsLayer::FastFlushSync();
1101
1102   // Check results.
1103   AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe);
1104   // Patch up address tags.
1105   TagAddrC(dstmem64, size_in_bytes);
1106   return true;
1107 }
1108
1109 // Retag pages..
1110 bool WorkerThread::TagAddrC(uint64 *memwords,
1111                             unsigned int size_in_bytes) {
1112   // Mask is the bitmask of indexes used by the pattern.
1113   // It is the pattern size -1. Size is always a power of 2.
1114
1115   // Select tag or data as appropriate.
1116   int length = size_in_bytes / wordsize_;
1117   for (int i = 0; i < length; i += 8) {
1118     datacast_t data;
1119     data.l64 = addr_to_tag(&memwords[i]);
1120     memwords[i] = data.l64;
1121   }
1122   return true;
1123 }
1124
1125 // C implementation of Adler memory crc.
1126 bool WorkerThread::AdlerAddrCrcC(uint64 *srcmem64,
1127                                  unsigned int size_in_bytes,
1128                                  AdlerChecksum *checksum,
1129                                  struct page_entry *pe) {
1130   // Use this data wrapper to access memory with 64bit read/write.
1131   datacast_t data;
1132   unsigned int count = size_in_bytes / sizeof(data);
1133
1134   if (count > ((1U) << 19)) {
1135     // Size is too large, must be strictly less than 512 KB.
1136     return false;
1137   }
1138
1139   uint64 a1 = 1;
1140   uint64 a2 = 1;
1141   uint64 b1 = 0;
1142   uint64 b2 = 0;
1143
1144   class Pattern *pattern = pe->pattern;
1145
1146   unsigned int i = 0;
1147   while (i < count) {
1148     // Process 64 bits at a time.
1149     if ((i & 0x7) == 0) {
1150       data.l64 = srcmem64[i];
1151       uint64 src_tag = addr_to_tag(&srcmem64[i]);
1152       // Check that tags match expected.
1153       if (data.l64 != src_tag)
1154         ReportTagError(&srcmem64[i], data.l64, src_tag);
1155
1156       data.l32.l = pattern->pattern(i << 1);
1157       data.l32.h = pattern->pattern((i << 1) + 1);
1158       a1 = a1 + data.l32.l;
1159       b1 = b1 + a1;
1160       a1 = a1 + data.l32.h;
1161       b1 = b1 + a1;
1162     } else {
1163       data.l64 = srcmem64[i];
1164       a1 = a1 + data.l32.l;
1165       b1 = b1 + a1;
1166       a1 = a1 + data.l32.h;
1167       b1 = b1 + a1;
1168     }
1169     i++;
1170
1171     data.l64 = srcmem64[i];
1172     a2 = a2 + data.l32.l;
1173     b2 = b2 + a2;
1174     a2 = a2 + data.l32.h;
1175     b2 = b2 + a2;
1176     i++;
1177   }
1178   checksum->Set(a1, a2, b1, b2);
1179   return true;
1180 }
1181
1182 // Copy a block of memory quickly, while keeping a CRC of the data.
1183 // Result check if the CRC mismatches.
1184 int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
1185                               struct page_entry *srcpe) {
1186   int errors = 0;
1187   const int blocksize = 4096;
1188   const int blockwords = blocksize / wordsize_;
1189   int blocks = sat_->page_length() / blocksize;
1190
1191   // Base addresses for memory copy
1192   uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
1193   uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
1194   // Remember the expected CRC
1195   const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
1196
1197   for (int currentblock = 0; currentblock < blocks; currentblock++) {
1198     uint64 *targetmem = targetmembase + currentblock * blockwords;
1199     uint64 *sourcemem = sourcemembase + currentblock * blockwords;
1200
1201     AdlerChecksum crc;
1202     if (tag_mode_) {
1203       AdlerAddrMemcpyC(targetmem, sourcemem, blocksize, &crc, srcpe);
1204     } else {
1205       AdlerMemcpyC(targetmem, sourcemem, blocksize, &crc);
1206     }
1207
1208     // Investigate miscompares.
1209     if (!crc.Equals(*expectedcrc)) {
1210       logprintf(11, "Log: CrcCopyPage Falling through to slow compare, "
1211                 "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
1212                 expectedcrc->ToHexString().c_str());
1213       int errorcount = CheckRegion(sourcemem,
1214                                    srcpe->pattern,
1215                                    blocksize,
1216                                    currentblock * blocksize, 0);
1217       if (errorcount == 0) {
1218         logprintf(0, "Log: CrcCopyPage CRC mismatch %s != %s, "
1219                      "but no miscompares found. Retrying with fresh data.\n",
1220                   crc.ToHexString().c_str(),
1221                   expectedcrc->ToHexString().c_str());
1222         if (!tag_mode_) {
1223           // Copy the data originally read from this region back again.
1224           // This data should have any corruption read originally while
1225           // calculating the CRC.
1226           memcpy(sourcemem, targetmem, blocksize);
1227           errorcount = CheckRegion(sourcemem,
1228                                    srcpe->pattern,
1229                                    blocksize,
1230                                    currentblock * blocksize, 0);
1231           if (errorcount == 0) {
1232             int core_id = sched_getcpu();
1233             logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage "
1234                          "CRC mismatch %s != %s, "
1235                          "but no miscompares found on second pass.\n",
1236                       core_id, CurrentCpusFormat().c_str(),
1237                       crc.ToHexString().c_str(),
1238                       expectedcrc->ToHexString().c_str());
1239             struct ErrorRecord er;
1240             er.actual = sourcemem[0];
1241             er.expected = 0x0;
1242             er.vaddr = sourcemem;
1243             ProcessError(&er, 0, "Hardware Error");
1244           }
1245         }
1246       }
1247       errors += errorcount;
1248     }
1249   }
1250
1251   // For odd length transfers, we should never hit this.
1252   int leftovers = sat_->page_length() % blocksize;
1253   if (leftovers) {
1254     uint64 *targetmem = targetmembase + blocks * blockwords;
1255     uint64 *sourcemem = sourcemembase + blocks * blockwords;
1256
1257     errors += CheckRegion(sourcemem,
1258                           srcpe->pattern,
1259                           leftovers,
1260                           blocks * blocksize, 0);
1261     int leftoverwords = leftovers / wordsize_;
1262     for (int i = 0; i < leftoverwords; i++) {
1263       targetmem[i] = sourcemem[i];
1264     }
1265   }
1266
1267   // Update pattern reference to reflect new contents.
1268   dstpe->pattern = srcpe->pattern;
1269
1270   // Clean clean clean the errors away.
1271   if (errors) {
1272     // TODO(nsanders): Maybe we should patch rather than fill? Filling may
1273     // cause bad data to be propogated across the page.
1274     FillPage(dstpe);
1275   }
1276   return errors;
1277 }
1278
1279
1280
1281 // Invert a block of memory quickly, traversing downwards.
1282 int InvertThread::InvertPageDown(struct page_entry *srcpe) {
1283   const int blocksize = 4096;
1284   const int blockwords = blocksize / wordsize_;
1285   int blocks = sat_->page_length() / blocksize;
1286
1287   // Base addresses for memory copy
1288   unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);
1289
1290   for (int currentblock = blocks-1; currentblock >= 0; currentblock--) {
1291     unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
1292     for (int i = blockwords - 32; i >= 0; i -= 32) {
1293       for (int index = i + 31; index >= i; --index) {
1294         unsigned int actual = sourcemem[index];
1295         sourcemem[index] = ~actual;
1296       }
1297       OsLayer::FastFlush(&sourcemem[i]);
1298     }
1299   }
1300
1301   return 0;
1302 }
1303
1304 // Invert a block of memory, traversing upwards.
1305 int InvertThread::InvertPageUp(struct page_entry *srcpe) {
1306   const int blocksize = 4096;
1307   const int blockwords = blocksize / wordsize_;
1308   int blocks = sat_->page_length() / blocksize;
1309
1310   // Base addresses for memory copy
1311   unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);
1312
1313   for (int currentblock = 0; currentblock < blocks; currentblock++) {
1314     unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
1315     for (int i = 0; i < blockwords; i += 32) {
1316       for (int index = i; index <= i + 31; ++index) {
1317         unsigned int actual = sourcemem[index];
1318         sourcemem[index] = ~actual;
1319       }
1320       OsLayer::FastFlush(&sourcemem[i]);
1321     }
1322   }
1323   return 0;
1324 }
1325
1326 // Copy a block of memory quickly, while keeping a CRC of the data.
1327 // Result check if the CRC mismatches. Warm the CPU while running
1328 int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
1329                                   struct page_entry *srcpe) {
1330   int errors = 0;
1331   const int blocksize = 4096;
1332   const int blockwords = blocksize / wordsize_;
1333   int blocks = sat_->page_length() / blocksize;
1334
1335   // Base addresses for memory copy
1336   uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
1337   uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
1338   // Remember the expected CRC
1339   const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
1340
1341   for (int currentblock = 0; currentblock < blocks; currentblock++) {
1342     uint64 *targetmem = targetmembase + currentblock * blockwords;
1343     uint64 *sourcemem = sourcemembase + currentblock * blockwords;
1344
1345     AdlerChecksum crc;
1346     if (tag_mode_) {
1347       AdlerAddrMemcpyWarm(targetmem, sourcemem, blocksize, &crc, srcpe);
1348     } else {
1349       os_->AdlerMemcpyWarm(targetmem, sourcemem, blocksize, &crc);
1350     }
1351
1352     // Investigate miscompares.
1353     if (!crc.Equals(*expectedcrc)) {
1354       logprintf(11, "Log: CrcWarmCopyPage Falling through to slow compare, "
1355                 "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
1356                 expectedcrc->ToHexString().c_str());
1357       int errorcount = CheckRegion(sourcemem,
1358                                    srcpe->pattern,
1359                                    blocksize,
1360                                    currentblock * blocksize, 0);
1361       if (errorcount == 0) {
1362         logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != actual: %s, "
1363                      "but no miscompares found. Retrying with fresh data.\n",
1364                   expectedcrc->ToHexString().c_str(),
1365                   crc.ToHexString().c_str() );
1366         if (!tag_mode_) {
1367           // Copy the data originally read from this region back again.
1368           // This data should have any corruption read originally while
1369           // calculating the CRC.
1370           memcpy(sourcemem, targetmem, blocksize);
1371           errorcount = CheckRegion(sourcemem,
1372                                    srcpe->pattern,
1373                                    blocksize,
1374                                    currentblock * blocksize, 0);
1375           if (errorcount == 0) {
1376             int core_id = sched_getcpu();
1377             logprintf(0, "Process Error: CPU %d(0x%s) CrciWarmCopyPage "
1378                          "CRC mismatch %s != %s, "
1379                          "but no miscompares found on second pass.\n",
1380                       core_id, CurrentCpusFormat().c_str(),
1381                       crc.ToHexString().c_str(),
1382                       expectedcrc->ToHexString().c_str());
1383             struct ErrorRecord er;
1384             er.actual = sourcemem[0];
1385             er.expected = 0xbad;
1386             er.vaddr = sourcemem;
1387             ProcessError(&er, 0, "Hardware Error");
1388           }
1389         }
1390       }
1391       errors += errorcount;
1392     }
1393   }
1394
1395   // For odd length transfers, we should never hit this.
1396   int leftovers = sat_->page_length() % blocksize;
1397   if (leftovers) {
1398     uint64 *targetmem = targetmembase + blocks * blockwords;
1399     uint64 *sourcemem = sourcemembase + blocks * blockwords;
1400
1401     errors += CheckRegion(sourcemem,
1402                           srcpe->pattern,
1403                           leftovers,
1404                           blocks * blocksize, 0);
1405     int leftoverwords = leftovers / wordsize_;
1406     for (int i = 0; i < leftoverwords; i++) {
1407       targetmem[i] = sourcemem[i];
1408     }
1409   }
1410
1411   // Update pattern reference to reflect new contents.
1412   dstpe->pattern = srcpe->pattern;
1413
1414   // Clean clean clean the errors away.
1415   if (errors) {
1416     // TODO(nsanders): Maybe we should patch rather than fill? Filling may
1417     // cause bad data to be propogated across the page.
1418     FillPage(dstpe);
1419   }
1420   return errors;
1421 }
1422
1423
1424
1425 // Memory check work loop. Execute until done, then exhaust pages.
1426 bool CheckThread::Work() {
1427   struct page_entry pe;
1428   bool result = true;
1429   int64 loops = 0;
1430
1431   logprintf(9, "Log: Starting Check thread %d\n", thread_num_);
1432
1433   // We want to check all the pages, and
1434   // stop when there aren't any left.
1435   while (true) {
1436     result = result && sat_->GetValid(&pe);
1437     if (!result) {
1438       if (IsReadyToRunNoPause())
1439         logprintf(0, "Process Error: check_thread failed to pop pages, "
1440                   "bailing\n");
1441       else
1442         result = true;
1443       break;
1444     }
1445
1446     // Do the result check.
1447     CrcCheckPage(&pe);
1448
1449     // Push pages back on the valid queue if we are still going,
1450     // throw them out otherwise.
1451     if (IsReadyToRunNoPause())
1452       result = result && sat_->PutValid(&pe);
1453     else
1454       result = result && sat_->PutEmpty(&pe);
1455     if (!result) {
1456       logprintf(0, "Process Error: check_thread failed to push pages, "
1457                 "bailing\n");
1458       break;
1459     }
1460     loops++;
1461   }
1462
1463   pages_copied_ = loops;
1464   status_ = result;
1465   logprintf(9, "Log: Completed %d: Check thread. Status %d, %d pages checked\n",
1466             thread_num_, status_, pages_copied_);
1467   return result;
1468 }
1469
1470
1471 // Memory copy work loop. Execute until marked done.
1472 bool CopyThread::Work() {
1473   struct page_entry src;
1474   struct page_entry dst;
1475   bool result = true;
1476   int64 loops = 0;
1477
1478   logprintf(9, "Log: Starting copy thread %d: cpu %s, mem %x\n",
1479             thread_num_, cpuset_format(&cpu_mask_).c_str(), tag_);
1480
1481   while (IsReadyToRun()) {
1482     // Pop the needed pages.
1483     result = result && sat_->GetValid(&src, tag_);
1484     result = result && sat_->GetEmpty(&dst, tag_);
1485     if (!result) {
1486       logprintf(0, "Process Error: copy_thread failed to pop pages, "
1487                 "bailing\n");
1488       break;
1489     }
1490
1491     // Force errors for unittests.
1492     if (sat_->error_injection()) {
1493       if (loops == 8) {
1494         char *addr = reinterpret_cast<char*>(src.addr);
1495         int offset = random() % sat_->page_length();
1496         addr[offset] = 0xba;
1497       }
1498     }
1499
1500     // We can use memcpy, or CRC check while we copy.
1501     if (sat_->warm()) {
1502       CrcWarmCopyPage(&dst, &src);
1503     } else if (sat_->strict()) {
1504       CrcCopyPage(&dst, &src);
1505     } else {
1506       memcpy(dst.addr, src.addr, sat_->page_length());
1507       dst.pattern = src.pattern;
1508     }
1509
1510     result = result && sat_->PutValid(&dst);
1511     result = result && sat_->PutEmpty(&src);
1512
1513     // Copy worker-threads yield themselves at the end of each copy loop,
1514     // to avoid threads from preempting each other in the middle of the inner
1515     // copy-loop. Cooperations between Copy worker-threads results in less
1516     // unnecessary cache thrashing (which happens when context-switching in the
1517     // middle of the inner copy-loop).
1518     YieldSelf();
1519
1520     if (!result) {
1521       logprintf(0, "Process Error: copy_thread failed to push pages, "
1522                 "bailing\n");
1523       break;
1524     }
1525     loops++;
1526   }
1527
1528   pages_copied_ = loops;
1529   status_ = result;
1530   logprintf(9, "Log: Completed %d: Copy thread. Status %d, %d pages copied\n",
1531             thread_num_, status_, pages_copied_);
1532   return result;
1533 }
1534
1535 // Memory invert work loop. Execute until marked done.
1536 bool InvertThread::Work() {
1537   struct page_entry src;
1538   bool result = true;
1539   int64 loops = 0;
1540
1541   logprintf(9, "Log: Starting invert thread %d\n", thread_num_);
1542
1543   while (IsReadyToRun()) {
1544     // Pop the needed pages.
1545     result = result && sat_->GetValid(&src);
1546     if (!result) {
1547       logprintf(0, "Process Error: invert_thread failed to pop pages, "
1548                 "bailing\n");
1549       break;
1550     }
1551
1552     if (sat_->strict())
1553       CrcCheckPage(&src);
1554
1555     // For the same reason CopyThread yields itself (see YieldSelf comment
1556     // in CopyThread::Work(), InvertThread yields itself after each invert
1557     // operation to improve cooperation between different worker threads
1558     // stressing the memory/cache.
1559     InvertPageUp(&src);
1560     YieldSelf();
1561     InvertPageDown(&src);
1562     YieldSelf();
1563     InvertPageDown(&src);
1564     YieldSelf();
1565     InvertPageUp(&src);
1566     YieldSelf();
1567
1568     if (sat_->strict())
1569       CrcCheckPage(&src);
1570
1571     result = result && sat_->PutValid(&src);
1572     if (!result) {
1573       logprintf(0, "Process Error: invert_thread failed to push pages, "
1574                 "bailing\n");
1575       break;
1576     }
1577     loops++;
1578   }
1579
1580   pages_copied_ = loops * 2;
1581   status_ = result;
1582   logprintf(9, "Log: Completed %d: Copy thread. Status %d, %d pages copied\n",
1583             thread_num_, status_, pages_copied_);
1584   return result;
1585 }
1586
1587
1588 // Set file name to use for File IO.
1589 void FileThread::SetFile(const char *filename_init) {
1590   filename_ = filename_init;
1591   devicename_ = os_->FindFileDevice(filename_);
1592 }
1593
1594 // Open the file for access.
1595 bool FileThread::OpenFile(int *pfile) {
1596   int flags = O_RDWR | O_CREAT | O_SYNC;
1597   int fd = open(filename_.c_str(), flags | O_DIRECT, 0644);
1598   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
1599     fd = open(filename_.c_str(), flags, 0644);  // Try without O_DIRECT
1600     os_->ActivateFlushPageCache();  // Not using O_DIRECT fixed EINVAL
1601   }
1602   if (fd < 0) {
1603     logprintf(0, "Process Error: Failed to create file %s!!\n",
1604               filename_.c_str());
1605     pages_copied_ = 0;
1606     return false;
1607   }
1608   *pfile = fd;
1609   return true;
1610 }
1611
1612 // Close the file.
1613 bool FileThread::CloseFile(int fd) {
1614   close(fd);
1615   return true;
1616 }
1617
1618 // Check sector tagging.
1619 bool FileThread::SectorTagPage(struct page_entry *src, int block) {
1620   int page_length = sat_->page_length();
1621   struct FileThread::SectorTag *tag =
1622     (struct FileThread::SectorTag *)(src->addr);
1623
1624   // Tag each sector.
1625   unsigned char magic = ((0xba + thread_num_) & 0xff);
1626   for (int sec = 0; sec < page_length / 512; sec++) {
1627     tag[sec].magic = magic;
1628     tag[sec].block = block & 0xff;
1629     tag[sec].sector = sec & 0xff;
1630     tag[sec].pass = pass_ & 0xff;
1631   }
1632   return true;
1633 }
1634
1635 bool FileThread::WritePageToFile(int fd, struct page_entry *src) {
1636   int page_length = sat_->page_length();
1637   // Fill the file with our data.
1638   int64 size = write(fd, src->addr, page_length);
1639
1640   if (size != page_length) {
1641     os_->ErrorReport(devicename_.c_str(), "write-error", 1);
1642     errorcount_++;
1643     logprintf(0, "Block Error: file_thread failed to write, "
1644               "bailing\n");
1645     return false;
1646   }
1647   return true;
1648 }
1649
1650 // Write the data to the file.
1651 bool FileThread::WritePages(int fd) {
1652   int strict = sat_->strict();
1653
1654   // Start fresh at beginning of file for each batch of pages.
1655   lseek64(fd, 0, SEEK_SET);
1656   for (int i = 0; i < sat_->disk_pages(); i++) {
1657     struct page_entry src;
1658     if (!GetValidPage(&src))
1659       return false;
1660     // Save expected pattern.
1661     page_recs_[i].pattern = src.pattern;
1662     page_recs_[i].src = src.addr;
1663
1664     // Check data correctness.
1665     if (strict)
1666       CrcCheckPage(&src);
1667
1668     SectorTagPage(&src, i);
1669
1670     bool result = WritePageToFile(fd, &src);
1671
1672     if (!PutEmptyPage(&src))
1673       return false;
1674
1675     if (!result)
1676       return false;
1677   }
1678   return os_->FlushPageCache();  // If O_DIRECT worked, this will be a NOP.
1679 }
1680
1681 // Copy data from file into memory block.
1682 bool FileThread::ReadPageFromFile(int fd, struct page_entry *dst) {
1683   int page_length = sat_->page_length();
1684
1685   // Do the actual read.
1686   int64 size = read(fd, dst->addr, page_length);
1687   if (size != page_length) {
1688     os_->ErrorReport(devicename_.c_str(), "read-error", 1);
1689     logprintf(0, "Block Error: file_thread failed to read, "
1690               "bailing\n");
1691     errorcount_++;
1692     return false;
1693   }
1694   return true;
1695 }
1696
1697 // Check sector tagging.
1698 bool FileThread::SectorValidatePage(const struct PageRec &page,
1699                                     struct page_entry *dst, int block) {
1700   // Error injection.
1701   static int calls = 0;
1702   calls++;
1703
1704   // Do sector tag compare.
1705   int firstsector = -1;
1706   int lastsector = -1;
1707   bool badsector = false;
1708   int page_length = sat_->page_length();
1709
1710   // Cast data block into an array of tagged sectors.
1711   struct FileThread::SectorTag *tag =
1712   (struct FileThread::SectorTag *)(dst->addr);
1713
1714   sat_assert(sizeof(*tag) == 512);
1715
1716   // Error injection.
1717   if (sat_->error_injection()) {
1718     if (calls == 2) {
1719       for (int badsec = 8; badsec < 17; badsec++)
1720         tag[badsec].pass = 27;
1721     }
1722     if (calls == 18) {
1723       (static_cast<int32*>(dst->addr))[27] = 0xbadda7a;
1724     }
1725   }
1726
1727   // Check each sector for the correct tag we added earlier,
1728   // then revert the tag to the to normal data pattern.
1729   unsigned char magic = ((0xba + thread_num_) & 0xff);
1730   for (int sec = 0; sec < page_length / 512; sec++) {
1731     // Check magic tag.
1732     if ((tag[sec].magic != magic) ||
1733         (tag[sec].block != (block & 0xff)) ||
1734         (tag[sec].sector != (sec & 0xff)) ||
1735         (tag[sec].pass != (pass_ & 0xff))) {
1736       // Offset calculation for tag location.
1737       int offset = sec * sizeof(SectorTag);
1738       if (tag[sec].block != (block & 0xff))
1739         offset += 1 * sizeof(uint8);
1740       else if (tag[sec].sector != (sec & 0xff))
1741         offset += 2 * sizeof(uint8);
1742       else if (tag[sec].pass != (pass_ & 0xff))
1743         offset += 3 * sizeof(uint8);
1744
1745       // Run sector tag error through diagnoser for logging and reporting.
1746       errorcount_ += 1;
1747       os_->error_diagnoser_->AddHDDSectorTagError(devicename_, tag[sec].block,
1748                                                   offset,
1749                                                   tag[sec].sector,
1750                                                   page.src, page.dst);
1751
1752       logprintf(5, "Sector Error: Sector tag @ 0x%x, pass %d/%d. "
1753                 "sec %x/%x, block %d/%d, magic %x/%x, File: %s \n",
1754                 block * page_length + 512 * sec,
1755                 (pass_ & 0xff), (unsigned int)tag[sec].pass,
1756                 sec, (unsigned int)tag[sec].sector,
1757                 block, (unsigned int)tag[sec].block,
1758                 magic, (unsigned int)tag[sec].magic,
1759                 filename_.c_str());
1760
1761       // Keep track of first and last bad sector.
1762       if (firstsector == -1)
1763         firstsector = (block * page_length / 512) + sec;
1764       lastsector = (block * page_length / 512) + sec;
1765       badsector = true;
1766     }
1767     // Patch tag back to proper pattern.
1768     unsigned int *addr = (unsigned int *)(&tag[sec]);
1769     *addr = dst->pattern->pattern(512 * sec / sizeof(*addr));
1770   }
1771
1772   // If we found sector errors:
1773   if (badsector == true) {
1774     logprintf(5, "Log: file sector miscompare at offset %x-%x. File: %s\n",
1775               firstsector * 512,
1776               ((lastsector + 1) * 512) - 1,
1777               filename_.c_str());
1778
1779     // Either exit immediately, or patch the data up and continue.
1780     if (sat_->stop_on_error()) {
1781       exit(1);
1782     } else {
1783       // Patch up bad pages.
1784       for (int block = (firstsector * 512) / page_length;
1785           block <= (lastsector * 512) / page_length;
1786           block++) {
1787         unsigned int *memblock = static_cast<unsigned int *>(dst->addr);
1788         int length = page_length / wordsize_;
1789         for (int i = 0; i < length; i++) {
1790           memblock[i] = dst->pattern->pattern(i);
1791         }
1792       }
1793     }
1794   }
1795   return true;
1796 }
1797
1798 // Get memory for an incoming data transfer..
1799 bool FileThread::PagePrepare() {
1800   // We can only do direct IO to SAT pages if it is normal mem.
1801   page_io_ = os_->normal_mem();
1802
1803   // Init a local buffer if we need it.
1804   if (!page_io_) {
1805 #ifdef HAVE_POSIX_MEMALIGN
1806     int result = posix_memalign(&local_page_, 512, sat_->page_length());
1807 #else
1808     local_page_ = memalign(512, sat_->page_length());
1809     int result = (local_page_ == 0);
1810 #endif
1811     if (result) {
1812       logprintf(0, "Process Error: disk thread posix_memalign "
1813                    "returned %d (fail)\n",
1814                 result);
1815       status_ = false;
1816       return false;
1817     }
1818   }
1819   return true;
1820 }
1821
1822
1823 // Remove memory allocated for data transfer.
1824 bool FileThread::PageTeardown() {
1825   // Free a local buffer if we need to.
1826   if (!page_io_) {
1827     free(local_page_);
1828   }
1829   return true;
1830 }
1831
1832
1833
1834 // Get memory for an incoming data transfer..
1835 bool FileThread::GetEmptyPage(struct page_entry *dst) {
1836   if (page_io_) {
1837     if (!sat_->GetEmpty(dst))
1838       return false;
1839   } else {
1840     dst->addr = local_page_;
1841     dst->offset = 0;
1842     dst->pattern = 0;
1843   }
1844   return true;
1845 }
1846
1847 // Get memory for an outgoing data transfer..
1848 bool FileThread::GetValidPage(struct page_entry *src) {
1849   struct page_entry tmp;
1850   if (!sat_->GetValid(&tmp))
1851     return false;
1852   if (page_io_) {
1853     *src = tmp;
1854     return true;
1855   } else {
1856     src->addr = local_page_;
1857     src->offset = 0;
1858     CrcCopyPage(src, &tmp);
1859     if (!sat_->PutValid(&tmp))
1860       return false;
1861   }
1862   return true;
1863 }
1864
1865
1866 // Throw out a used empty page.
1867 bool FileThread::PutEmptyPage(struct page_entry *src) {
1868   if (page_io_) {
1869     if (!sat_->PutEmpty(src))
1870       return false;
1871   }
1872   return true;
1873 }
1874
1875 // Throw out a used, filled page.
1876 bool FileThread::PutValidPage(struct page_entry *src) {
1877   if (page_io_) {
1878     if (!sat_->PutValid(src))
1879       return false;
1880   }
1881   return true;
1882 }
1883
1884 // Copy data from file into memory blocks.
1885 bool FileThread::ReadPages(int fd) {
1886   int page_length = sat_->page_length();
1887   int strict = sat_->strict();
1888   bool result = true;
1889
1890   // Read our data back out of the file, into it's new location.
1891   lseek64(fd, 0, SEEK_SET);
1892   for (int i = 0; i < sat_->disk_pages(); i++) {
1893     struct page_entry dst;
1894     if (!GetEmptyPage(&dst))
1895       return false;
1896     // Retrieve expected pattern.
1897     dst.pattern = page_recs_[i].pattern;
1898     // Update page recordpage record.
1899     page_recs_[i].dst = dst.addr;
1900
1901     // Read from the file into destination page.
1902     if (!ReadPageFromFile(fd, &dst)) {
1903         PutEmptyPage(&dst);
1904         return false;
1905     }
1906
1907     SectorValidatePage(page_recs_[i], &dst, i);
1908
1909     // Ensure that the transfer ended up with correct data.
1910     if (strict) {
1911       // Record page index currently CRC checked.
1912       crc_page_ = i;
1913       int errors = CrcCheckPage(&dst);
1914       if (errors) {
1915         logprintf(5, "Log: file miscompare at block %d, "
1916                   "offset %x-%x. File: %s\n",
1917                   i, i * page_length, ((i + 1) * page_length) - 1,
1918                   filename_.c_str());
1919         result = false;
1920       }
1921       crc_page_ = -1;
1922       errorcount_ += errors;
1923     }
1924     if (!PutValidPage(&dst))
1925       return false;
1926   }
1927   return result;
1928 }
1929
1930 // File IO work loop. Execute until marked done.
1931 bool FileThread::Work() {
1932   bool result = true;
1933   int64 loops = 0;
1934
1935   logprintf(9, "Log: Starting file thread %d, file %s, device %s\n",
1936             thread_num_,
1937             filename_.c_str(),
1938             devicename_.c_str());
1939
1940   if (!PagePrepare()) {
1941     status_ = false;
1942     return false;
1943   }
1944
1945   // Open the data IO file.
1946   int fd = 0;
1947   if (!OpenFile(&fd)) {
1948     status_ = false;
1949     return false;
1950   }
1951
1952   pass_ = 0;
1953
1954   // Load patterns into page records.
1955   page_recs_ = new struct PageRec[sat_->disk_pages()];
1956   for (int i = 0; i < sat_->disk_pages(); i++) {
1957     page_recs_[i].pattern = new class Pattern();
1958   }
1959
1960   // Loop until done.
1961   while (IsReadyToRun()) {
1962     // Do the file write.
1963     if (!(result = result && WritePages(fd)))
1964       break;
1965
1966     // Do the file read.
1967     if (!(result = result && ReadPages(fd)))
1968       break;
1969
1970     loops++;
1971     pass_ = loops;
1972   }
1973
1974   pages_copied_ = loops * sat_->disk_pages();
1975
1976   // Clean up.
1977   CloseFile(fd);
1978   PageTeardown();
1979
1980   logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
1981             thread_num_, status_, pages_copied_);
1982   // Failure to read from device indicates hardware,
1983   // rather than procedural SW error.
1984   status_ = true;
1985   return true;
1986 }
1987
1988 bool NetworkThread::IsNetworkStopSet() {
1989   return !IsReadyToRunNoPause();
1990 }
1991
1992 bool NetworkSlaveThread::IsNetworkStopSet() {
1993   // This thread has no completion status.
1994   // It finishes whever there is no more data to be
1995   // passed back.
1996   return true;
1997 }
1998
1999 // Set ip name to use for Network IO.
2000 void NetworkThread::SetIP(const char *ipaddr_init) {
2001   strncpy(ipaddr_, ipaddr_init, 256);
2002 }
2003
2004 // Create a socket.
2005 // Return 0 on error.
2006 bool NetworkThread::CreateSocket(int *psocket) {
2007   int sock = socket(AF_INET, SOCK_STREAM, 0);
2008   if (sock == -1) {
2009     logprintf(0, "Process Error: Cannot open socket\n");
2010     pages_copied_ = 0;
2011     status_ = false;
2012     return false;
2013   }
2014   *psocket = sock;
2015   return true;
2016 }
2017
2018 // Close the socket.
2019 bool NetworkThread::CloseSocket(int sock) {
2020   close(sock);
2021   return true;
2022 }
2023
2024 // Initiate the tcp connection.
2025 bool NetworkThread::Connect(int sock) {
2026   struct sockaddr_in dest_addr;
2027   dest_addr.sin_family = AF_INET;
2028   dest_addr.sin_port = htons(kNetworkPort);
2029   memset(&(dest_addr.sin_zero), '\0', sizeof(dest_addr.sin_zero));
2030
2031   // Translate dot notation to u32.
2032   if (inet_aton(ipaddr_, &dest_addr.sin_addr) == 0) {
2033     logprintf(0, "Process Error: Cannot resolve %s\n", ipaddr_);
2034     pages_copied_ = 0;
2035     status_ = false;
2036     return false;
2037   }
2038
2039   if (-1 == connect(sock, reinterpret_cast<struct sockaddr *>(&dest_addr),
2040                     sizeof(struct sockaddr))) {
2041     logprintf(0, "Process Error: Cannot connect %s\n", ipaddr_);
2042     pages_copied_ = 0;
2043     status_ = false;
2044     return false;
2045   }
2046   return true;
2047 }
2048
2049 // Initiate the tcp connection.
2050 bool NetworkListenThread::Listen() {
2051   struct sockaddr_in sa;
2052
2053   memset(&(sa.sin_zero), '\0', sizeof(sa.sin_zero));
2054
2055   sa.sin_family = AF_INET;
2056   sa.sin_addr.s_addr = INADDR_ANY;
2057   sa.sin_port = htons(kNetworkPort);
2058
2059   if (-1 == bind(sock_, (struct sockaddr*)&sa, sizeof(struct sockaddr))) {
2060     char buf[256];
2061     sat_strerror(errno, buf, sizeof(buf));
2062     logprintf(0, "Process Error: Cannot bind socket: %s\n", buf);
2063     pages_copied_ = 0;
2064     status_ = false;
2065     return false;
2066   }
2067   listen(sock_, 3);
2068   return true;
2069 }
2070
2071 // Wait for a connection from a network traffic generation thread.
2072 bool NetworkListenThread::Wait() {
2073     fd_set rfds;
2074     struct timeval tv;
2075     int retval;
2076
2077     // Watch sock_ to see when it has input.
2078     FD_ZERO(&rfds);
2079     FD_SET(sock_, &rfds);
2080     // Wait up to five seconds.
2081     tv.tv_sec = 5;
2082     tv.tv_usec = 0;
2083
2084     retval = select(sock_ + 1, &rfds, NULL, NULL, &tv);
2085
2086     return (retval > 0);
2087 }
2088
2089 // Wait for a connection from a network traffic generation thread.
2090 bool NetworkListenThread::GetConnection(int *pnewsock) {
2091   struct sockaddr_in sa;
2092   socklen_t size = sizeof(struct sockaddr_in);
2093
2094   int newsock = accept(sock_, reinterpret_cast<struct sockaddr *>(&sa), &size);
2095   if (newsock < 0)  {
2096     logprintf(0, "Process Error: Did not receive connection\n");
2097     pages_copied_ = 0;
2098     status_ = false;
2099     return false;
2100   }
2101   *pnewsock = newsock;
2102   return true;
2103 }
2104
2105 // Send a page, return false if a page was not sent.
2106 bool NetworkThread::SendPage(int sock, struct page_entry *src) {
2107   int page_length = sat_->page_length();
2108   char *address = static_cast<char*>(src->addr);
2109
2110   // Send our data over the network.
2111   int size = page_length;
2112   while (size) {
2113     int transferred = send(sock, address + (page_length - size), size, 0);
2114     if ((transferred == 0) || (transferred == -1)) {
2115       if (!IsNetworkStopSet()) {
2116         char buf[256] = "";
2117         sat_strerror(errno, buf, sizeof(buf));
2118         logprintf(0, "Process Error: Thread %d, "
2119                      "Network write failed, bailing. (%s)\n",
2120                   thread_num_, buf);
2121         status_ = false;
2122       }
2123       return false;
2124     }
2125     size = size - transferred;
2126   }
2127   return true;
2128 }
2129
2130 // Receive a page. Return false if a page was not received.
2131 bool NetworkThread::ReceivePage(int sock, struct page_entry *dst) {
2132   int page_length = sat_->page_length();
2133   char *address = static_cast<char*>(dst->addr);
2134
2135   // Maybe we will get our data back again, maybe not.
2136   int size = page_length;
2137   while (size) {
2138     int transferred = recv(sock, address + (page_length - size), size, 0);
2139     if ((transferred == 0) || (transferred == -1)) {
2140       // Typically network slave thread should exit as network master
2141       // thread stops sending data.
2142       if (IsNetworkStopSet()) {
2143         int err = errno;
2144         if (transferred == 0 && err == 0) {
2145           // Two system setups will not sync exactly,
2146           // allow early exit, but log it.
2147           logprintf(0, "Log: Net thread did not receive any data, exiting.\n");
2148         } else {
2149           char buf[256] = "";
2150           sat_strerror(err, buf, sizeof(buf));
2151           // Print why we failed.
2152           logprintf(0, "Process Error: Thread %d, "
2153                        "Network read failed, bailing (%s).\n",
2154                     thread_num_, buf);
2155           status_ = false;
2156           // Print arguments and results.
2157           logprintf(0, "Log: recv(%d, address %x, size %x, 0) == %x, err %d\n",
2158                     sock, address + (page_length - size),
2159                     size, transferred, err);
2160           if ((transferred == 0) &&
2161               (page_length - size < 512) &&
2162               (page_length - size > 0)) {
2163             // Print null terminated data received, to see who's been
2164             // sending us supicious unwanted data.
2165             address[page_length - size] = 0;
2166             logprintf(0, "Log: received  %d bytes: '%s'\n",
2167                       page_length - size, address);
2168           }
2169         }
2170       }
2171       return false;
2172     }
2173     size = size - transferred;
2174   }
2175   return true;
2176 }
2177
2178 // Network IO work loop. Execute until marked done.
2179 // Return true if the thread ran as expected.
2180 bool NetworkThread::Work() {
2181   logprintf(9, "Log: Starting network thread %d, ip %s\n",
2182             thread_num_,
2183             ipaddr_);
2184
2185   // Make a socket.
2186   int sock = 0;
2187   if (!CreateSocket(&sock))
2188     return false;
2189
2190   // Network IO loop requires network slave thread to have already initialized.
2191   // We will sleep here for awhile to ensure that the slave thread will be
2192   // listening by the time we connect.
2193   // Sleep for 15 seconds.
2194   sat_sleep(15);
2195   logprintf(9, "Log: Starting execution of network thread %d, ip %s\n",
2196             thread_num_,
2197             ipaddr_);
2198
2199
2200   // Connect to a slave thread.
2201   if (!Connect(sock))
2202     return false;
2203
2204   // Loop until done.
2205   bool result = true;
2206   int strict = sat_->strict();
2207   int64 loops = 0;
2208   while (IsReadyToRun()) {
2209     struct page_entry src;
2210     struct page_entry dst;
2211     result = result && sat_->GetValid(&src);
2212     result = result && sat_->GetEmpty(&dst);
2213     if (!result) {
2214       logprintf(0, "Process Error: net_thread failed to pop pages, "
2215                 "bailing\n");
2216       break;
2217     }
2218
2219     // Check data correctness.
2220     if (strict)
2221       CrcCheckPage(&src);
2222
2223     // Do the network write.
2224     if (!(result = result && SendPage(sock, &src)))
2225       break;
2226
2227     // Update pattern reference to reflect new contents.
2228     dst.pattern = src.pattern;
2229
2230     // Do the network read.
2231     if (!(result = result && ReceivePage(sock, &dst)))
2232       break;
2233
2234     // Ensure that the transfer ended up with correct data.
2235     if (strict)
2236       CrcCheckPage(&dst);
2237
2238     // Return all of our pages to the queue.
2239     result = result && sat_->PutValid(&dst);
2240     result = result && sat_->PutEmpty(&src);
2241     if (!result) {
2242       logprintf(0, "Process Error: net_thread failed to push pages, "
2243                 "bailing\n");
2244       break;
2245     }
2246     loops++;
2247   }
2248
2249   pages_copied_ = loops;
2250   status_ = result;
2251
2252   // Clean up.
2253   CloseSocket(sock);
2254
2255   logprintf(9, "Log: Completed %d: network thread status %d, "
2256                "%d pages copied\n",
2257             thread_num_, status_, pages_copied_);
2258   return result;
2259 }
2260
2261 // Spawn slave threads for incoming connections.
2262 bool NetworkListenThread::SpawnSlave(int newsock, int threadid) {
2263   logprintf(12, "Log: Listen thread spawning slave\n");
2264
2265   // Spawn slave thread, to reflect network traffic back to sender.
2266   ChildWorker *child_worker = new ChildWorker;
2267   child_worker->thread.SetSock(newsock);
2268   child_worker->thread.InitThread(threadid, sat_, os_, patternlist_,
2269                                   &child_worker->status);
2270   child_worker->status.Initialize();
2271   child_worker->thread.SpawnThread();
2272   child_workers_.push_back(child_worker);
2273
2274   return true;
2275 }
2276
2277 // Reap slave threads.
2278 bool NetworkListenThread::ReapSlaves() {
2279   bool result = true;
2280   // Gather status and reap threads.
2281   logprintf(12, "Log: Joining all outstanding threads\n");
2282
2283   for (size_t i = 0; i < child_workers_.size(); i++) {
2284     NetworkSlaveThread& child_thread = child_workers_[i]->thread;
2285     logprintf(12, "Log: Joining slave thread %d\n", i);
2286     child_thread.JoinThread();
2287     if (child_thread.GetStatus() != 1) {
2288       logprintf(0, "Process Error: Slave Thread %d failed with status %d\n", i,
2289                 child_thread.GetStatus());
2290       result = false;
2291     }
2292     errorcount_ += child_thread.GetErrorCount();
2293     logprintf(9, "Log: Slave Thread %d found %lld miscompares\n", i,
2294               child_thread.GetErrorCount());
2295     pages_copied_ += child_thread.GetPageCount();
2296   }
2297
2298   return result;
2299 }
2300
2301 // Network listener IO work loop. Execute until marked done.
2302 // Return false on fatal software error.
2303 bool NetworkListenThread::Work() {
2304   logprintf(9, "Log: Starting network listen thread %d\n",
2305             thread_num_);
2306
2307   // Make a socket.
2308   sock_ = 0;
2309   if (!CreateSocket(&sock_)) {
2310     status_ = false;
2311     return false;
2312   }
2313   logprintf(9, "Log: Listen thread created sock\n");
2314
2315   // Allows incoming connections to be queued up by socket library.
2316   int newsock = 0;
2317   Listen();
2318   logprintf(12, "Log: Listen thread waiting for incoming connections\n");
2319
2320   // Wait on incoming connections, and spawn worker threads for them.
2321   int threadcount = 0;
2322   while (IsReadyToRun()) {
2323     // Poll for connections that we can accept().
2324     if (Wait()) {
2325       // Accept those connections.
2326       logprintf(12, "Log: Listen thread found incoming connection\n");
2327       if (GetConnection(&newsock)) {
2328         SpawnSlave(newsock, threadcount);
2329         threadcount++;
2330       }
2331     }
2332   }
2333
2334   // Gather status and join spawned threads.
2335   ReapSlaves();
2336
2337   // Delete the child workers.
2338   for (ChildVector::iterator it = child_workers_.begin();
2339        it != child_workers_.end(); ++it) {
2340     (*it)->status.Destroy();
2341     delete *it;
2342   }
2343   child_workers_.clear();
2344
2345   CloseSocket(sock_);
2346
2347   status_ = true;
2348   logprintf(9,
2349             "Log: Completed %d: network listen thread status %d, "
2350             "%d pages copied\n",
2351             thread_num_, status_, pages_copied_);
2352   return true;
2353 }
2354
2355 // Set network reflector socket struct.
2356 void NetworkSlaveThread::SetSock(int sock) {
2357   sock_ = sock;
2358 }
2359
2360 // Network reflector IO work loop. Execute until marked done.
2361 // Return false on fatal software error.
2362 bool NetworkSlaveThread::Work() {
2363   logprintf(9, "Log: Starting network slave thread %d\n",
2364             thread_num_);
2365
2366   // Verify that we have a socket.
2367   int sock = sock_;
2368   if (!sock) {
2369     status_ = false;
2370     return false;
2371   }
2372
2373   // Loop until done.
2374   int64 loops = 0;
2375   // Init a local buffer for storing data.
2376   void *local_page = NULL;
2377 #ifdef HAVE_POSIX_MEMALIGN
2378   int result = posix_memalign(&local_page, 512, sat_->page_length());
2379 #else
2380   local_page = memalign(512, sat_->page_length());
2381   int result = (local_page == 0);
2382 #endif
2383   if (result) {
2384     logprintf(0, "Process Error: net slave posix_memalign "
2385                  "returned %d (fail)\n",
2386               result);
2387     status_ = false;
2388     return false;
2389   }
2390
2391   struct page_entry page;
2392   page.addr = local_page;
2393
2394   // This thread will continue to run as long as the thread on the other end of
2395   // the socket is still sending and receiving data.
2396   while (1) {
2397     // Do the network read.
2398     if (!ReceivePage(sock, &page))
2399       break;
2400
2401     // Do the network write.
2402     if (!SendPage(sock, &page))
2403       break;
2404
2405     loops++;
2406   }
2407
2408   pages_copied_ = loops;
2409   // No results provided from this type of thread.
2410   status_ = true;
2411
2412   // Clean up.
2413   CloseSocket(sock);
2414
2415   logprintf(9,
2416             "Log: Completed %d: network slave thread status %d, "
2417             "%d pages copied\n",
2418             thread_num_, status_, pages_copied_);
2419   return true;
2420 }
2421
2422 // Thread work loop. Execute until marked finished.
2423 bool ErrorPollThread::Work() {
2424   logprintf(9, "Log: Starting system error poll thread %d\n", thread_num_);
2425
2426   // This calls a generic error polling function in the Os abstraction layer.
2427   do {
2428     errorcount_ += os_->ErrorPoll();
2429     os_->ErrorWait();
2430   } while (IsReadyToRun());
2431
2432   logprintf(9, "Log: Finished system error poll thread %d: %d errors\n",
2433             thread_num_, errorcount_);
2434   status_ = true;
2435   return true;
2436 }
2437
2438 // Worker thread to heat up CPU.
2439 // This thread does not evaluate pass/fail or software error.
2440 bool CpuStressThread::Work() {
2441   logprintf(9, "Log: Starting CPU stress thread %d\n", thread_num_);
2442
2443   do {
2444     // Run ludloff's platform/CPU-specific assembly workload.
2445     os_->CpuStressWorkload();
2446     YieldSelf();
2447   } while (IsReadyToRun());
2448
2449   logprintf(9, "Log: Finished CPU stress thread %d:\n",
2450             thread_num_);
2451   status_ = true;
2452   return true;
2453 }
2454
2455 CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data,
2456                                                  int cacheline_count,
2457                                                  int thread_num,
2458                                                  int thread_count,
2459                                                  int inc_count) {
2460   cc_cacheline_data_ = data;
2461   cc_cacheline_count_ = cacheline_count;
2462   cc_thread_num_ = thread_num;
2463   cc_thread_count_ = thread_count;
2464   cc_inc_count_ = inc_count;
2465 }
2466
2467 // A very simple psuedorandom generator.  Since the random number is based
2468 // on only a few simple logic operations, it can be done quickly in registers
2469 // and the compiler can inline it.
2470 uint64 CpuCacheCoherencyThread::SimpleRandom(uint64 seed) {
2471   return (seed >> 1) ^ (-(seed & 1) & kRandomPolynomial);
2472 }
2473
2474 // Worked thread to test the cache coherency of the CPUs
2475 // Return false on fatal sw error.
2476 bool CpuCacheCoherencyThread::Work() {
2477   logprintf(9, "Log: Starting the Cache Coherency thread %d\n",
2478             cc_thread_num_);
2479   uint64 time_start, time_end;
2480   struct timeval tv;
2481
2482   // Use a slightly more robust random number for the initial
2483   // value, so the random sequences from the simple generator will
2484   // be more divergent.
2485 #ifdef HAVE_RAND_R
2486   unsigned int seed = static_cast<unsigned int>(gettid());
2487   uint64 r = static_cast<uint64>(rand_r(&seed));
2488   r |= static_cast<uint64>(rand_r(&seed)) << 32;
2489 #else
2490   srand(time(NULL));
2491   uint64 r = static_cast<uint64>(rand());  // NOLINT
2492   r |= static_cast<uint64>(rand()) << 32;  // NOLINT
2493 #endif
2494
2495   gettimeofday(&tv, NULL);  // Get the timestamp before increments.
2496   time_start = tv.tv_sec * 1000000ULL + tv.tv_usec;
2497
2498   uint64 total_inc = 0;  // Total increments done by the thread.
2499   while (IsReadyToRun()) {
2500     for (int i = 0; i < cc_inc_count_; i++) {
2501       // Choose a datastructure in random and increment the appropriate
2502       // member in that according to the offset (which is the same as the
2503       // thread number.
2504       r = SimpleRandom(r);
2505       int cline_num = r % cc_cacheline_count_;
2506       int offset;
2507       // Reverse the order for odd numbered threads in odd numbered cache
2508       // lines.  This is designed for massively multi-core systems where the
2509       // number of cores exceeds the bytes in a cache line, so "distant" cores
2510       // get a chance to exercize cache coherency between them.
2511       if (cline_num & cc_thread_num_ & 1)
2512         offset = (cc_thread_count_ & ~1) - cc_thread_num_;
2513       else
2514         offset = cc_thread_num_;
2515       // Increment the member of the randomely selected structure.
2516       (cc_cacheline_data_[cline_num].num[offset])++;
2517     }
2518
2519     total_inc += cc_inc_count_;
2520
2521     // Calculate if the local counter matches with the global value
2522     // in all the cache line structures for this particular thread.
2523     int cc_global_num = 0;
2524     for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) {
2525       int offset;
2526       // Perform the same offset calculation from above.
2527       if (cline_num & cc_thread_num_ & 1)
2528         offset = (cc_thread_count_ & ~1) - cc_thread_num_;
2529       else
2530         offset = cc_thread_num_;
2531       cc_global_num += cc_cacheline_data_[cline_num].num[offset];
2532       // Reset the cachline member's value for the next run.
2533       cc_cacheline_data_[cline_num].num[offset] = 0;
2534     }
2535     if (sat_->error_injection())
2536       cc_global_num = -1;
2537
2538     // Since the count is only stored in a byte, to squeeze more into a
2539     // single cache line, only compare it as a byte.  In the event that there
2540     // is something detected, the chance that it would be missed by a single
2541     // thread is 1 in 256.  If it affects all cores, that makes the chance
2542     // of it being missed terribly minute.  It seems unlikely any failure
2543     // case would be off by more than a small number.
2544     if ((cc_global_num & 0xff) != (cc_inc_count_ & 0xff)) {
2545       errorcount_++;
2546       logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n",
2547                 cc_global_num, cc_inc_count_);
2548     }
2549   }
2550   gettimeofday(&tv, NULL);  // Get the timestamp at the end.
2551   time_end = tv.tv_sec * 1000000ULL + tv.tv_usec;
2552
2553   uint64 us_elapsed = time_end - time_start;
2554   // inc_rate is the no. of increments per second.
2555   double inc_rate = total_inc * 1e6 / us_elapsed;
2556
2557   logprintf(4, "Stats: CC Thread(%d): Time=%llu us,"
2558             " Increments=%llu, Increments/sec = %.6lf\n",
2559             cc_thread_num_, us_elapsed, total_inc, inc_rate);
2560   logprintf(9, "Log: Finished CPU Cache Coherency thread %d:\n",
2561             cc_thread_num_);
2562   status_ = true;
2563   return true;
2564 }
2565
2566 DiskThread::DiskThread(DiskBlockTable *block_table) {
2567   read_block_size_ = kSectorSize;   // default 1 sector (512 bytes)
2568   write_block_size_ = kSectorSize;  // this assumes read and write block size
2569                                     // are the same
2570   segment_size_ = -1;               // use the entire disk as one segment
2571   cache_size_ = 16 * 1024 * 1024;   // assume 16MiB cache by default
2572   // Use a queue such that 3/2 times as much data as the cache can hold
2573   // is written before it is read so that there is little chance the read
2574   // data is in the cache.
2575   queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
2576   blocks_per_segment_ = 32;
2577
2578   read_threshold_ = 100000;         // 100ms is a reasonable limit for
2579   write_threshold_ = 100000;        // reading/writing a sector
2580
2581   read_timeout_ = 5000000;          // 5 seconds should be long enough for a
2582   write_timeout_ = 5000000;         // timout for reading/writing
2583
2584   device_sectors_ = 0;
2585   non_destructive_ = 0;
2586
2587 #ifdef HAVE_LIBAIO_H
2588   aio_ctx_ = 0;
2589 #endif
2590   block_table_ = block_table;
2591   update_block_table_ = 1;
2592
2593   block_buffer_ = NULL;
2594
2595   blocks_written_ = 0;
2596   blocks_read_ = 0;
2597 }
2598
2599 DiskThread::~DiskThread() {
2600   if (block_buffer_)
2601     free(block_buffer_);
2602 }
2603
2604 // Set filename for device file (in /dev).
2605 void DiskThread::SetDevice(const char *device_name) {
2606   device_name_ = device_name;
2607 }
2608
2609 // Set various parameters that control the behaviour of the test.
2610 // -1 is used as a sentinel value on each parameter (except non_destructive)
2611 // to indicate that the parameter not be set.
2612 bool DiskThread::SetParameters(int read_block_size,
2613                                int write_block_size,
2614                                int64 segment_size,
2615                                int64 cache_size,
2616                                int blocks_per_segment,
2617                                int64 read_threshold,
2618                                int64 write_threshold,
2619                                int non_destructive) {
2620   if (read_block_size != -1) {
2621     // Blocks must be aligned to the disk's sector size.
2622     if (read_block_size % kSectorSize != 0) {
2623       logprintf(0, "Process Error: Block size must be a multiple of %d "
2624                 "(thread %d).\n", kSectorSize, thread_num_);
2625       return false;
2626     }
2627
2628     read_block_size_ = read_block_size;
2629   }
2630
2631   if (write_block_size != -1) {
2632     // Write blocks must be aligned to the disk's sector size and to the
2633     // block size.
2634     if (write_block_size % kSectorSize != 0) {
2635       logprintf(0, "Process Error: Write block size must be a multiple "
2636                 "of %d (thread %d).\n", kSectorSize, thread_num_);
2637       return false;
2638     }
2639     if (write_block_size % read_block_size_ != 0) {
2640       logprintf(0, "Process Error: Write block size must be a multiple "
2641                 "of the read block size, which is %d (thread %d).\n",
2642                 read_block_size_, thread_num_);
2643       return false;
2644     }
2645
2646     write_block_size_ = write_block_size;
2647
2648   } else {
2649     // Make sure write_block_size_ is still valid.
2650     if (read_block_size_ > write_block_size_) {
2651       logprintf(5, "Log: Assuming write block size equal to read block size, "
2652                 "which is %d (thread %d).\n", read_block_size_,
2653                 thread_num_);
2654       write_block_size_ = read_block_size_;
2655     } else {
2656       if (write_block_size_ % read_block_size_ != 0) {
2657         logprintf(0, "Process Error: Write block size (defined as %d) must "
2658                   "be a multiple of the read block size, which is %d "
2659                   "(thread %d).\n", write_block_size_, read_block_size_,
2660                   thread_num_);
2661         return false;
2662       }
2663     }
2664   }
2665
2666   if (cache_size != -1) {
2667     cache_size_ = cache_size;
2668   }
2669
2670   if (blocks_per_segment != -1) {
2671     if (blocks_per_segment <= 0) {
2672       logprintf(0, "Process Error: Blocks per segment must be greater than "
2673                    "zero.\n (thread %d)", thread_num_);
2674       return false;
2675     }
2676
2677     blocks_per_segment_ = blocks_per_segment;
2678   }
2679
2680   if (read_threshold != -1) {
2681     if (read_threshold <= 0) {
2682       logprintf(0, "Process Error: Read threshold must be greater than "
2683                    "zero (thread %d).\n", thread_num_);
2684       return false;
2685     }
2686
2687     read_threshold_ = read_threshold;
2688   }
2689
2690   if (write_threshold != -1) {
2691     if (write_threshold <= 0) {
2692       logprintf(0, "Process Error: Write threshold must be greater than "
2693                    "zero (thread %d).\n", thread_num_);
2694       return false;
2695     }
2696
2697     write_threshold_ = write_threshold;
2698   }
2699
2700   if (segment_size != -1) {
2701     // Segments must be aligned to the disk's sector size.
2702     if (segment_size % kSectorSize != 0) {
2703       logprintf(0, "Process Error: Segment size must be a multiple of %d"
2704                 " (thread %d).\n", kSectorSize, thread_num_);
2705       return false;
2706     }
2707
2708     segment_size_ = segment_size / kSectorSize;
2709   }
2710
2711   non_destructive_ = non_destructive;
2712
2713   // Having a queue of 150% of blocks that will fit in the disk's cache
2714   // should be enough to force out the oldest block before it is read and hence,
2715   // making sure the data comes form the disk and not the cache.
2716   queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
2717   // Updating DiskBlockTable parameters
2718   if (update_block_table_) {
2719     block_table_->SetParameters(kSectorSize, write_block_size_,
2720                                 device_sectors_, segment_size_,
2721                                 device_name_);
2722   }
2723   return true;
2724 }
2725
2726 // Open a device, return false on failure.
2727 bool DiskThread::OpenDevice(int *pfile) {
2728   int flags = O_RDWR | O_SYNC | O_LARGEFILE;
2729   int fd = open(device_name_.c_str(), flags | O_DIRECT, 0);
2730   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
2731     fd = open(device_name_.c_str(), flags, 0);  // Try without O_DIRECT
2732     os_->ActivateFlushPageCache();
2733   }
2734   if (fd < 0) {
2735     logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n",
2736               device_name_.c_str(), thread_num_);
2737     return false;
2738   }
2739   *pfile = fd;
2740
2741   return GetDiskSize(fd);
2742 }
2743
2744 // Retrieves the size (in bytes) of the disk/file.
2745 // Return false on failure.
2746 bool DiskThread::GetDiskSize(int fd) {
2747   struct stat device_stat;
2748   if (fstat(fd, &device_stat) == -1) {
2749     logprintf(0, "Process Error: Unable to fstat disk %s (thread %d).\n",
2750               device_name_.c_str(), thread_num_);
2751     return false;
2752   }
2753
2754   // For a block device, an ioctl is needed to get the size since the size
2755   // of the device file (i.e. /dev/sdb) is 0.
2756   if (S_ISBLK(device_stat.st_mode)) {
2757     uint64 block_size = 0;
2758
2759     if (ioctl(fd, BLKGETSIZE64, &block_size) == -1) {
2760       logprintf(0, "Process Error: Unable to ioctl disk %s (thread %d).\n",
2761                 device_name_.c_str(), thread_num_);
2762       return false;
2763     }
2764
2765     // Zero size indicates nonworking device..
2766     if (block_size == 0) {
2767       os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
2768       ++errorcount_;
2769       status_ = true;  // Avoid a procedural error.
2770       return false;
2771     }
2772
2773     device_sectors_ = block_size / kSectorSize;
2774
2775   } else if (S_ISREG(device_stat.st_mode)) {
2776     device_sectors_ = device_stat.st_size / kSectorSize;
2777
2778   } else {
2779     logprintf(0, "Process Error: %s is not a regular file or block "
2780               "device (thread %d).\n", device_name_.c_str(),
2781               thread_num_);
2782     return false;
2783   }
2784
2785   logprintf(12, "Log: Device sectors: %lld on disk %s (thread %d).\n",
2786             device_sectors_, device_name_.c_str(), thread_num_);
2787
2788   if (update_block_table_) {
2789     block_table_->SetParameters(kSectorSize, write_block_size_,
2790                                 device_sectors_, segment_size_,
2791                                 device_name_);
2792   }
2793
2794   return true;
2795 }
2796
2797 bool DiskThread::CloseDevice(int fd) {
2798   close(fd);
2799   return true;
2800 }
2801
2802 // Return the time in microseconds.
2803 int64 DiskThread::GetTime() {
2804   struct timeval tv;
2805   gettimeofday(&tv, NULL);
2806   return tv.tv_sec * 1000000 + tv.tv_usec;
2807 }
2808
2809 // Do randomized reads and (possibly) writes on a device.
2810 // Return false on fatal SW error, true on SW success,
2811 // regardless of whether HW failed.
2812 bool DiskThread::DoWork(int fd) {
2813   int64 block_num = 0;
2814   int64 num_segments;
2815
2816   if (segment_size_ == -1) {
2817     num_segments = 1;
2818   } else {
2819     num_segments = device_sectors_ / segment_size_;
2820     if (device_sectors_ % segment_size_ != 0)
2821       num_segments++;
2822   }
2823
2824   // Disk size should be at least 3x cache size.  See comment later for
2825   // details.
2826   sat_assert(device_sectors_ * kSectorSize > 3 * cache_size_);
2827
2828   // This disk test works by writing blocks with a certain pattern to
2829   // disk, then reading them back and verifying it against the pattern
2830   // at a later time.  A failure happens when either the block cannot
2831   // be written/read or when the read block is different than what was
2832   // written.  If a block takes too long to write/read, then a warning
2833   // is given instead of an error since taking too long is not
2834   // necessarily an error.
2835   //
2836   // To prevent the read blocks from coming from the disk cache,
2837   // enough blocks are written before read such that a block would
2838   // be ejected from the disk cache by the time it is read.
2839   //
2840   // TODO(amistry): Implement some sort of read/write throttling.  The
2841   //                flood of asynchronous I/O requests when a drive is
2842   //                unplugged is causing the application and kernel to
2843   //                become unresponsive.
2844
2845   while (IsReadyToRun()) {
2846     // Write blocks to disk.
2847     logprintf(16, "Log: Write phase %sfor disk %s (thread %d).\n",
2848               non_destructive_ ? "(disabled) " : "",
2849               device_name_.c_str(), thread_num_);
2850     while (IsReadyToRunNoPause() &&
2851            in_flight_sectors_.size() <
2852                static_cast<size_t>(queue_size_ + 1)) {
2853       // Confine testing to a particular segment of the disk.
2854       int64 segment = (block_num / blocks_per_segment_) % num_segments;
2855       if (!non_destructive_ &&
2856           (block_num % blocks_per_segment_ == 0)) {
2857         logprintf(20, "Log: Starting to write segment %lld out of "
2858                   "%lld on disk %s (thread %d).\n",
2859                   segment, num_segments, device_name_.c_str(),
2860                   thread_num_);
2861       }
2862       block_num++;
2863
2864       BlockData *block = block_table_->GetUnusedBlock(segment);
2865
2866       // If an unused sequence of sectors could not be found, skip to the
2867       // next block to process.  Soon, a new segment will come and new
2868       // sectors will be able to be allocated.  This effectively puts a
2869       // minumim on the disk size at 3x the stated cache size, or 48MiB
2870       // if a cache size is not given (since the cache is set as 16MiB
2871       // by default).  Given that todays caches are at the low MiB range
2872       // and drive sizes at the mid GB, this shouldn't pose a problem.
2873       // The 3x minimum comes from the following:
2874       //   1. In order to allocate 'y' blocks from a segment, the
2875       //      segment must contain at least 2y blocks or else an
2876       //      allocation may not succeed.
2877       //   2. Assume the entire disk is one segment.
2878       //   3. A full write phase consists of writing blocks corresponding to
2879       //      3/2 cache size.
2880       //   4. Therefore, the one segment must have 2 * 3/2 * cache
2881       //      size worth of blocks = 3 * cache size worth of blocks
2882       //      to complete.
2883       // In non-destructive mode, don't write anything to disk.
2884       if (!non_destructive_) {
2885         if (!WriteBlockToDisk(fd, block)) {
2886           block_table_->RemoveBlock(block);
2887           return true;
2888         }
2889         blocks_written_++;
2890       }
2891
2892       // Block is either initialized by writing, or in nondestructive case,
2893       // initialized by being added into the datastructure for later reading.
2894       block->initialized();
2895
2896       in_flight_sectors_.push(block);
2897     }
2898     if (!os_->FlushPageCache())  // If O_DIRECT worked, this will be a NOP.
2899       return false;
2900
2901     // Verify blocks on disk.
2902     logprintf(20, "Log: Read phase for disk %s (thread %d).\n",
2903               device_name_.c_str(), thread_num_);
2904     while (IsReadyToRunNoPause() && !in_flight_sectors_.empty()) {
2905       BlockData *block = in_flight_sectors_.front();
2906       in_flight_sectors_.pop();
2907       if (!ValidateBlockOnDisk(fd, block))
2908         return true;
2909       block_table_->RemoveBlock(block);
2910       blocks_read_++;
2911     }
2912   }
2913
2914   pages_copied_ = blocks_written_ + blocks_read_;
2915   return true;
2916 }
2917
2918 // Do an asynchronous disk I/O operation.
2919 // Return false if the IO is not set up.
2920 bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
2921                             int64 offset, int64 timeout) {
2922 #ifdef HAVE_LIBAIO_H
2923   // Use the Linux native asynchronous I/O interface for reading/writing.
2924   // A read/write consists of three basic steps:
2925   //    1. create an io context.
2926   //    2. prepare and submit an io request to the context
2927   //    3. wait for an event on the context.
2928
2929   struct {
2930     const int opcode;
2931     const char *op_str;
2932     const char *error_str;
2933   } operations[2] = {
2934     { IO_CMD_PREAD, "read", "disk-read-error" },
2935     { IO_CMD_PWRITE, "write", "disk-write-error" }
2936   };
2937
2938   struct iocb cb;
2939   memset(&cb, 0, sizeof(cb));
2940
2941   cb.aio_fildes = fd;
2942   cb.aio_lio_opcode = operations[op].opcode;
2943   cb.u.c.buf = buf;
2944   cb.u.c.nbytes = size;
2945   cb.u.c.offset = offset;
2946
2947   struct iocb *cbs[] = { &cb };
2948   if (io_submit(aio_ctx_, 1, cbs) != 1) {
2949     int error = errno;
2950     char buf[256];
2951     sat_strerror(error, buf, sizeof(buf));
2952     logprintf(0, "Process Error: Unable to submit async %s "
2953                  "on disk %s (thread %d). Error %d, %s\n",
2954               operations[op].op_str, device_name_.c_str(),
2955               thread_num_, error, buf);
2956     return false;
2957   }
2958
2959   struct io_event event;
2960   memset(&event, 0, sizeof(event));
2961   struct timespec tv;
2962   tv.tv_sec = timeout / 1000000;
2963   tv.tv_nsec = (timeout % 1000000) * 1000;
2964   if (io_getevents(aio_ctx_, 1, 1, &event, &tv) != 1) {
2965     // A ctrl-c from the keyboard will cause io_getevents to fail with an
2966     // EINTR error code.  This is not an error and so don't treat it as such,
2967     // but still log it.
2968     int error = errno;
2969     if (error == EINTR) {
2970       logprintf(5, "Log: %s interrupted on disk %s (thread %d).\n",
2971                 operations[op].op_str, device_name_.c_str(),
2972                 thread_num_);
2973     } else {
2974       os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
2975       errorcount_ += 1;
2976       logprintf(0, "Hardware Error: Timeout doing async %s to sectors "
2977                    "starting at %lld on disk %s (thread %d).\n",
2978                 operations[op].op_str, offset / kSectorSize,
2979                 device_name_.c_str(), thread_num_);
2980     }
2981
2982     // Don't bother checking return codes since io_cancel seems to always fail.
2983     // Since io_cancel is always failing, destroying and recreating an I/O
2984     // context is a workaround for canceling an in-progress I/O operation.
2985     // TODO(amistry): Find out why io_cancel isn't working and make it work.
2986     io_cancel(aio_ctx_, &cb, &event);
2987     io_destroy(aio_ctx_);
2988     aio_ctx_ = 0;
2989     if (io_setup(5, &aio_ctx_)) {
2990       int error = errno;
2991       char buf[256];
2992       sat_strerror(error, buf, sizeof(buf));
2993       logprintf(0, "Process Error: Unable to create aio context on disk %s"
2994                 " (thread %d) Error %d, %s\n",
2995                 device_name_.c_str(), thread_num_, error, buf);
2996     }
2997
2998     return false;
2999   }
3000
3001   // event.res contains the number of bytes written/read or
3002   // error if < 0, I think.
3003   if (event.res != static_cast<uint64>(size)) {
3004     errorcount_++;
3005     os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
3006
3007     int64 result = static_cast<int64>(event.res);
3008     if (result < 0) {
3009       switch (result) {
3010         case -EIO:
3011           logprintf(0, "Hardware Error: Low-level I/O error while doing %s to "
3012                        "sectors starting at %lld on disk %s (thread %d).\n",
3013                     operations[op].op_str, offset / kSectorSize,
3014                     device_name_.c_str(), thread_num_);
3015           break;
3016         default:
3017           logprintf(0, "Hardware Error: Unknown error while doing %s to "
3018                        "sectors starting at %lld on disk %s (thread %d).\n",
3019                     operations[op].op_str, offset / kSectorSize,
3020                     device_name_.c_str(), thread_num_);
3021       }
3022     } else {
3023       logprintf(0, "Hardware Error: Unable to %s to sectors starting at "
3024                    "%lld on disk %s (thread %d).\n",
3025                 operations[op].op_str, offset / kSectorSize,
3026                 device_name_.c_str(), thread_num_);
3027     }
3028     return false;
3029   }
3030
3031   return true;
3032 #else  // !HAVE_LIBAIO_H
3033   return false;
3034 #endif
3035 }
3036
3037 // Write a block to disk.
3038 // Return false if the block is not written.
3039 bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) {
3040   memset(block_buffer_, 0, block->size());
3041
3042   // Fill block buffer with a pattern
3043   struct page_entry pe;
3044   if (!sat_->GetValid(&pe)) {
3045     // Even though a valid page could not be obatined, it is not an error
3046     // since we can always fill in a pattern directly, albeit slower.
3047     unsigned int *memblock = static_cast<unsigned int *>(block_buffer_);
3048     block->set_pattern(patternlist_->GetRandomPattern());
3049
3050     logprintf(11, "Log: Warning, using pattern fill fallback in "
3051                   "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n",
3052               device_name_.c_str(), thread_num_);
3053
3054     for (unsigned int i = 0; i < block->size()/wordsize_; i++) {
3055       memblock[i] = block->pattern()->pattern(i);
3056     }
3057   } else {
3058     memcpy(block_buffer_, pe.addr, block->size());
3059     block->set_pattern(pe.pattern);
3060     sat_->PutValid(&pe);
3061   }
3062
3063   logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s"
3064             " (thread %d).\n",
3065             block->size()/kSectorSize, block->address(),
3066             device_name_.c_str(), thread_num_);
3067
3068   int64 start_time = GetTime();
3069
3070   if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->size(),
3071                    block->address() * kSectorSize, write_timeout_)) {
3072     return false;
3073   }
3074
3075   int64 end_time = GetTime();
3076   logprintf(12, "Log: Writing time: %lld us (thread %d).\n",
3077             end_time - start_time, thread_num_);
3078   if (end_time - start_time > write_threshold_) {
3079     logprintf(5, "Log: Write took %lld us which is longer than threshold "
3080                  "%lld us on disk %s (thread %d).\n",
3081               end_time - start_time, write_threshold_, device_name_.c_str(),
3082               thread_num_);
3083   }
3084
3085   return true;
3086 }
3087
3088 // Verify a block on disk.
3089 // Return true if the block was read, also increment errorcount
3090 // if the block had data errors or performance problems.
3091 bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
3092   int64 blocks = block->size() / read_block_size_;
3093   int64 bytes_read = 0;
3094   int64 current_blocks;
3095   int64 current_bytes;
3096   uint64 address = block->address();
3097
3098   logprintf(20, "Log: Reading sectors starting at %lld on disk %s "
3099             "(thread %d).\n",
3100             address, device_name_.c_str(), thread_num_);
3101
3102   // Read block from disk and time the read.  If it takes longer than the
3103   // threshold, complain.
3104   if (lseek64(fd, address * kSectorSize, SEEK_SET) == -1) {
3105     logprintf(0, "Process Error: Unable to seek to sector %lld in "
3106               "DiskThread::ValidateSectorsOnDisk on disk %s "
3107               "(thread %d).\n", address, device_name_.c_str(), thread_num_);
3108     return false;
3109   }
3110   int64 start_time = GetTime();
3111
3112   // Split a large write-sized block into small read-sized blocks and
3113   // read them in groups of randomly-sized multiples of read block size.
3114   // This assures all data written on disk by this particular block
3115   // will be tested using a random reading pattern.
3116   while (blocks != 0) {
3117     // Test all read blocks in a written block.
3118     current_blocks = (random() % blocks) + 1;
3119     current_bytes = current_blocks * read_block_size_;
3120
3121     memset(block_buffer_, 0, current_bytes);
3122
3123     logprintf(20, "Log: Reading %lld sectors starting at sector %lld on "
3124               "disk %s (thread %d)\n",
3125               current_bytes / kSectorSize,
3126               (address * kSectorSize + bytes_read) / kSectorSize,
3127               device_name_.c_str(), thread_num_);
3128
3129     if (!AsyncDiskIO(ASYNC_IO_READ, fd, block_buffer_, current_bytes,
3130                      address * kSectorSize + bytes_read,
3131                      write_timeout_)) {
3132       return false;
3133     }
3134
3135     int64 end_time = GetTime();
3136     logprintf(20, "Log: Reading time: %lld us (thread %d).\n",
3137               end_time - start_time, thread_num_);
3138     if (end_time - start_time > read_threshold_) {
3139       logprintf(5, "Log: Read took %lld us which is longer than threshold "
3140                 "%lld us on disk %s (thread %d).\n",
3141                 end_time - start_time, read_threshold_,
3142                 device_name_.c_str(), thread_num_);
3143     }
3144
3145     // In non-destructive mode, don't compare the block to the pattern since
3146     // the block was never written to disk in the first place.
3147     if (!non_destructive_) {
3148       if (CheckRegion(block_buffer_, block->pattern(), current_bytes,
3149                       0, bytes_read)) {
3150         os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
3151         errorcount_ += 1;
3152         logprintf(0, "Hardware Error: Pattern mismatch in block starting at "
3153                   "sector %lld in DiskThread::ValidateSectorsOnDisk on "
3154                   "disk %s (thread %d).\n",
3155                   address, device_name_.c_str(), thread_num_);
3156       }
3157     }
3158
3159     bytes_read += current_blocks * read_block_size_;
3160     blocks -= current_blocks;
3161   }
3162
3163   return true;
3164 }
3165
3166 // Direct device access thread.
3167 // Return false on software error.
3168 bool DiskThread::Work() {
3169   int fd;
3170
3171   logprintf(9, "Log: Starting disk thread %d, disk %s\n",
3172             thread_num_, device_name_.c_str());
3173
3174   srandom(time(NULL));
3175
3176   if (!OpenDevice(&fd)) {
3177     status_ = false;
3178     return false;
3179   }
3180
3181   // Allocate a block buffer aligned to 512 bytes since the kernel requires it
3182   // when using direct IO.
3183 #ifdef HAVE_POSIX_MEMALIGN
3184   int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment,
3185                                        sat_->page_length());
3186 #else
3187   block_buffer_ = memalign(kBufferAlignment, sat_->page_length());
3188   int memalign_result = (block_buffer_ == 0);
3189 #endif
3190   if (memalign_result) {
3191     CloseDevice(fd);
3192     logprintf(0, "Process Error: Unable to allocate memory for buffers "
3193                  "for disk %s (thread %d) posix memalign returned %d.\n",
3194               device_name_.c_str(), thread_num_, memalign_result);
3195     status_ = false;
3196     return false;
3197   }
3198
3199 #ifdef HAVE_LIBAIO_H
3200   if (io_setup(5, &aio_ctx_)) {
3201     CloseDevice(fd);
3202     logprintf(0, "Process Error: Unable to create aio context for disk %s"
3203               " (thread %d).\n",
3204               device_name_.c_str(), thread_num_);
3205     status_ = false;
3206     return false;
3207   }
3208 #endif
3209
3210   bool result = DoWork(fd);
3211
3212   status_ = result;
3213
3214 #ifdef HAVE_LIBAIO_H
3215   io_destroy(aio_ctx_);
3216 #endif
3217   CloseDevice(fd);
3218
3219   logprintf(9, "Log: Completed %d (disk %s): disk thread status %d, "
3220                "%d pages copied\n",
3221             thread_num_, device_name_.c_str(), status_, pages_copied_);
3222   return result;
3223 }
3224
3225 RandomDiskThread::RandomDiskThread(DiskBlockTable *block_table)
3226     : DiskThread(block_table) {
3227   update_block_table_ = 0;
3228 }
3229
3230 RandomDiskThread::~RandomDiskThread() {
3231 }
3232
3233 // Workload for random disk thread.
3234 bool RandomDiskThread::DoWork(int fd) {
3235   logprintf(11, "Log: Random phase for disk %s (thread %d).\n",
3236             device_name_.c_str(), thread_num_);
3237   while (IsReadyToRun()) {
3238     BlockData *block = block_table_->GetRandomBlock();
3239     if (block == NULL) {
3240       logprintf(12, "Log: No block available for device %s (thread %d).\n",
3241                 device_name_.c_str(), thread_num_);
3242     } else {
3243       ValidateBlockOnDisk(fd, block);
3244       block_table_->ReleaseBlock(block);
3245       blocks_read_++;
3246     }
3247   }
3248   pages_copied_ = blocks_read_;
3249   return true;
3250 }
3251
3252 MemoryRegionThread::MemoryRegionThread() {
3253   error_injection_ = false;
3254   pages_ = NULL;
3255 }
3256
3257 MemoryRegionThread::~MemoryRegionThread() {
3258   if (pages_ != NULL)
3259     delete pages_;
3260 }
3261
3262 // Set a region of memory or MMIO to be tested.
3263 // Return false if region could not be mapped.
3264 bool MemoryRegionThread::SetRegion(void *region, int64 size) {
3265   int plength = sat_->page_length();
3266   int npages = size / plength;
3267   if (size % plength) {
3268     logprintf(0, "Process Error: region size is not a multiple of SAT "
3269               "page length\n");
3270     return false;
3271   } else {
3272     if (pages_ != NULL)
3273       delete pages_;
3274     pages_ = new PageEntryQueue(npages);
3275     char *base_addr = reinterpret_cast<char*>(region);
3276     region_ = base_addr;
3277     for (int i = 0; i < npages; i++) {
3278       struct page_entry pe;
3279       init_pe(&pe);
3280       pe.addr = reinterpret_cast<void*>(base_addr + i * plength);
3281       pe.offset = i * plength;
3282
3283       pages_->Push(&pe);
3284     }
3285     return true;
3286   }
3287 }
3288
3289 // More detailed error printout for hardware errors in memory or MMIO
3290 // regions.
3291 void MemoryRegionThread::ProcessError(struct ErrorRecord *error,
3292                                       int priority,
3293                                       const char *message) {
3294   uint32 buffer_offset;
3295   if (phase_ == kPhaseCopy) {
3296     // If the error occurred on the Copy Phase, it means that
3297     // the source data (i.e., the main memory) is wrong. so
3298     // just pass it to the original ProcessError to call a
3299     // bad-dimm error
3300     WorkerThread::ProcessError(error, priority, message);
3301   } else if (phase_ == kPhaseCheck) {
3302     // A error on the Check Phase means that the memory region tested
3303     // has an error. Gathering more information and then reporting
3304     // the error.
3305     // Determine if this is a write or read error.
3306     os_->Flush(error->vaddr);
3307     error->reread = *(error->vaddr);
3308     char *good = reinterpret_cast<char*>(&(error->expected));
3309     char *bad = reinterpret_cast<char*>(&(error->actual));
3310     sat_assert(error->expected != error->actual);
3311     unsigned int offset = 0;
3312     for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
3313       if (good[offset] != bad[offset])
3314         break;
3315     }
3316
3317     error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;
3318
3319     buffer_offset = error->vbyteaddr - region_;
3320
3321     // Find physical address if possible.
3322     error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
3323     logprintf(priority,
3324               "%s: miscompare on %s, CRC check at %p(0x%llx), "
3325               "offset %llx: read:0x%016llx, reread:0x%016llx "
3326               "expected:0x%016llx\n",
3327               message,
3328               identifier_.c_str(),
3329               error->vaddr,
3330               error->paddr,
3331               buffer_offset,
3332               error->actual,
3333               error->reread,
3334               error->expected);
3335   } else {
3336     logprintf(0, "Process Error: memory region thread raised an "
3337               "unexpected error.");
3338   }
3339 }
3340
3341 // Workload for testion memory or MMIO regions.
3342 // Return false on software error.
3343 bool MemoryRegionThread::Work() {
3344   struct page_entry source_pe;
3345   struct page_entry memregion_pe;
3346   bool result = true;
3347   int64 loops = 0;
3348   const uint64 error_constant = 0x00ba00000000ba00LL;
3349
3350   // For error injection.
3351   int64 *addr = 0x0;
3352   int offset = 0;
3353   int64 data = 0;
3354
3355   logprintf(9, "Log: Starting Memory Region thread %d\n", thread_num_);
3356
3357   while (IsReadyToRun()) {
3358     // Getting pages from SAT and queue.
3359     phase_ = kPhaseNoPhase;
3360     result = result && sat_->GetValid(&source_pe);
3361     if (!result) {
3362       logprintf(0, "Process Error: memory region thread failed to pop "
3363                 "pages from SAT, bailing\n");
3364       break;
3365     }
3366
3367     result = result && pages_->PopRandom(&memregion_pe);
3368     if (!result) {
3369       logprintf(0, "Process Error: memory region thread failed to pop "
3370                 "pages from queue, bailing\n");
3371       break;
3372     }
3373
3374     // Error injection for CRC copy.
3375     if ((sat_->error_injection() || error_injection_) && loops == 1) {
3376       addr = reinterpret_cast<int64*>(source_pe.addr);
3377       offset = random() % (sat_->page_length() / wordsize_);
3378       data = addr[offset];
3379       addr[offset] = error_constant;
3380     }
3381
3382     // Copying SAT page into memory region.
3383     phase_ = kPhaseCopy;
3384     CrcCopyPage(&memregion_pe, &source_pe);
3385     memregion_pe.pattern = source_pe.pattern;
3386
3387     // Error injection for CRC Check.
3388     if ((sat_->error_injection() || error_injection_) && loops == 2) {
3389       addr = reinterpret_cast<int64*>(memregion_pe.addr);
3390       offset = random() % (sat_->page_length() / wordsize_);
3391       data = addr[offset];
3392       addr[offset] = error_constant;
3393     }
3394
3395     // Checking page content in memory region.
3396     phase_ = kPhaseCheck;
3397     CrcCheckPage(&memregion_pe);
3398
3399     phase_ = kPhaseNoPhase;
3400     // Storing pages on their proper queues.
3401     result = result && sat_->PutValid(&source_pe);
3402     if (!result) {
3403       logprintf(0, "Process Error: memory region thread failed to push "
3404                 "pages into SAT, bailing\n");
3405       break;
3406     }
3407     result = result && pages_->Push(&memregion_pe);
3408     if (!result) {
3409       logprintf(0, "Process Error: memory region thread failed to push "
3410                 "pages into queue, bailing\n");
3411       break;
3412     }
3413
3414     if ((sat_->error_injection() || error_injection_) &&
3415         loops >= 1 && loops <= 2) {
3416       addr[offset] = data;
3417     }
3418
3419     loops++;
3420     YieldSelf();
3421   }
3422
3423   pages_copied_ = loops;
3424   status_ = result;
3425   logprintf(9, "Log: Completed %d: Memory Region thread. Status %d, %d "
3426             "pages checked\n", thread_num_, status_, pages_copied_);
3427   return result;
3428 }
3429
3430 // The list of MSRs to read from each cpu.
3431 const CpuFreqThread::CpuRegisterType CpuFreqThread::kCpuRegisters[] = {
3432   { kMsrTscAddr, "TSC" },
3433   { kMsrAperfAddr, "APERF" },
3434   { kMsrMperfAddr, "MPERF" },
3435 };
3436
3437 CpuFreqThread::CpuFreqThread(int num_cpus, int freq_threshold, int round)
3438   : num_cpus_(num_cpus),
3439     freq_threshold_(freq_threshold),
3440     round_(round) {
3441   sat_assert(round >= 0);
3442   if (round == 0) {
3443     // If rounding is off, force rounding to the nearest MHz.
3444     round_ = 1;
3445     round_value_ = 0.5;
3446   } else {
3447     round_value_ = round/2.0;
3448   }
3449 }
3450
3451 CpuFreqThread::~CpuFreqThread() {
3452 }
3453
3454 // Compute the difference between the currently read MSR values and the
3455 // previously read values and store the results in delta. If any of the
3456 // values did not increase, or the TSC value is too small, returns false.
3457 // Otherwise, returns true.
3458 bool CpuFreqThread::ComputeDelta(CpuDataType *current, CpuDataType *previous,
3459                                  CpuDataType *delta) {
3460   // Loop through the msrs.
3461   for (int msr = 0; msr < kMsrLast; msr++) {
3462     if (previous->msrs[msr] > current->msrs[msr]) {
3463       logprintf(0, "Log: Register %s went backwards 0x%llx to 0x%llx "
3464                 "skipping interval\n", kCpuRegisters[msr], previous->msrs[msr],
3465                 current->msrs[msr]);
3466       return false;
3467     } else {
3468       delta->msrs[msr] = current->msrs[msr] - previous->msrs[msr];
3469     }
3470   }
3471
3472   // Check for TSC < 1 Mcycles over interval.
3473   if (delta->msrs[kMsrTsc] < (1000 * 1000)) {
3474     logprintf(0, "Log: Insanely slow TSC rate, TSC stops in idle?\n");
3475     return false;
3476   }
3477   timersub(&current->tv, &previous->tv, &delta->tv);
3478
3479   return true;
3480 }
3481
3482 // Compute the change in values of the MSRs between current and previous,
3483 // set the frequency in MHz of the cpu. If there is an error computing
3484 // the delta, return false. Othewise, return true.
3485 bool CpuFreqThread::ComputeFrequency(CpuDataType *current,
3486                                      CpuDataType *previous, int *freq) {
3487   CpuDataType delta;
3488   if (!ComputeDelta(current, previous, &delta)) {
3489     return false;
3490   }
3491
3492   double interval = delta.tv.tv_sec + delta.tv.tv_usec / 1000000.0;
3493   double frequency = 1.0 * delta.msrs[kMsrTsc] / 1000000
3494                      * delta.msrs[kMsrAperf] / delta.msrs[kMsrMperf] / interval;
3495
3496   // Use the rounding value to round up properly.
3497   int computed = static_cast<int>(frequency + round_value_);
3498   *freq = computed - (computed % round_);
3499   return true;
3500 }
3501
3502 // This is the task function that the thread executes.
3503 bool CpuFreqThread::Work() {
3504   cpu_set_t cpuset;
3505   if (!AvailableCpus(&cpuset)) {
3506     logprintf(0, "Process Error: Cannot get information about the cpus.\n");
3507     return false;
3508   }
3509
3510   // Start off indicating the test is passing.
3511   status_ = true;
3512
3513   int curr = 0;
3514   int prev = 1;
3515   uint32 num_intervals = 0;
3516   bool paused = false;
3517   bool valid;
3518   bool pass = true;
3519
3520   vector<CpuDataType> data[2];
3521   data[0].resize(num_cpus_);
3522   data[1].resize(num_cpus_);
3523   while (IsReadyToRun(&paused)) {
3524     if (paused) {
3525       // Reset the intervals and restart logic after the pause.
3526       num_intervals = 0;
3527     }
3528     if (num_intervals == 0) {
3529       // If this is the first interval, then always wait a bit before
3530       // starting to collect data.
3531       sat_sleep(kStartupDelay);
3532     }
3533
3534     // Get the per cpu counters.
3535     valid = true;
3536     for (int cpu = 0; cpu < num_cpus_; cpu++) {
3537       if (CPU_ISSET(cpu, &cpuset)) {
3538         if (!GetMsrs(cpu, &data[curr][cpu])) {
3539           logprintf(0, "Failed to get msrs on cpu %d.\n", cpu);
3540           valid = false;
3541           break;
3542         }
3543       }
3544     }
3545     if (!valid) {
3546       // Reset the number of collected intervals since something bad happened.
3547       num_intervals = 0;
3548       continue;
3549     }
3550
3551     num_intervals++;
3552
3553     // Only compute a delta when we have at least two intervals worth of data.
3554     if (num_intervals > 2) {
3555       for (int cpu = 0; cpu < num_cpus_; cpu++) {
3556         if (CPU_ISSET(cpu, &cpuset)) {
3557           int freq;
3558           if (!ComputeFrequency(&data[curr][cpu], &data[prev][cpu],
3559                                 &freq)) {
3560             // Reset the number of collected intervals since an unknown
3561             // error occurred.
3562             logprintf(0, "Log: Cannot get frequency of cpu %d.\n", cpu);
3563             num_intervals = 0;
3564             break;
3565           }
3566           logprintf(15, "Cpu %d Freq %d\n", cpu, freq);
3567           if (freq < freq_threshold_) {
3568             errorcount_++;
3569             pass = false;
3570             logprintf(0, "Log: Cpu %d frequency is too low, frequency %d MHz "
3571                       "threshold %d MHz.\n", cpu, freq, freq_threshold_);
3572           }
3573         }
3574       }
3575     }
3576
3577     sat_sleep(kIntervalPause);
3578
3579     // Swap the values in curr and prev (these values flip between 0 and 1).
3580     curr ^= 1;
3581     prev ^= 1;
3582   }
3583
3584   return pass;
3585 }
3586
3587
3588 // Get the MSR values for this particular cpu and save them in data. If
3589 // any error is encountered, returns false. Otherwise, returns true.
3590 bool CpuFreqThread::GetMsrs(int cpu, CpuDataType *data) {
3591   for (int msr = 0; msr < kMsrLast; msr++) {
3592     if (!os_->ReadMSR(cpu, kCpuRegisters[msr].msr, &data->msrs[msr])) {
3593       return false;
3594     }
3595   }
3596   // Save the time at which we acquired these values.
3597   gettimeofday(&data->tv, NULL);
3598
3599   return true;
3600 }
3601
3602 // Returns true if this test can run on the current machine. Otherwise,
3603 // returns false.
3604 bool CpuFreqThread::CanRun() {
3605 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
3606   unsigned int eax, ebx, ecx, edx;
3607
3608   // Check that the TSC feature is supported.
3609   // This check is valid for both Intel and AMD.
3610   eax = 1;
3611   cpuid(&eax, &ebx, &ecx, &edx);
3612   if (!(edx & (1 << 5))) {
3613     logprintf(0, "Process Error: No TSC support.\n");
3614     return false;
3615   }
3616
3617   // Check the highest extended function level supported.
3618   // This check is valid for both Intel and AMD.
3619   eax = 0x80000000;
3620   cpuid(&eax, &ebx, &ecx, &edx);
3621   if (eax < 0x80000007) {
3622     logprintf(0, "Process Error: No invariant TSC support.\n");
3623     return false;
3624   }
3625
3626   // Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
3627   // This check is valid for both Intel and AMD.
3628   eax = 0x80000007;
3629   cpuid(&eax, &ebx, &ecx, &edx);
3630   if ((edx & (1 << 8)) == 0) {
3631     logprintf(0, "Process Error: No non-stop TSC support.\n");
3632     return false;
3633   }
3634
3635   // APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
3636   // This check is valid for both Intel and AMD.
3637   eax = 0x6;
3638   cpuid(&eax, &ebx, &ecx, &edx);
3639   if ((ecx & 1) == 0) {
3640     logprintf(0, "Process Error: No APERF MSR support.\n");
3641     return false;
3642   }
3643   return true;
3644 #else
3645   logprintf(0, "Process Error: "
3646                "cpu_freq_test is only supported on X86 processors.\n");
3647   return false;
3648 #endif
3649 }