chiark - git - ian - stressapptest/blob - src/sat.cc

   1 // Copyright 2006 Google Inc. All Rights Reserved.
   2
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 // sat.cc : a stress test for stressful testing
  16
  17 // stressapptest (or SAT, from Stressful Application Test) is a test
  18 // designed to stress the system, as well as provide a comprehensive
  19 // memory interface test.
  20
  21 // stressapptest can be run using memory only, or using many system components.
  22
  23 #include <errno.h>
  24 #include <pthread.h>
  25 #include <signal.h>
  26 #include <stdarg.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31
  32 #include <sys/stat.h>
  33 #include <sys/times.h>
  34
  35 // #define __USE_GNU
  36 // #define __USE_LARGEFILE64
  37 #include <fcntl.h>
  38
  39 #include <list>
  40 #include <string>
  41
  42 // This file must work with autoconf on its public version,
  43 // so these includes are correct.
  44 #include "disk_blocks.h"
  45 #include "logger.h"
  46 #include "os.h"
  47 #include "sat.h"
  48 #include "sattypes.h"
  49 #include "worker.h"
  50
  51 // stressapptest versioning here.
  52 #ifndef PACKAGE_VERSION
  53 static const char* kVersion = "1.0.0";
  54 #else
  55 static const char* kVersion = PACKAGE_VERSION;
  56 #endif
  57
  58 // Global stressapptest reference, for use by signal handler.
  59 // This makes Sat objects not safe for multiple instances.
  60 namespace {
  61   Sat *g_sat = NULL;
  62
  63   // Signal handler for catching break or kill.
  64   //
  65   // This must be installed after g_sat is assigned and while there is a single
  66   // thread.
  67   //
  68   // This must be uninstalled while there is only a single thread, and of course
  69   // before g_sat is cleared or deleted.
  70   void SatHandleBreak(int signal) {
  71     g_sat->Break();
  72   }
  73 }
  74
  75 // Opens the logfile for writing if necessary
  76 bool Sat::InitializeLogfile() {
  77   // Open logfile.
  78   if (use_logfile_) {
  79     logfile_ = open(logfilename_,
  80 #if defined(O_DSYNC)
  81                     O_DSYNC |
  82 #elif defined(O_SYNC)
  83                     O_SYNC |
  84 #elif defined(O_FSYNC)
  85                     O_FSYNC |
  86 #endif
  87                     O_WRONLY | O_CREAT,
  88                     S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
  89     if (logfile_ < 0) {
  90       printf("Fatal Error: cannot open file %s for logging\n",
  91              logfilename_);
  92       bad_status();
  93       return false;
  94     }
  95     // We seek to the end once instead of opening in append mode because no
  96     // other processes should be writing to it while this one exists.
  97     if (lseek(logfile_, 0, SEEK_END) == -1) {
  98       printf("Fatal Error: cannot seek to end of logfile (%s)\n",
  99              logfilename_);
 100       bad_status();
 101       return false;
 102     }
 103     Logger::GlobalLogger()->SetLogFd(logfile_);
 104   }
 105   return true;
 106 }
 107
 108 // Check that the environment is known and safe to run on.
 109 // Return 1 if good, 0 if unsuppported.
 110 bool Sat::CheckEnvironment() {
 111   // Check that this is not a debug build. Debug builds lack
 112   // enough performance to stress the system.
 113 #if !defined NDEBUG
 114   if (run_on_anything_) {
 115     logprintf(1, "Log: Running DEBUG version of SAT, "
 116                  "with significantly reduced coverage.\n");
 117   } else {
 118     logprintf(0, "Process Error: Running DEBUG version of SAT, "
 119                  "with significantly reduced coverage.\n");
 120     logprintf(0, "Log: Command line option '-A' bypasses this error.\n");
 121     bad_status();
 122     return false;
 123   }
 124 #elif !defined CHECKOPTS
 125   #error Build system regression - COPTS disregarded.
 126 #endif
 127
 128   // Use all CPUs if nothing is specified.
 129   if (memory_threads_ == -1) {
 130     memory_threads_ = os_->num_cpus();
 131     logprintf(7, "Log: Defaulting to %d copy threads\n", memory_threads_);
 132   }
 133
 134   // Use all memory if no size is specified.
 135   if (size_mb_ == 0)
 136     size_mb_ = os_->FindFreeMemSize() / kMegabyte;
 137   size_ = static_cast<int64>(size_mb_) * kMegabyte;
 138
 139   // Autodetect file locations.
 140   if (findfiles_ && (file_threads_ == 0)) {
 141     // Get a space separated sting of disk locations.
 142     list<string> locations = os_->FindFileDevices();
 143
 144     // Extract each one.
 145     while (!locations.empty()) {
 146       // Copy and remove the disk name.
 147       string disk = locations.back();
 148       locations.pop_back();
 149
 150       logprintf(12, "Log: disk at %s\n", disk.c_str());
 151       file_threads_++;
 152       filename_.push_back(disk + "/sat_disk.a");
 153       file_threads_++;
 154       filename_.push_back(disk + "/sat_disk.b");
 155     }
 156   }
 157
 158   // We'd better have some memory by this point.
 159   if (size_ < 1) {
 160     logprintf(0, "Process Error: No memory found to test.\n");
 161     bad_status();
 162     return false;
 163   }
 164
 165   if (tag_mode_ && ((file_threads_ > 0) ||
 166                     (disk_threads_ > 0) ||
 167                     (net_threads_ > 0))) {
 168     logprintf(0, "Process Error: Memory tag mode incompatible "
 169                  "with disk/network DMA.\n");
 170     bad_status();
 171     return false;
 172   }
 173
 174   // If platform is 32 bit Xeon, floor memory size to multiple of 4.
 175   if (address_mode_ == 32) {
 176     size_mb_ = (size_mb_ / 4) * 4;
 177     size_ = size_mb_ * kMegabyte;
 178     logprintf(1, "Log: Flooring memory allocation to multiple of 4: %lldMB\n",
 179               size_mb_);
 180   }
 181
 182   // Check if this system is on the whitelist for supported systems.
 183   if (!os_->IsSupported()) {
 184     if (run_on_anything_) {
 185       logprintf(1, "Log: Unsupported system. Running with reduced coverage.\n");
 186       // This is ok, continue on.
 187     } else {
 188       logprintf(0, "Process Error: Unsupported system, "
 189                    "no error reporting available\n");
 190       logprintf(0, "Log: Command line option '-A' bypasses this error.\n");
 191       bad_status();
 192       return false;
 193     }
 194   }
 195
 196   return true;
 197 }
 198
 199 // Allocates memory to run the test on
 200 bool Sat::AllocateMemory() {
 201   // Allocate our test memory.
 202   bool result = os_->AllocateTestMem(size_, paddr_base_);
 203   if (!result) {
 204     logprintf(0, "Process Error: failed to allocate memory\n");
 205     bad_status();
 206     return false;
 207   }
 208   return true;
 209 }
 210
 211 // Sets up access to data patterns
 212 bool Sat::InitializePatterns() {
 213   // Initialize pattern data.
 214   patternlist_ = new PatternList();
 215   if (!patternlist_) {
 216     logprintf(0, "Process Error: failed to allocate patterns\n");
 217     bad_status();
 218     return false;
 219   }
 220   if (!patternlist_->Initialize()) {
 221     logprintf(0, "Process Error: failed to initialize patternlist\n");
 222     bad_status();
 223     return false;
 224   }
 225   return true;
 226 }
 227
 228 // Get any valid page, no tag specified.
 229 bool Sat::GetValid(struct page_entry *pe) {
 230   return GetValid(pe, kDontCareTag);
 231 }
 232
 233
 234 // Fetch and return empty and full pages into the empty and full pools.
 235 bool Sat::GetValid(struct page_entry *pe, int32 tag) {
 236   bool result = false;
 237   // Get valid page depending on implementation.
 238   if (pe_q_implementation_ == SAT_FINELOCK)
 239     result = finelock_q_->GetValid(pe, tag);
 240   else if (pe_q_implementation_ == SAT_ONELOCK)
 241     result = valid_->PopRandom(pe);
 242
 243   if (result) {
 244     pe->addr = os_->PrepareTestMem(pe->offset, page_length_);  // Map it.
 245
 246     // Tag this access and current pattern.
 247     pe->ts = os_->GetTimestamp();
 248     pe->lastpattern = pe->pattern;
 249
 250     return (pe->addr != 0);     // Return success or failure.
 251   }
 252   return false;
 253 }
 254
 255 bool Sat::PutValid(struct page_entry *pe) {
 256   if (pe->addr != 0)
 257     os_->ReleaseTestMem(pe->addr, pe->offset, page_length_);  // Unmap the page.
 258   pe->addr = 0;
 259
 260   // Put valid page depending on implementation.
 261   if (pe_q_implementation_ == SAT_FINELOCK)
 262     return finelock_q_->PutValid(pe);
 263   else if (pe_q_implementation_ == SAT_ONELOCK)
 264     return valid_->Push(pe);
 265   else
 266     return false;
 267 }
 268
 269 // Get an empty page with any tag.
 270 bool Sat::GetEmpty(struct page_entry *pe) {
 271   return GetEmpty(pe, kDontCareTag);
 272 }
 273
 274 bool Sat::GetEmpty(struct page_entry *pe, int32 tag) {
 275   bool result = false;
 276   // Get empty page depending on implementation.
 277   if (pe_q_implementation_ == SAT_FINELOCK)
 278     result = finelock_q_->GetEmpty(pe, tag);
 279   else if (pe_q_implementation_ == SAT_ONELOCK)
 280     result = empty_->PopRandom(pe);
 281
 282   if (result) {
 283     pe->addr = os_->PrepareTestMem(pe->offset, page_length_);  // Map it.
 284     return (pe->addr != 0);     // Return success or failure.
 285   }
 286   return false;
 287 }
 288
 289 bool Sat::PutEmpty(struct page_entry *pe) {
 290   if (pe->addr != 0)
 291     os_->ReleaseTestMem(pe->addr, pe->offset, page_length_);  // Unmap the page.
 292   pe->addr = 0;
 293
 294   // Put empty page depending on implementation.
 295   if (pe_q_implementation_ == SAT_FINELOCK)
 296     return finelock_q_->PutEmpty(pe);
 297   else if (pe_q_implementation_ == SAT_ONELOCK)
 298     return empty_->Push(pe);
 299   else
 300     return false;
 301 }
 302
 303 // Set up the bitmap of physical pages in case we want to see which pages were
 304 // accessed under this run of SAT.
 305 void Sat::AddrMapInit() {
 306   if (!do_page_map_)
 307     return;
 308   // Find about how much physical mem is in the system.
 309   // TODO(nsanders): Find some way to get the max
 310   // and min phys addr in the system.
 311   uint64 maxsize = os_->FindFreeMemSize() * 4;
 312   sat_assert(maxsize != 0);
 313
 314   // Make a bitmask of this many pages. Assume that the memory is relatively
 315   // zero based. This is true on x86, typically.
 316   // This is one bit per page.
 317   uint64 arraysize = maxsize / 4096 / 8;
 318   unsigned char *bitmap = new unsigned char[arraysize];
 319   sat_assert(bitmap);
 320
 321   // Mark every page as 0, not seen.
 322   memset(bitmap, 0, arraysize);
 323
 324   page_bitmap_size_ = maxsize;
 325   page_bitmap_ = bitmap;
 326 }
 327
 328 // Add the 4k pages in this block to the array of pages SAT has seen.
 329 void Sat::AddrMapUpdate(struct page_entry *pe) {
 330   if (!do_page_map_)
 331     return;
 332
 333   // Go through 4k page blocks.
 334   uint64 arraysize = page_bitmap_size_ / 4096 / 8;
 335
 336   char *base = reinterpret_cast<char*>(pe->addr);
 337   for (int i = 0; i < page_length_; i += 4096) {
 338     uint64 paddr = os_->VirtualToPhysical(base + i);
 339
 340     uint32 offset = paddr / 4096 / 8;
 341     unsigned char mask = 1 << ((paddr / 4096) % 8);
 342
 343     if (offset >= arraysize) {
 344       logprintf(0, "Process Error: Physical address %#llx is "
 345                    "greater than expected %#llx.\n",
 346                 paddr, page_bitmap_size_);
 347       sat_assert(0);
 348     }
 349     page_bitmap_[offset] |= mask;
 350   }
 351 }
 352
 353 // Print out the physical memory ranges that SAT has accessed.
 354 void Sat::AddrMapPrint() {
 355   if (!do_page_map_)
 356     return;
 357
 358   uint64 pages = page_bitmap_size_ / 4096;
 359
 360   uint64 last_page = 0;
 361   bool valid_range = false;
 362
 363   logprintf(4, "Log: Printing tested physical ranges.\n");
 364
 365   for (uint64 i = 0; i < pages; i ++) {
 366     int offset = i / 8;
 367     unsigned char mask = 1 << (i % 8);
 368
 369     bool touched = page_bitmap_[offset] & mask;
 370     if (touched && !valid_range) {
 371       valid_range = true;
 372       last_page = i * 4096;
 373     } else if (!touched && valid_range) {
 374       valid_range = false;
 375       logprintf(4, "Log: %#016llx - %#016llx\n", last_page, (i * 4096) - 1);
 376     }
 377   }
 378   logprintf(4, "Log: Done printing physical ranges.\n");
 379 }
 380
 381 // Initializes page lists and fills pages with data patterns.
 382 bool Sat::InitializePages() {
 383   int result = 1;
 384   // Calculate needed page totals.
 385   int64 neededpages = memory_threads_ +
 386     invert_threads_ +
 387     check_threads_ +
 388     net_threads_ +
 389     file_threads_;
 390
 391   // Empty-valid page ratio is adjusted depending on queue implementation.
 392   // since fine-grain-locked queue keeps both valid and empty entries in the
 393   // same queue and randomly traverse to find pages, the empty-valid ratio
 394   // should be more even.
 395   if (pe_q_implementation_ == SAT_FINELOCK)
 396     freepages_ = pages_ / 5 * 2;  // Mark roughly 2/5 of all pages as Empty.
 397   else
 398     freepages_ = (pages_ / 100) + (2 * neededpages);
 399
 400   if (freepages_ < neededpages) {
 401     logprintf(0, "Process Error: freepages < neededpages.\n");
 402     logprintf(1, "Stats: Total: %lld, Needed: %lld, Marked free: %lld\n",
 403               static_cast<int64>(pages_),
 404               static_cast<int64>(neededpages),
 405               static_cast<int64>(freepages_));
 406     bad_status();
 407     return false;
 408   }
 409
 410   if (freepages_ >  pages_/2) {
 411     logprintf(0, "Process Error: not enough pages for IO\n");
 412     logprintf(1, "Stats: Total: %lld, Needed: %lld, Available: %lld\n",
 413               static_cast<int64>(pages_),
 414               static_cast<int64>(freepages_),
 415               static_cast<int64>(pages_/2));
 416     bad_status();
 417     return false;
 418   }
 419   logprintf(12, "Log: Allocating pages, Total: %lld Free: %lld\n",
 420             pages_,
 421             freepages_);
 422
 423   // Initialize page locations.
 424   for (int64 i = 0; i < pages_; i++) {
 425     struct page_entry pe;
 426     init_pe(&pe);
 427     pe.offset = i * page_length_;
 428     result &= PutEmpty(&pe);
 429   }
 430
 431   if (!result) {
 432     logprintf(0, "Process Error: while initializing empty_ list\n");
 433     bad_status();
 434     return false;
 435   }
 436
 437   // Fill valid pages with test patterns.
 438   // Use fill threads to do this.
 439   WorkerStatus fill_status;
 440   WorkerVector fill_vector;
 441
 442   logprintf(12, "Starting Fill threads: %d threads, %d pages\n",
 443             fill_threads_, pages_);
 444   // Initialize the fill threads.
 445   for (int i = 0; i < fill_threads_; i++) {
 446     FillThread *thread = new FillThread();
 447     thread->InitThread(i, this, os_, patternlist_, &fill_status);
 448     if (i != fill_threads_ - 1) {
 449         logprintf(12, "Starting Fill Threads %d: %d pages\n",
 450                   i, pages_ / fill_threads_);
 451         thread->SetFillPages(pages_ / fill_threads_);
 452       // The last thread finishes up all the leftover pages.
 453     } else {
 454       logprintf(12, "Starting Fill Threads %d: %d pages\n",
 455                 i, pages_ - pages_ / fill_threads_ * i);
 456         thread->SetFillPages(pages_ - pages_ / fill_threads_ * i);
 457     }
 458     fill_vector.push_back(thread);
 459   }
 460
 461   // Spawn the fill threads.
 462   fill_status.Initialize();
 463   for (WorkerVector::const_iterator it = fill_vector.begin();
 464        it != fill_vector.end(); ++it)
 465     (*it)->SpawnThread();
 466
 467   // Reap the finished fill threads.
 468   for (WorkerVector::const_iterator it = fill_vector.begin();
 469        it != fill_vector.end(); ++it) {
 470     (*it)->JoinThread();
 471     if ((*it)->GetStatus() != 1) {
 472       logprintf(0, "Thread %d failed with status %d at %.2f seconds\n",
 473                 (*it)->ThreadID(), (*it)->GetStatus(),
 474                 (*it)->GetRunDurationUSec() * 1.0/1000000);
 475       bad_status();
 476       return false;
 477     }
 478     delete (*it);
 479   }
 480   fill_vector.clear();
 481   fill_status.Destroy();
 482   logprintf(12, "Log: Done filling pages.\n");
 483   logprintf(12, "Log: Allocating pages.\n");
 484
 485   AddrMapInit();
 486
 487   // Initialize page locations.
 488   for (int64 i = 0; i < pages_; i++) {
 489     struct page_entry pe;
 490     // Only get valid pages with uninitialized tags here.
 491     if (GetValid(&pe, kInvalidTag)) {
 492       int64 paddr = os_->VirtualToPhysical(pe.addr);
 493       int32 region = os_->FindRegion(paddr);
 494
 495       if (i < 256) {
 496         char buf[256];
 497         os_->FindDimm(paddr, buf, sizeof(buf));
 498         logprintf(12, "Log: address: %#llx, %s\n", paddr, buf);
 499       }
 500       region_[region]++;
 501       pe.paddr = paddr;
 502       pe.tag = 1 << region;
 503       region_mask_ |= pe.tag;
 504
 505       // Generate a physical region map
 506       AddrMapUpdate(&pe);
 507
 508       // Note: this does not allocate free pages among all regions
 509       // fairly. However, with large enough (thousands) random number
 510       // of pages being marked free in each region, the free pages
 511       // count in each region end up pretty balanced.
 512       if (i < freepages_) {
 513         result &= PutEmpty(&pe);
 514       } else {
 515         result &= PutValid(&pe);
 516       }
 517     } else {
 518       logprintf(0, "Log: didn't tag all pages. %d - %d = %d\n",
 519                 pages_, i, pages_ - i);
 520       return false;
 521     }
 522   }
 523   logprintf(12, "Log: Done allocating pages.\n");
 524
 525   AddrMapPrint();
 526
 527   for (int i = 0; i < 32; i++) {
 528     if (region_mask_ & (1 << i)) {
 529       region_count_++;
 530       logprintf(12, "Log: Region %d: %d.\n", i, region_[i]);
 531     }
 532   }
 533   logprintf(5, "Log: Region mask: 0x%x\n", region_mask_);
 534
 535   return true;
 536 }
 537
 538 // Print SAT version info.
 539 bool Sat::PrintVersion() {
 540   logprintf(1, "Stats: SAT revision %s, %d bit binary\n",
 541             kVersion, address_mode_);
 542   logprintf(5, "Log: %s from %s\n", Timestamp(), BuildChangelist());
 543
 544   return true;
 545 }
 546
 547
 548 // Initializes the resources that SAT needs to run.
 549 // This needs to be called before Run(), and after ParseArgs().
 550 // Returns true on success, false on error, and will exit() on help message.
 551 bool Sat::Initialize() {
 552   g_sat = this;
 553
 554   // Initializes sync'd log file to ensure output is saved.
 555   if (!InitializeLogfile())
 556     return false;
 557   Logger::GlobalLogger()->StartThread();
 558
 559   logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str());
 560   PrintVersion();
 561
 562   std::map<std::string, std::string> options;
 563
 564   GoogleOsOptions(&options);
 565
 566   // Initialize OS/Hardware interface.
 567   os_ = OsLayerFactory(options);
 568   if (!os_) {
 569     bad_status();
 570     return false;
 571   }
 572
 573   if (min_hugepages_mbytes_ > 0)
 574     os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte);
 575   if (modules_.size() > 0) {
 576     logprintf(6, "Log: Decoding memory: %dx%d bit channels,"
 577         " %d byte burst size, %d modules per channel (x%d)\n",
 578         modules_.size(), channel_width_, interleave_size_, modules_[0].size(),
 579         channel_width_/modules_[0].size());
 580     os_->SetDramMappingParams(interleave_size_, channel_width_, &modules_);
 581   }
 582
 583   if (!os_->Initialize()) {
 584     logprintf(0, "Process Error: Failed to initialize OS layer\n");
 585     bad_status();
 586     delete os_;
 587     return false;
 588   }
 589
 590   // Checks that OS/Build/Platform is supported.
 591   if (!CheckEnvironment())
 592     return false;
 593
 594   if (error_injection_)
 595     os_->set_error_injection(true);
 596
 597   // Run SAT in monitor only mode, do not continue to allocate resources.
 598   if (monitor_mode_) {
 599     logprintf(5, "Log: Running in monitor-only mode. "
 600                  "Will not allocate any memory nor run any stress test. "
 601                  "Only polling ECC errors.\n");
 602     return true;
 603   }
 604
 605   // Allocate the memory to test.
 606   if (!AllocateMemory())
 607     return false;
 608
 609   logprintf(5, "Stats: Starting SAT, %dM, %d seconds\n",
 610             static_cast<int>(size_/kMegabyte),
 611             runtime_seconds_);
 612
 613   if (!InitializePatterns())
 614     return false;
 615
 616   // Initialize memory allocation.
 617   pages_ = size_ / page_length_;
 618
 619   // Allocate page queue depending on queue implementation switch.
 620   if (pe_q_implementation_ == SAT_FINELOCK) {
 621       finelock_q_ = new FineLockPEQueue(pages_, page_length_);
 622       if (finelock_q_ == NULL)
 623         return false;
 624       finelock_q_->set_os(os_);
 625       os_->set_err_log_callback(finelock_q_->get_err_log_callback());
 626   } else if (pe_q_implementation_ == SAT_ONELOCK) {
 627       empty_ = new PageEntryQueue(pages_);
 628       valid_ = new PageEntryQueue(pages_);
 629       if ((empty_ == NULL) || (valid_ == NULL))
 630         return false;
 631   }
 632
 633   if (!InitializePages()) {
 634     logprintf(0, "Process Error: Initialize Pages failed\n");
 635     return false;
 636   }
 637
 638   return true;
 639 }
 640
 641 // Constructor and destructor.
 642 Sat::Sat() {
 643   // Set defaults, command line might override these.
 644   runtime_seconds_ = 20;
 645   page_length_ = kSatPageSize;
 646   disk_pages_ = kSatDiskPage;
 647   pages_ = 0;
 648   size_mb_ = 0;
 649   size_ = size_mb_ * kMegabyte;
 650   min_hugepages_mbytes_ = 0;
 651   freepages_ = 0;
 652   paddr_base_ = 0;
 653   interleave_size_ = kCacheLineSize;
 654   channel_width_ = 64;
 655
 656   user_break_ = false;
 657   verbosity_ = 8;
 658   Logger::GlobalLogger()->SetVerbosity(verbosity_);
 659   strict_ = 1;
 660   warm_ = 0;
 661   run_on_anything_ = 0;
 662   use_logfile_ = 0;
 663   logfile_ = 0;
 664   // Detect 32/64 bit binary.
 665   void *pvoid = 0;
 666   address_mode_ = sizeof(pvoid) * 8;
 667   error_injection_ = false;
 668   crazy_error_injection_ = false;
 669   max_errorcount_ = 0;  // Zero means no early exit.
 670   stop_on_error_ = false;
 671   error_poll_ = true;
 672   findfiles_ = false;
 673
 674   do_page_map_ = false;
 675   page_bitmap_ = 0;
 676   page_bitmap_size_ = 0;
 677
 678   // Cache coherency data initialization.
 679   cc_test_ = false;         // Flag to trigger cc threads.
 680   cc_cacheline_count_ = 2;  // Two datastructures of cache line size.
 681   cc_inc_count_ = 1000;     // Number of times to increment the shared variable.
 682   cc_cacheline_data_ = 0;   // Cache Line size datastructure.
 683
 684   sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL));
 685   file_threads_ = 0;
 686   net_threads_ = 0;
 687   listen_threads_ = 0;
 688   // Default to autodetect number of cpus, and run that many threads.
 689   memory_threads_ = -1;
 690   invert_threads_ = 0;
 691   fill_threads_ = 8;
 692   check_threads_ = 0;
 693   cpu_stress_threads_ = 0;
 694   disk_threads_ = 0;
 695   total_threads_ = 0;
 696
 697   region_mask_ = 0;
 698   region_count_ = 0;
 699   for (int i = 0; i < 32; i++) {
 700     region_[i] = 0;
 701   }
 702   region_mode_ = 0;
 703
 704   errorcount_ = 0;
 705   statuscount_ = 0;
 706
 707   valid_ = 0;
 708   empty_ = 0;
 709   finelock_q_ = 0;
 710   // Default to use fine-grain lock for better performance.
 711   pe_q_implementation_ = SAT_FINELOCK;
 712
 713   os_ = 0;
 714   patternlist_ = 0;
 715   logfilename_[0] = 0;
 716
 717   read_block_size_ = 512;
 718   write_block_size_ = -1;
 719   segment_size_ = -1;
 720   cache_size_ = -1;
 721   blocks_per_segment_ = -1;
 722   read_threshold_ = -1;
 723   write_threshold_ = -1;
 724   non_destructive_ = 1;
 725   monitor_mode_ = 0;
 726   tag_mode_ = 0;
 727   random_threads_ = 0;
 728
 729   pause_delay_ = 600;
 730   pause_duration_ = 15;
 731 }
 732
 733 // Destructor.
 734 Sat::~Sat() {
 735   // We need to have called Cleanup() at this point.
 736   // We should probably enforce this.
 737 }
 738
 739
 740 #define ARG_KVALUE(argument, variable, value)         \
 741   if (!strcmp(argv[i], argument)) {                   \
 742     variable = value;                                 \
 743     continue;                                         \
 744   }
 745
 746 #define ARG_IVALUE(argument, variable)                \
 747   if (!strcmp(argv[i], argument)) {                   \
 748     i++;                                              \
 749     if (i < argc)                                     \
 750       variable = strtoull(argv[i], NULL, 0);          \
 751     continue;                                         \
 752   }
 753
 754 #define ARG_SVALUE(argument, variable)                     \
 755   if (!strcmp(argv[i], argument)) {                        \
 756     i++;                                                   \
 757     if (i < argc)                                          \
 758       snprintf(variable, sizeof(variable), "%s", argv[i]); \
 759     continue;                                              \
 760   }
 761
 762 // Configures SAT from command line arguments.
 763 // This will call exit() given a request for
 764 // self-documentation or unexpected args.
 765 bool Sat::ParseArgs(int argc, char **argv) {
 766   int i;
 767   uint64 filesize = page_length_ * disk_pages_;
 768
 769   // Parse each argument.
 770   for (i = 1; i < argc; i++) {
 771     // Switch to fall back to corase-grain-lock queue. (for benchmarking)
 772     ARG_KVALUE("--coarse_grain_lock", pe_q_implementation_, SAT_ONELOCK);
 773
 774     // Set number of megabyte to use.
 775     ARG_IVALUE("-M", size_mb_);
 776
 777     // Set minimum megabytes of hugepages to require.
 778     ARG_IVALUE("-H", min_hugepages_mbytes_);
 779
 780     // Set number of seconds to run.
 781     ARG_IVALUE("-s", runtime_seconds_);
 782
 783     // Set number of memory copy threads.
 784     ARG_IVALUE("-m", memory_threads_);
 785
 786     // Set number of memory invert threads.
 787     ARG_IVALUE("-i", invert_threads_);
 788
 789     // Set number of check-only threads.
 790     ARG_IVALUE("-c", check_threads_);
 791
 792     // Set number of cache line size datastructures.
 793     ARG_IVALUE("--cc_inc_count", cc_inc_count_);
 794
 795     // Set number of cache line size datastructures
 796     ARG_IVALUE("--cc_line_count", cc_cacheline_count_);
 797
 798     // Flag set when cache coherency tests need to be run
 799     ARG_KVALUE("--cc_test", cc_test_, 1);
 800
 801     // Set number of CPU stress threads.
 802     ARG_IVALUE("-C", cpu_stress_threads_);
 803
 804     // Set logfile name.
 805     ARG_SVALUE("-l", logfilename_);
 806
 807     // Verbosity level.
 808     ARG_IVALUE("-v", verbosity_);
 809
 810     // Set maximum number of errors to collect. Stop running after this many.
 811     ARG_IVALUE("--max_errors", max_errorcount_);
 812
 813     // Set pattern block size.
 814     ARG_IVALUE("-p", page_length_);
 815
 816     // Set pattern block size.
 817     ARG_IVALUE("--filesize", filesize);
 818
 819     // NUMA options.
 820     ARG_KVALUE("--local_numa", region_mode_, kLocalNuma);
 821     ARG_KVALUE("--remote_numa", region_mode_, kRemoteNuma);
 822
 823     // Autodetect tempfile locations.
 824     ARG_KVALUE("--findfiles", findfiles_, 1);
 825
 826     // Inject errors to force miscompare code paths
 827     ARG_KVALUE("--force_errors", error_injection_, true);
 828     ARG_KVALUE("--force_errors_like_crazy", crazy_error_injection_, true);
 829     if (crazy_error_injection_)
 830       error_injection_ = true;
 831
 832     // Stop immediately on any arror, for debugging HW problems.
 833     ARG_KVALUE("--stop_on_errors", stop_on_error_, 1);
 834
 835     // Don't use internal error polling, allow external detection.
 836     ARG_KVALUE("--no_errors", error_poll_, 0);
 837
 838     // Never check data as you go.
 839     ARG_KVALUE("-F", strict_, 0);
 840
 841     // Warm the cpu as you go.
 842     ARG_KVALUE("-W", warm_, 1);
 843
 844     // Allow runnign on unknown systems with base unimplemented OsLayer
 845     ARG_KVALUE("-A", run_on_anything_, 1);
 846
 847     // Size of read blocks for disk test.
 848     ARG_IVALUE("--read-block-size", read_block_size_);
 849
 850     // Size of write blocks for disk test.
 851     ARG_IVALUE("--write-block-size", write_block_size_);
 852
 853     // Size of segment for disk test.
 854     ARG_IVALUE("--segment-size", segment_size_);
 855
 856     // Size of disk cache size for disk test.
 857     ARG_IVALUE("--cache-size", cache_size_);
 858
 859     // Number of blocks to test per segment.
 860     ARG_IVALUE("--blocks-per-segment", blocks_per_segment_);
 861
 862     // Maximum time a block read should take before warning.
 863     ARG_IVALUE("--read-threshold", read_threshold_);
 864
 865     // Maximum time a block write should take before warning.
 866     ARG_IVALUE("--write-threshold", write_threshold_);
 867
 868     // Do not write anything to disk in the disk test.
 869     ARG_KVALUE("--destructive", non_destructive_, 0);
 870
 871     // Run SAT in monitor mode. No test load at all.
 872     ARG_KVALUE("--monitor_mode", monitor_mode_, true);
 873
 874     // Run SAT in address mode. Tag all cachelines by virt addr.
 875     ARG_KVALUE("--tag_mode", tag_mode_, true);
 876
 877     // Dump range map of tested pages..
 878     ARG_KVALUE("--do_page_map", do_page_map_, true);
 879
 880     // Specify the physical address base to test.
 881     ARG_IVALUE("--paddr_base", paddr_base_);
 882
 883     // Specify the frequency for power spikes.
 884     ARG_IVALUE("--pause_delay", pause_delay_);
 885
 886     // Specify the duration of each pause (for power spikes).
 887     ARG_IVALUE("--pause_duration", pause_duration_);
 888
 889     // Disk device names
 890     if (!strcmp(argv[i], "-d")) {
 891       i++;
 892       if (i < argc) {
 893         disk_threads_++;
 894         diskfilename_.push_back(string(argv[i]));
 895         blocktables_.push_back(new DiskBlockTable());
 896       }
 897       continue;
 898     }
 899
 900     // Set number of disk random threads for each disk write thread.
 901     ARG_IVALUE("--random-threads", random_threads_);
 902
 903     // Set a tempfile to use in a file thread.
 904     if (!strcmp(argv[i], "-f")) {
 905       i++;
 906       if (i < argc) {
 907         file_threads_++;
 908         filename_.push_back(string(argv[i]));
 909       }
 910       continue;
 911     }
 912
 913     // Set a hostname to use in a network thread.
 914     if (!strcmp(argv[i], "-n")) {
 915       i++;
 916       if (i < argc) {
 917         net_threads_++;
 918         ipaddrs_.push_back(string(argv[i]));
 919       }
 920       continue;
 921     }
 922
 923     // Run threads that listen for incoming SAT net connections.
 924     ARG_KVALUE("--listen", listen_threads_, 1);
 925
 926     if (CheckGoogleSpecificArgs(argc, argv, &i)) {
 927       continue;
 928     }
 929
 930     ARG_IVALUE("--interleave_size", interleave_size_);
 931     ARG_IVALUE("--channel_width", channel_width_);
 932
 933     if (!strcmp(argv[i], "--memory_channel")) {
 934       i++;
 935       if (i < argc) {
 936         char *module = argv[i];
 937         modules_.push_back(vector<string>());
 938         while (char* next = strchr(module, ',')) {
 939           modules_.back().push_back(string(module, next - module));
 940           module = next + 1;
 941         }
 942         modules_.back().push_back(string(module));
 943       }
 944       continue;
 945     }
 946
 947     // Default:
 948     PrintVersion();
 949     PrintHelp();
 950     if (strcmp(argv[i], "-h") && strcmp(argv[i], "--help")) {
 951       printf("\n Unknown argument %s\n", argv[i]);
 952       bad_status();
 953       exit(1);
 954     }
 955     // Forget it, we printed the help, just bail.
 956     // We don't want to print test status, or any log parser stuff.
 957     exit(0);
 958   }
 959
 960   Logger::GlobalLogger()->SetVerbosity(verbosity_);
 961
 962   // Update relevant data members with parsed input.
 963   // Translate MB into bytes.
 964   size_ = static_cast<int64>(size_mb_) * kMegabyte;
 965
 966   // Set logfile flag.
 967   if (strcmp(logfilename_, ""))
 968     use_logfile_ = 1;
 969   // Checks valid page length.
 970   if (page_length_ &&
 971       !(page_length_ & (page_length_ - 1)) &&
 972       (page_length_ > 1023)) {
 973     // Prints if we have changed from default.
 974     if (page_length_ != kSatPageSize)
 975       logprintf(12, "Log: Updating page size to %d\n", page_length_);
 976   } else {
 977     // Revert to default page length.
 978     logprintf(6, "Process Error: "
 979               "Invalid page size %d\n", page_length_);
 980     page_length_ = kSatPageSize;
 981     return false;
 982   }
 983
 984   // Set disk_pages_ if filesize or page size changed.
 985   if (filesize != static_cast<uint64>(page_length_) *
 986                   static_cast<uint64>(disk_pages_)) {
 987     disk_pages_ = filesize / page_length_;
 988     if (disk_pages_ == 0)
 989       disk_pages_ = 1;
 990   }
 991
 992   // Validate memory channel parameters if supplied
 993   if (modules_.size()) {
 994     if (interleave_size_ <= 0 ||
 995         interleave_size_ & (interleave_size_ - 1)) {
 996       logprintf(6, "Process Error: "
 997           "Interleave size %d is not a power of 2.\n", interleave_size_);
 998       bad_status();
 999       return false;
1000     }
1001     for (uint i = 0; i < modules_.size(); i++)
1002       if (modules_[i].size() != modules_[0].size()) {
1003         logprintf(6, "Process Error: "
1004             "Channels 0 and %d have a different amount of modules.\n",i);
1005         bad_status();
1006         return false;
1007       }
1008     if (modules_[0].size() & (modules_[0].size() - 1)) {
1009       logprintf(6, "Process Error: "
1010           "Amount of modules per memory channel is not a power of 2.\n");
1011       bad_status();
1012       return false;
1013     }
1014     if (channel_width_ < 16
1015         || channel_width_ & (channel_width_ - 1)) {
1016       logprintf(6, "Process Error: "
1017           "Channel width %d is invalid.\n", channel_width_);
1018       bad_status();
1019       return false;
1020     }
1021     if (channel_width_ / modules_[0].size() < 8) {
1022       logprintf(6, "Process Error: "
1023           "Chip width x%d must be x8 or greater.\n", channel_width_ / modules_[0].size());
1024       bad_status();
1025       return false;
1026     }
1027   }
1028
1029
1030   // Print each argument.
1031   for (int i = 0; i < argc; i++) {
1032     if (i)
1033       cmdline_ += " ";
1034     cmdline_ += argv[i];
1035   }
1036
1037   return true;
1038 }
1039
1040 void Sat::PrintHelp() {
1041   printf("Usage: ./sat(32|64) [options]\n"
1042          " -M mbytes        megabytes of ram to test\n"
1043          " -H mbytes        minimum megabytes of hugepages to require\n"
1044          " -s seconds       number of seconds to run\n"
1045          " -m threads       number of memory copy threads to run\n"
1046          " -i threads       number of memory invert threads to run\n"
1047          " -C threads       number of memory CPU stress threads to run\n"
1048          " --findfiles      find locations to do disk IO automatically\n"
1049          " -d device        add a direct write disk thread with block "
1050          "device (or file) 'device'\n"
1051          " -f filename      add a disk thread with "
1052          "tempfile 'filename'\n"
1053          " -l logfile       log output to file 'logfile'\n"
1054          " --max_errors n   exit early after finding 'n' errors\n"
1055          " -v level         verbosity (0-20), default is 8\n"
1056          " -W               Use more CPU-stressful memory copy\n"
1057          " -A               run in degraded mode on incompatible systems\n"
1058          " -p pagesize      size in bytes of memory chunks\n"
1059          " --filesize size  size of disk IO tempfiles\n"
1060          " -n ipaddr        add a network thread connecting to "
1061          "system at 'ipaddr'\n"
1062          " --listen         run a thread to listen for and respond "
1063          "to network threads.\n"
1064          " --no_errors      run without checking for ECC or other errors\n"
1065          " --force_errors   inject false errors to test error handling\n"
1066          " --force_errors_like_crazy   inject a lot of false errors "
1067          "to test error handling\n"
1068          " -F               don't result check each transaction\n"
1069          " --stop_on_errors  Stop after finding the first error.\n"
1070          " --read-block-size     size of block for reading (-d)\n"
1071          " --write-block-size    size of block for writing (-d). If not "
1072          "defined, the size of block for writing will be defined as the "
1073          "size of block for reading\n"
1074          " --segment-size   size of segments to split disk into (-d)\n"
1075          " --cache-size     size of disk cache (-d)\n"
1076          " --blocks-per-segment  number of blocks to read/write per "
1077          "segment per iteration (-d)\n"
1078          " --read-threshold      maximum time (in us) a block read should "
1079          "take (-d)\n"
1080          " --write-threshold     maximum time (in us) a block write "
1081          "should take (-d)\n"
1082          " --random-threads      number of random threads for each disk "
1083          "write thread (-d)\n"
1084          " --destructive    write/wipe disk partition (-d)\n"
1085          " --monitor_mode   only do ECC error polling, no stress load.\n"
1086          " --cc_test        do the cache coherency testing\n"
1087          " --cc_inc_count   number of times to increment the "
1088          "cacheline's member\n"
1089          " --cc_line_count  number of cache line sized datastructures "
1090          "to allocate for the cache coherency threads to operate\n"
1091          " --paddr_base     allocate memory starting from this address\n"
1092          " --pause_delay    delay (in seconds) between power spikes\n"
1093          " --pause_duration duration (in seconds) of each pause\n"
1094          " --local_numa     choose memory regions associated with "
1095          "each CPU to be tested by that CPU\n"
1096          " --remote_numa    choose memory regions not associated with "
1097          "each CPU to be tested by that CPU\n"
1098          " --interleave_size bytes  size in bytes of each channel's data as interleaved "
1099          "between memory channels\n"
1100          " --channel_width bits     width in bits of each memory channel\n"
1101          " --memory_channel u1,u2   defines a comma-separated list of names\n"
1102          "                          for dram packages in a memory channel.\n"
1103          "                          Use multiple times to define multiple channels.\n");
1104 }
1105
1106 bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) {
1107   // Do nothing, no google-specific argument on public stressapptest
1108   return false;
1109 }
1110
1111 void Sat::GoogleOsOptions(std::map<std::string, std::string> *options) {
1112   // Do nothing, no OS-specific argument on public stressapptest
1113 }
1114
1115 // Launch the SAT task threads. Returns 0 on error.
1116 void Sat::InitializeThreads() {
1117   // Memory copy threads.
1118   AcquireWorkerLock();
1119
1120   logprintf(12, "Log: Starting worker threads\n");
1121   WorkerVector *memory_vector = new WorkerVector();
1122
1123   // Error polling thread.
1124   // This may detect ECC corrected errors, disk problems, or
1125   // any other errors normally hidden from userspace.
1126   WorkerVector *error_vector = new WorkerVector();
1127   if (error_poll_) {
1128     ErrorPollThread *thread = new ErrorPollThread();
1129     thread->InitThread(total_threads_++, this, os_, patternlist_,
1130                        &continuous_status_);
1131
1132     error_vector->insert(error_vector->end(), thread);
1133   } else {
1134     logprintf(5, "Log: Skipping error poll thread due to --no_errors flag\n");
1135   }
1136   workers_map_.insert(make_pair(kErrorType, error_vector));
1137
1138   // Only start error poll threads for monitor-mode SAT,
1139   // skip all other types of worker threads.
1140   if (monitor_mode_) {
1141     ReleaseWorkerLock();
1142     return;
1143   }
1144
1145   for (int i = 0; i < memory_threads_; i++) {
1146     CopyThread *thread = new CopyThread();
1147     thread->InitThread(total_threads_++, this, os_, patternlist_,
1148                        &power_spike_status_);
1149
1150     if ((region_count_ > 1) && (region_mode_)) {
1151       int32 region = region_find(i % region_count_);
1152       cpu_set_t *cpuset = os_->FindCoreMask(region);
1153       sat_assert(cpuset);
1154       if (region_mode_ == kLocalNuma) {
1155         // Choose regions associated with this CPU.
1156         thread->set_cpu_mask(cpuset);
1157         thread->set_tag(1 << region);
1158       } else if (region_mode_ == kRemoteNuma) {
1159         // Choose regions not associated with this CPU..
1160         thread->set_cpu_mask(cpuset);
1161         thread->set_tag(region_mask_ & ~(1 << region));
1162       }
1163     } else {
1164       cpu_set_t available_cpus;
1165       thread->AvailableCpus(&available_cpus);
1166       int cores = cpuset_count(&available_cpus);
1167       // Don't restrict thread location if we have more than one
1168       // thread per core. Not so good for performance.
1169       if (cpu_stress_threads_ + memory_threads_ <= cores) {
1170         // Place a thread on alternating cores first.
1171         // This assures interleaved core use with no overlap.
1172         int nthcore = i;
1173         int nthbit = (((2 * nthcore) % cores) +
1174                       (((2 * nthcore) / cores) % 2)) % cores;
1175         cpu_set_t all_cores;
1176         cpuset_set_ab(&all_cores, 0, cores);
1177         if (!cpuset_isequal(&available_cpus, &all_cores)) {
1178           // We are assuming the bits are contiguous.
1179           // Complain if this is not so.
1180           logprintf(0, "Log: cores = %s, expected %s\n",
1181                     cpuset_format(&available_cpus).c_str(),
1182                     cpuset_format(&all_cores).c_str());
1183         }
1184
1185         // Set thread affinity.
1186         thread->set_cpu_mask_to_cpu(nthbit);
1187       }
1188     }
1189     memory_vector->insert(memory_vector->end(), thread);
1190   }
1191   workers_map_.insert(make_pair(kMemoryType, memory_vector));
1192
1193   // File IO threads.
1194   WorkerVector *fileio_vector = new WorkerVector();
1195   for (int i = 0; i < file_threads_; i++) {
1196     FileThread *thread = new FileThread();
1197     thread->InitThread(total_threads_++, this, os_, patternlist_,
1198                        &power_spike_status_);
1199     thread->SetFile(filename_[i].c_str());
1200     // Set disk threads high priority. They don't take much processor time,
1201     // but blocking them will delay disk IO.
1202     thread->SetPriority(WorkerThread::High);
1203
1204     fileio_vector->insert(fileio_vector->end(), thread);
1205   }
1206   workers_map_.insert(make_pair(kFileIOType, fileio_vector));
1207
1208   // Net IO threads.
1209   WorkerVector *netio_vector = new WorkerVector();
1210   WorkerVector *netslave_vector = new WorkerVector();
1211   if (listen_threads_ > 0) {
1212     // Create a network slave thread. This listens for connections.
1213     NetworkListenThread *thread = new NetworkListenThread();
1214     thread->InitThread(total_threads_++, this, os_, patternlist_,
1215                        &continuous_status_);
1216
1217     netslave_vector->insert(netslave_vector->end(), thread);
1218   }
1219   for (int i = 0; i < net_threads_; i++) {
1220     NetworkThread *thread = new NetworkThread();
1221     thread->InitThread(total_threads_++, this, os_, patternlist_,
1222                        &continuous_status_);
1223     thread->SetIP(ipaddrs_[i].c_str());
1224
1225     netio_vector->insert(netio_vector->end(), thread);
1226   }
1227   workers_map_.insert(make_pair(kNetIOType, netio_vector));
1228   workers_map_.insert(make_pair(kNetSlaveType, netslave_vector));
1229
1230   // Result check threads.
1231   WorkerVector *check_vector = new WorkerVector();
1232   for (int i = 0; i < check_threads_; i++) {
1233     CheckThread *thread = new CheckThread();
1234     thread->InitThread(total_threads_++, this, os_, patternlist_,
1235                        &continuous_status_);
1236
1237     check_vector->insert(check_vector->end(), thread);
1238   }
1239   workers_map_.insert(make_pair(kCheckType, check_vector));
1240
1241   // Memory invert threads.
1242   logprintf(12, "Log: Starting invert threads\n");
1243   WorkerVector *invert_vector = new WorkerVector();
1244   for (int i = 0; i < invert_threads_; i++) {
1245     InvertThread *thread = new InvertThread();
1246     thread->InitThread(total_threads_++, this, os_, patternlist_,
1247                        &continuous_status_);
1248
1249     invert_vector->insert(invert_vector->end(), thread);
1250   }
1251   workers_map_.insert(make_pair(kInvertType, invert_vector));
1252
1253   // Disk stress threads.
1254   WorkerVector *disk_vector = new WorkerVector();
1255   WorkerVector *random_vector = new WorkerVector();
1256   logprintf(12, "Log: Starting disk stress threads\n");
1257   for (int i = 0; i < disk_threads_; i++) {
1258     // Creating write threads
1259     DiskThread *thread = new DiskThread(blocktables_[i]);
1260     thread->InitThread(total_threads_++, this, os_, patternlist_,
1261                        &power_spike_status_);
1262     thread->SetDevice(diskfilename_[i].c_str());
1263     if (thread->SetParameters(read_block_size_, write_block_size_,
1264                               segment_size_, cache_size_,
1265                               blocks_per_segment_,
1266                               read_threshold_, write_threshold_,
1267                               non_destructive_)) {
1268       disk_vector->insert(disk_vector->end(), thread);
1269     } else {
1270       logprintf(12, "Log: DiskThread::SetParameters() failed\n");
1271       delete thread;
1272     }
1273
1274     for (int j = 0; j < random_threads_; j++) {
1275       // Creating random threads
1276       RandomDiskThread *rthread = new RandomDiskThread(blocktables_[i]);
1277       rthread->InitThread(total_threads_++, this, os_, patternlist_,
1278                           &power_spike_status_);
1279       rthread->SetDevice(diskfilename_[i].c_str());
1280       if (rthread->SetParameters(read_block_size_, write_block_size_,
1281                                  segment_size_, cache_size_,
1282                                  blocks_per_segment_,
1283                                  read_threshold_, write_threshold_,
1284                                  non_destructive_)) {
1285         random_vector->insert(random_vector->end(), rthread);
1286       } else {
1287       logprintf(12, "Log: RandomDiskThread::SetParameters() failed\n");
1288         delete rthread;
1289       }
1290     }
1291   }
1292
1293   workers_map_.insert(make_pair(kDiskType, disk_vector));
1294   workers_map_.insert(make_pair(kRandomDiskType, random_vector));
1295
1296   // CPU stress threads.
1297   WorkerVector *cpu_vector = new WorkerVector();
1298   logprintf(12, "Log: Starting cpu stress threads\n");
1299   for (int i = 0; i < cpu_stress_threads_; i++) {
1300     CpuStressThread *thread = new CpuStressThread();
1301     thread->InitThread(total_threads_++, this, os_, patternlist_,
1302                        &continuous_status_);
1303
1304     // Don't restrict thread location if we have more than one
1305     // thread per core. Not so good for performance.
1306     cpu_set_t available_cpus;
1307     thread->AvailableCpus(&available_cpus);
1308     int cores = cpuset_count(&available_cpus);
1309     if (cpu_stress_threads_ + memory_threads_ <= cores) {
1310       // Place a thread on alternating cores first.
1311       // Go in reverse order for CPU stress threads. This assures interleaved
1312       // core use with no overlap.
1313       int nthcore = (cores - 1) - i;
1314       int nthbit = (((2 * nthcore) % cores) +
1315                     (((2 * nthcore) / cores) % 2)) % cores;
1316       cpu_set_t all_cores;
1317       cpuset_set_ab(&all_cores, 0, cores);
1318       if (!cpuset_isequal(&available_cpus, &all_cores)) {
1319         logprintf(0, "Log: cores = %s, expected %s\n",
1320                   cpuset_format(&available_cpus).c_str(),
1321                   cpuset_format(&all_cores).c_str());
1322       }
1323
1324       // Set thread affinity.
1325       thread->set_cpu_mask_to_cpu(nthbit);
1326     }
1327
1328
1329     cpu_vector->insert(cpu_vector->end(), thread);
1330   }
1331   workers_map_.insert(make_pair(kCPUType, cpu_vector));
1332
1333   // CPU Cache Coherency Threads - one for each core available.
1334   if (cc_test_) {
1335     WorkerVector *cc_vector = new WorkerVector();
1336     logprintf(12, "Log: Starting cpu cache coherency threads\n");
1337
1338     // Allocate the shared datastructure to be worked on by the threads.
1339     cc_cacheline_data_ = reinterpret_cast<cc_cacheline_data*>(
1340         malloc(sizeof(cc_cacheline_data) * cc_cacheline_count_));
1341     sat_assert(cc_cacheline_data_ != NULL);
1342
1343     // Initialize the strucutre.
1344     memset(cc_cacheline_data_, 0,
1345            sizeof(cc_cacheline_data) * cc_cacheline_count_);
1346
1347     int num_cpus = CpuCount();
1348     // Allocate all the nums once so that we get a single chunk
1349     // of contiguous memory.
1350     int *num;
1351 #ifdef HAVE_POSIX_MEMALIGN
1352     int err_result = posix_memalign(
1353         reinterpret_cast<void**>(&num),
1354         kCacheLineSize, sizeof(*num) * num_cpus * cc_cacheline_count_);
1355 #else
1356     num = reinterpret_cast<int*>(memalign(kCacheLineSize,
1357                         sizeof(*num) * num_cpus * cc_cacheline_count_));
1358     int err_result = (num == 0);
1359 #endif
1360     sat_assert(err_result == 0);
1361
1362     int cline;
1363     for (cline = 0; cline < cc_cacheline_count_; cline++) {
1364       memset(num, 0, sizeof(num_cpus) * num_cpus);
1365       cc_cacheline_data_[cline].num = num;
1366       num += num_cpus;
1367     }
1368
1369     int tnum;
1370     for (tnum = 0; tnum < num_cpus; tnum++) {
1371       CpuCacheCoherencyThread *thread =
1372           new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_,
1373                                       tnum, cc_inc_count_);
1374       thread->InitThread(total_threads_++, this, os_, patternlist_,
1375                          &continuous_status_);
1376       // Pin the thread to a particular core.
1377       thread->set_cpu_mask_to_cpu(tnum);
1378
1379       // Insert the thread into the vector.
1380       cc_vector->insert(cc_vector->end(), thread);
1381     }
1382     workers_map_.insert(make_pair(kCCType, cc_vector));
1383   }
1384   ReleaseWorkerLock();
1385 }
1386
1387 // Return the number of cpus actually present in the machine.
1388 int Sat::CpuCount() {
1389   return sysconf(_SC_NPROCESSORS_CONF);
1390 }
1391
1392 // Notify and reap worker threads.
1393 void Sat::JoinThreads() {
1394   logprintf(12, "Log: Joining worker threads\n");
1395   power_spike_status_.StopWorkers();
1396   continuous_status_.StopWorkers();
1397
1398   AcquireWorkerLock();
1399   for (WorkerMap::const_iterator map_it = workers_map_.begin();
1400        map_it != workers_map_.end(); ++map_it) {
1401     for (WorkerVector::const_iterator it = map_it->second->begin();
1402          it != map_it->second->end(); ++it) {
1403       logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID());
1404       (*it)->JoinThread();
1405     }
1406   }
1407   ReleaseWorkerLock();
1408
1409   QueueStats();
1410
1411   // Finish up result checking.
1412   // Spawn 4 check threads to minimize check time.
1413   logprintf(12, "Log: Finished countdown, begin to result check\n");
1414   WorkerStatus reap_check_status;
1415   WorkerVector reap_check_vector;
1416
1417   // No need for check threads for monitor mode.
1418   if (!monitor_mode_) {
1419     // Initialize the check threads.
1420     for (int i = 0; i < fill_threads_; i++) {
1421       CheckThread *thread = new CheckThread();
1422       thread->InitThread(total_threads_++, this, os_, patternlist_,
1423                          &reap_check_status);
1424       logprintf(12, "Log: Finished countdown, begin to result check\n");
1425       reap_check_vector.push_back(thread);
1426     }
1427   }
1428
1429   reap_check_status.Initialize();
1430   // Check threads should be marked to stop ASAP.
1431   reap_check_status.StopWorkers();
1432
1433   // Spawn the check threads.
1434   for (WorkerVector::const_iterator it = reap_check_vector.begin();
1435        it != reap_check_vector.end(); ++it) {
1436     logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID());
1437     (*it)->SpawnThread();
1438   }
1439
1440   // Join the check threads.
1441   for (WorkerVector::const_iterator it = reap_check_vector.begin();
1442        it != reap_check_vector.end(); ++it) {
1443     logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID());
1444     (*it)->JoinThread();
1445   }
1446
1447   // Reap all children. Stopped threads should have already ended.
1448   // Result checking threads will end when they have finished
1449   // result checking.
1450   logprintf(12, "Log: Join all outstanding threads\n");
1451
1452   // Find all errors.
1453   errorcount_ = GetTotalErrorCount();
1454
1455   AcquireWorkerLock();
1456   for (WorkerMap::const_iterator map_it = workers_map_.begin();
1457        map_it != workers_map_.end(); ++map_it) {
1458     for (WorkerVector::const_iterator it = map_it->second->begin();
1459          it != map_it->second->end(); ++it) {
1460       logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID());
1461       if ((*it)->GetStatus() != 1) {
1462         logprintf(0, "Process Error: Thread %d failed with status %d at "
1463                   "%.2f seconds\n",
1464                   (*it)->ThreadID(), (*it)->GetStatus(),
1465                   (*it)->GetRunDurationUSec()*1.0/1000000);
1466         bad_status();
1467       }
1468       int priority = 12;
1469       if ((*it)->GetErrorCount())
1470         priority = 5;
1471       logprintf(priority, "Log: Thread %d found %lld hardware incidents\n",
1472                 (*it)->ThreadID(), (*it)->GetErrorCount());
1473     }
1474   }
1475   ReleaseWorkerLock();
1476
1477
1478   // Add in any errors from check threads.
1479   for (WorkerVector::const_iterator it = reap_check_vector.begin();
1480        it != reap_check_vector.end(); ++it) {
1481     logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID());
1482     if ((*it)->GetStatus() != 1) {
1483       logprintf(0, "Process Error: Thread %d failed with status %d at "
1484                 "%.2f seconds\n",
1485                 (*it)->ThreadID(), (*it)->GetStatus(),
1486                 (*it)->GetRunDurationUSec()*1.0/1000000);
1487       bad_status();
1488     }
1489     errorcount_ += (*it)->GetErrorCount();
1490     int priority = 12;
1491     if ((*it)->GetErrorCount())
1492       priority = 5;
1493     logprintf(priority, "Log: Thread %d found %lld hardware incidents\n",
1494               (*it)->ThreadID(), (*it)->GetErrorCount());
1495     delete (*it);
1496   }
1497   reap_check_vector.clear();
1498   reap_check_status.Destroy();
1499 }
1500
1501 // Print queuing information.
1502 void Sat::QueueStats() {
1503   finelock_q_->QueueAnalysis();
1504 }
1505
1506 void Sat::AnalysisAllStats() {
1507   float max_runtime_sec = 0.;
1508   float total_data = 0.;
1509   float total_bandwidth = 0.;
1510   float thread_runtime_sec = 0.;
1511
1512   for (WorkerMap::const_iterator map_it = workers_map_.begin();
1513        map_it != workers_map_.end(); ++map_it) {
1514     for (WorkerVector::const_iterator it = map_it->second->begin();
1515          it != map_it->second->end(); ++it) {
1516       thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000;
1517       total_data += (*it)->GetMemoryCopiedData();
1518       total_data += (*it)->GetDeviceCopiedData();
1519       if (thread_runtime_sec > max_runtime_sec) {
1520         max_runtime_sec = thread_runtime_sec;
1521       }
1522     }
1523   }
1524
1525   total_bandwidth = total_data / max_runtime_sec;
1526
1527   logprintf(0, "Stats: Completed: %.2fM in %.2fs %.2fMB/s, "
1528             "with %d hardware incidents, %d errors\n",
1529             total_data,
1530             max_runtime_sec,
1531             total_bandwidth,
1532             errorcount_,
1533             statuscount_);
1534 }
1535
1536 void Sat::MemoryStats() {
1537   float memcopy_data = 0.;
1538   float memcopy_bandwidth = 0.;
1539   WorkerMap::const_iterator mem_it = workers_map_.find(
1540       static_cast<int>(kMemoryType));
1541   WorkerMap::const_iterator file_it = workers_map_.find(
1542       static_cast<int>(kFileIOType));
1543   sat_assert(mem_it != workers_map_.end());
1544   sat_assert(file_it != workers_map_.end());
1545   for (WorkerVector::const_iterator it = mem_it->second->begin();
1546        it != mem_it->second->end(); ++it) {
1547     memcopy_data += (*it)->GetMemoryCopiedData();
1548     memcopy_bandwidth += (*it)->GetMemoryBandwidth();
1549   }
1550   for (WorkerVector::const_iterator it = file_it->second->begin();
1551        it != file_it->second->end(); ++it) {
1552     memcopy_data += (*it)->GetMemoryCopiedData();
1553     memcopy_bandwidth += (*it)->GetMemoryBandwidth();
1554   }
1555   GoogleMemoryStats(&memcopy_data, &memcopy_bandwidth);
1556   logprintf(4, "Stats: Memory Copy: %.2fM at %.2fMB/s\n",
1557             memcopy_data,
1558             memcopy_bandwidth);
1559 }
1560
1561 void Sat::GoogleMemoryStats(float *memcopy_data,
1562                             float *memcopy_bandwidth) {
1563   // Do nothing, should be implemented by subclasses.
1564 }
1565
1566 void Sat::FileStats() {
1567   float file_data = 0.;
1568   float file_bandwidth = 0.;
1569   WorkerMap::const_iterator file_it = workers_map_.find(
1570       static_cast<int>(kFileIOType));
1571   sat_assert(file_it != workers_map_.end());
1572   for (WorkerVector::const_iterator it = file_it->second->begin();
1573        it != file_it->second->end(); ++it) {
1574     file_data += (*it)->GetDeviceCopiedData();
1575     file_bandwidth += (*it)->GetDeviceBandwidth();
1576   }
1577   logprintf(4, "Stats: File Copy: %.2fM at %.2fMB/s\n",
1578             file_data,
1579             file_bandwidth);
1580 }
1581
1582 void Sat::CheckStats() {
1583   float check_data = 0.;
1584   float check_bandwidth = 0.;
1585   WorkerMap::const_iterator check_it = workers_map_.find(
1586       static_cast<int>(kCheckType));
1587   sat_assert(check_it != workers_map_.end());
1588   for (WorkerVector::const_iterator it = check_it->second->begin();
1589        it != check_it->second->end(); ++it) {
1590     check_data += (*it)->GetMemoryCopiedData();
1591     check_bandwidth += (*it)->GetMemoryBandwidth();
1592   }
1593   logprintf(4, "Stats: Data Check: %.2fM at %.2fMB/s\n",
1594             check_data,
1595             check_bandwidth);
1596 }
1597
1598 void Sat::NetStats() {
1599   float net_data = 0.;
1600   float net_bandwidth = 0.;
1601   WorkerMap::const_iterator netio_it = workers_map_.find(
1602       static_cast<int>(kNetIOType));
1603   WorkerMap::const_iterator netslave_it = workers_map_.find(
1604       static_cast<int>(kNetSlaveType));
1605   sat_assert(netio_it != workers_map_.end());
1606   sat_assert(netslave_it != workers_map_.end());
1607   for (WorkerVector::const_iterator it = netio_it->second->begin();
1608        it != netio_it->second->end(); ++it) {
1609     net_data += (*it)->GetDeviceCopiedData();
1610     net_bandwidth += (*it)->GetDeviceBandwidth();
1611   }
1612   for (WorkerVector::const_iterator it = netslave_it->second->begin();
1613        it != netslave_it->second->end(); ++it) {
1614     net_data += (*it)->GetDeviceCopiedData();
1615     net_bandwidth += (*it)->GetDeviceBandwidth();
1616   }
1617   logprintf(4, "Stats: Net Copy: %.2fM at %.2fMB/s\n",
1618             net_data,
1619             net_bandwidth);
1620 }
1621
1622 void Sat::InvertStats() {
1623   float invert_data = 0.;
1624   float invert_bandwidth = 0.;
1625   WorkerMap::const_iterator invert_it = workers_map_.find(
1626       static_cast<int>(kInvertType));
1627   sat_assert(invert_it != workers_map_.end());
1628   for (WorkerVector::const_iterator it = invert_it->second->begin();
1629        it != invert_it->second->end(); ++it) {
1630     invert_data += (*it)->GetMemoryCopiedData();
1631     invert_bandwidth += (*it)->GetMemoryBandwidth();
1632   }
1633   logprintf(4, "Stats: Invert Data: %.2fM at %.2fMB/s\n",
1634             invert_data,
1635             invert_bandwidth);
1636 }
1637
1638 void Sat::DiskStats() {
1639   float disk_data = 0.;
1640   float disk_bandwidth = 0.;
1641   WorkerMap::const_iterator disk_it = workers_map_.find(
1642       static_cast<int>(kDiskType));
1643   WorkerMap::const_iterator random_it = workers_map_.find(
1644       static_cast<int>(kRandomDiskType));
1645   sat_assert(disk_it != workers_map_.end());
1646   sat_assert(random_it != workers_map_.end());
1647   for (WorkerVector::const_iterator it = disk_it->second->begin();
1648        it != disk_it->second->end(); ++it) {
1649     disk_data += (*it)->GetDeviceCopiedData();
1650     disk_bandwidth += (*it)->GetDeviceBandwidth();
1651   }
1652   for (WorkerVector::const_iterator it = random_it->second->begin();
1653        it != random_it->second->end(); ++it) {
1654     disk_data += (*it)->GetDeviceCopiedData();
1655     disk_bandwidth += (*it)->GetDeviceBandwidth();
1656   }
1657
1658   logprintf(4, "Stats: Disk: %.2fM at %.2fMB/s\n",
1659             disk_data,
1660             disk_bandwidth);
1661 }
1662
1663 // Process worker thread data for bandwidth information, and error results.
1664 // You can add more methods here just subclassing SAT.
1665 void Sat::RunAnalysis() {
1666   AnalysisAllStats();
1667   MemoryStats();
1668   FileStats();
1669   NetStats();
1670   CheckStats();
1671   InvertStats();
1672   DiskStats();
1673 }
1674
1675 // Get total error count, summing across all threads..
1676 int64 Sat::GetTotalErrorCount() {
1677   int64 errors = 0;
1678
1679   AcquireWorkerLock();
1680   for (WorkerMap::const_iterator map_it = workers_map_.begin();
1681        map_it != workers_map_.end(); ++map_it) {
1682     for (WorkerVector::const_iterator it = map_it->second->begin();
1683          it != map_it->second->end(); ++it) {
1684       errors += (*it)->GetErrorCount();
1685     }
1686   }
1687   ReleaseWorkerLock();
1688   return errors;
1689 }
1690
1691
1692 void Sat::SpawnThreads() {
1693   logprintf(12, "Log: Initializing WorkerStatus objects\n");
1694   power_spike_status_.Initialize();
1695   continuous_status_.Initialize();
1696   logprintf(12, "Log: Spawning worker threads\n");
1697   for (WorkerMap::const_iterator map_it = workers_map_.begin();
1698        map_it != workers_map_.end(); ++map_it) {
1699     for (WorkerVector::const_iterator it = map_it->second->begin();
1700          it != map_it->second->end(); ++it) {
1701       logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID());
1702       (*it)->SpawnThread();
1703     }
1704   }
1705 }
1706
1707 // Delete used worker thread objects.
1708 void Sat::DeleteThreads() {
1709   logprintf(12, "Log: Deleting worker threads\n");
1710   for (WorkerMap::const_iterator map_it = workers_map_.begin();
1711        map_it != workers_map_.end(); ++map_it) {
1712     for (WorkerVector::const_iterator it = map_it->second->begin();
1713          it != map_it->second->end(); ++it) {
1714       logprintf(12, "Log: Deleting thread %d\n", (*it)->ThreadID());
1715       delete (*it);
1716     }
1717     delete map_it->second;
1718   }
1719   workers_map_.clear();
1720   logprintf(12, "Log: Destroying WorkerStatus objects\n");
1721   power_spike_status_.Destroy();
1722   continuous_status_.Destroy();
1723 }
1724
namespace {
// Calculates the next time an action in Sat::Run() should occur, based on a
// schedule derived from a start point and a regular frequency.
//
// Using frequencies instead of intervals with their accompanying drift allows
// users to better predict when the actions will occur throughout a run.
//
// The result is the first point of the form start + k * frequency (k >= 1)
// that lies strictly after 'now', assuming now >= start.
//
// Arguments:
//   frequency: seconds
//   start: unixtime
//   now: unixtime
//
// Returns: unixtime.  A non-positive frequency describes no schedule at all;
// in that case 'now' is returned (the action is due immediately) instead of
// dividing by zero.
inline time_t NextOccurance(time_t frequency, time_t start, time_t now) {
  if (frequency <= 0) {
    return now;
  }
  // Number of whole periods already elapsed since 'start'.
  const time_t elapsed_periods = (now - start) / frequency;
  return start + frequency + (elapsed_periods * frequency);
}
}  // namespace
1742
1743 // Run the actual test.
1744 bool Sat::Run() {
1745   // Install signal handlers to gracefully exit in the middle of a run.
1746   //
1747   // Why go through this whole rigmarole?  It's the only standards-compliant
1748   // (C++ and POSIX) way to handle signals in a multithreaded program.
1749   // Specifically:
1750   //
1751   // 1) (C++) The value of a variable not of type "volatile sig_atomic_t" is
1752   //    unspecified upon entering a signal handler and, if modified by the
1753   //    handler, is unspecified after leaving the handler.
1754   //
1755   // 2) (POSIX) After the value of a variable is changed in one thread, another
1756   //    thread is only guaranteed to see the new value after both threads have
1757   //    acquired or released the same mutex or rwlock, synchronized to the
1758   //    same barrier, or similar.
1759   //
1760   // #1 prevents the use of #2 in a signal handler, so the signal handler must
1761   // be called in the same thread that reads the "volatile sig_atomic_t"
1762   // variable it sets.  We enforce that by blocking the signals in question in
1763   // the worker threads, forcing them to be handled by this thread.
1764   logprintf(12, "Log: Installing signal handlers\n");
1765   sigset_t new_blocked_signals;
1766   sigemptyset(&new_blocked_signals);
1767   sigaddset(&new_blocked_signals, SIGINT);
1768   sigaddset(&new_blocked_signals, SIGTERM);
1769   sigset_t prev_blocked_signals;
1770   pthread_sigmask(SIG_BLOCK, &new_blocked_signals, &prev_blocked_signals);
1771   sighandler_t prev_sigint_handler = signal(SIGINT, SatHandleBreak);
1772   sighandler_t prev_sigterm_handler = signal(SIGTERM, SatHandleBreak);
1773
1774   // Kick off all the worker threads.
1775   logprintf(12, "Log: Launching worker threads\n");
1776   InitializeThreads();
1777   SpawnThreads();
1778   pthread_sigmask(SIG_SETMASK, &prev_blocked_signals, NULL);
1779
1780   logprintf(12, "Log: Starting countdown with %d seconds\n", runtime_seconds_);
1781
1782   // In seconds.
1783   static const time_t kSleepFrequency = 5;
1784   // All of these are in seconds.  You probably want them to be >=
1785   // kSleepFrequency and multiples of kSleepFrequency, but neither is necessary.
1786   static const time_t kInjectionFrequency = 10;
1787   static const time_t kPrintFrequency = 10;
1788
1789   const time_t start = time(NULL);
1790   const time_t end = start + runtime_seconds_;
1791   time_t now = start;
1792   time_t next_print = start + kPrintFrequency;
1793   time_t next_pause = start + pause_delay_;
1794   time_t next_resume = 0;
1795   time_t next_injection;
1796   if (crazy_error_injection_) {
1797     next_injection = start + kInjectionFrequency;
1798   } else {
1799     next_injection = 0;
1800   }
1801
1802   while (now < end) {
1803     // This is an int because it's for logprintf().
1804     const int seconds_remaining = end - now;
1805
1806     if (user_break_) {
1807       // Handle early exit.
1808       logprintf(0, "Log: User exiting early (%d seconds remaining)\n",
1809                 seconds_remaining);
1810       break;
1811     }
1812
1813     // If we have an error limit, check it here and see if we should exit.
1814     if (max_errorcount_ != 0) {
1815       uint64 errors = GetTotalErrorCount();
1816       if (errors > max_errorcount_) {
1817         logprintf(0, "Log: Exiting early (%d seconds remaining) "
1818                      "due to excessive failures (%lld)\n",
1819                   seconds_remaining,
1820                   errors);
1821         break;
1822       }
1823     }
1824
1825     if (now >= next_print) {
1826       // Print a count down message.
1827       logprintf(5, "Log: Seconds remaining: %d\n", seconds_remaining);
1828       next_print = NextOccurance(kPrintFrequency, start, now);
1829     }
1830
1831     if (next_injection && now >= next_injection) {
1832       // Inject an error.
1833       logprintf(4, "Log: Injecting error (%d seconds remaining)\n",
1834                 seconds_remaining);
1835       struct page_entry src;
1836       GetValid(&src);
1837       src.pattern = patternlist_->GetPattern(0);
1838       PutValid(&src);
1839       next_injection = NextOccurance(kInjectionFrequency, start, now);
1840     }
1841
1842     if (next_pause && now >= next_pause) {
1843       // Tell worker threads to pause in preparation for a power spike.
1844       logprintf(4, "Log: Pausing worker threads in preparation for power spike "
1845                 "(%d seconds remaining)\n", seconds_remaining);
1846       power_spike_status_.PauseWorkers();
1847       logprintf(12, "Log: Worker threads paused\n");
1848       next_pause = 0;
1849       next_resume = now + pause_duration_;
1850     }
1851
1852     if (next_resume && now >= next_resume) {
1853       // Tell worker threads to resume in order to cause a power spike.
1854       logprintf(4, "Log: Resuming worker threads to cause a power spike (%d "
1855                 "seconds remaining)\n", seconds_remaining);
1856       power_spike_status_.ResumeWorkers();
1857       logprintf(12, "Log: Worker threads resumed\n");
1858       next_pause = NextOccurance(pause_delay_, start, now);
1859       next_resume = 0;
1860     }
1861
1862     sat_sleep(NextOccurance(kSleepFrequency, start, now) - now);
1863     now = time(NULL);
1864   }
1865
1866   JoinThreads();
1867
1868   logprintf(0, "Stats: Found %lld hardware incidents\n", errorcount_);
1869
1870   if (!monitor_mode_)
1871     RunAnalysis();
1872
1873   DeleteThreads();
1874
1875   logprintf(12, "Log: Uninstalling signal handlers\n");
1876   signal(SIGINT, prev_sigint_handler);
1877   signal(SIGTERM, prev_sigterm_handler);
1878
1879   return true;
1880 }
1881
1882 // Clean up all resources.
1883 bool Sat::Cleanup() {
1884   g_sat = NULL;
1885   Logger::GlobalLogger()->StopThread();
1886   Logger::GlobalLogger()->SetStdoutOnly();
1887   if (logfile_) {
1888     close(logfile_);
1889     logfile_ = 0;
1890   }
1891   if (patternlist_) {
1892     patternlist_->Destroy();
1893     delete patternlist_;
1894     patternlist_ = 0;
1895   }
1896   if (os_) {
1897     os_->FreeTestMem();
1898     delete os_;
1899     os_ = 0;
1900   }
1901   if (empty_) {
1902     delete empty_;
1903     empty_ = 0;
1904   }
1905   if (valid_) {
1906     delete valid_;
1907     valid_ = 0;
1908   }
1909   if (finelock_q_) {
1910     delete finelock_q_;
1911     finelock_q_ = 0;
1912   }
1913   if (page_bitmap_) {
1914     delete[] page_bitmap_;
1915   }
1916
1917   for (size_t i = 0; i < blocktables_.size(); i++) {
1918     delete blocktables_[i];
1919   }
1920
1921   if (cc_cacheline_data_) {
1922     // The num integer arrays for all the cacheline structures are
1923     // allocated as a single chunk. The pointers in the cacheline struct
1924     // are populated accordingly. Hence calling free on the first
1925     // cacheline's num's address is going to free the entire array.
1926     // TODO(aganti): Refactor this to have a class for the cacheline
1927     // structure (currently defined in worker.h) and clean this up
1928     // in the destructor of that class.
1929     if (cc_cacheline_data_[0].num) {
1930       free(cc_cacheline_data_[0].num);
1931     }
1932     free(cc_cacheline_data_);
1933   }
1934
1935   sat_assert(0 == pthread_mutex_destroy(&worker_lock_));
1936
1937   return true;
1938 }
1939
1940
1941 // Pretty print really obvious results.
1942 bool Sat::PrintResults() {
1943   bool result = true;
1944
1945   logprintf(4, "\n");
1946   if (statuscount_) {
1947     logprintf(4, "Status: FAIL - test encountered procedural errors\n");
1948     result = false;
1949   } else if (errorcount_) {
1950     logprintf(4, "Status: FAIL - test discovered HW problems\n");
1951     result = false;
1952   } else {
1953     logprintf(4, "Status: PASS - please verify no corrected errors\n");
1954   }
1955   logprintf(4, "\n");
1956
1957   return result;
1958 }
1959
1960 // Helper functions.
1961 void Sat::AcquireWorkerLock() {
1962   sat_assert(0 == pthread_mutex_lock(&worker_lock_));
1963 }
1964 void Sat::ReleaseWorkerLock() {
1965   sat_assert(0 == pthread_mutex_unlock(&worker_lock_));
1966 }
1967
1968 void logprintf(int priority, const char *format, ...) {
1969   va_list args;
1970   va_start(args, format);
1971   Logger::GlobalLogger()->VLogF(priority, format, args);
1972   va_end(args);
1973 }