From: ewout Date: Tue, 10 Sep 2013 21:27:49 +0000 (+0000) Subject: New frequency test, fixed error accounting, added logging timestamps, and miscellaneo... X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ian/git?a=commitdiff_plain;h=2ea87b7996f4f433d5d946eaf8f0d2f6fd18c144;p=stressapptest New frequency test, fixed error accounting, added logging timestamps, and miscellaneous smaller changes. * Added a CPU Frequency test for select X86 processors to verify a minimum frequency is maintained during non-pause periods. * Fixed the error accounting in WorkerThread::CheckRegion if more than 128 miscompares are found and when block errors are detected. * Updated the logger to include timestamps and the associated timezone. * Moved from apicid() to sched_getcpu() for determining the core ID. * Added the ability to reserve a specified amount of memory. This can override the requested memory allocation. * If not using POSIX shared memory or hugepages, explicitly mmap memory if the pagesize is 4kB otherwise use memalign. * Removed the OSLayer's unused PCI device handling. * Numerous refactoring changes. 
--- diff --git a/configure.ac b/configure.ac index ca10966..6f09eb9 100644 --- a/configure.ac +++ b/configure.ac @@ -5,10 +5,10 @@ AC_ARG_WITH(static, [ --with-static enable static linking]) if test "$with_static" == "yes" then - AC_MSG_NOTICE([Compiling with staticaly linked libraries.]) - LIBS="$LIBS -static" + AC_MSG_NOTICE([Compiling with staticaly linked libraries.]) + LIBS="$LIBS -static" else - AC_MSG_NOTICE([Compiling with dynamically linked libraries.]) + AC_MSG_NOTICE([Compiling with dynamically linked libraries.]) fi AC_CANONICAL_HOST diff --git a/src/Makefile.am b/src/Makefile.am index 2179b42..16f539d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -28,6 +28,7 @@ HFILES += error_diag.h HFILES += disk_blocks.h HFILES += adler32memcpy.h HFILES += logger.h +HFILES += clock.h stressapptest_SOURCES = $(MAINFILES) $(CFILES) $(HFILES) findmask_SOURCES = findmask.c findmask.inc diff --git a/src/clock.h b/src/clock.h new file mode 100644 index 0000000..4204188 --- /dev/null +++ b/src/clock.h @@ -0,0 +1,29 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Author: cferris + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STRESSAPPTEST_CLOCK_H_ // NOLINT +#define STRESSAPPTEST_CLOCK_H_ + +#include + +// This class implements a clock that can be overriden for unit tests. 
+class Clock { + public: + virtual ~Clock() {} + + virtual time_t Now() { return time(NULL); } +}; + +#endif // STRESSAPPTEST_CLOCK_H_ NOLINT diff --git a/src/disk_blocks.cc b/src/disk_blocks.cc index c7860b0..60018f9 100644 --- a/src/disk_blocks.cc +++ b/src/disk_blocks.cc @@ -14,38 +14,51 @@ // Thread-safe container of disk blocks -#include - // This file must work with autoconf on its public version, // so these includes are correct. #include "disk_blocks.h" -DiskBlockTable::DiskBlockTable() { - nelems_ = 0; +#include + +// BlockData +BlockData::BlockData() : address_(0), size_(0), + references_(0), initialized_(false), + pattern_(NULL) { + pthread_mutex_init(&data_mutex_, NULL); +} + +BlockData::~BlockData() { + pthread_mutex_destroy(&data_mutex_); +} + +void BlockData::set_initialized() { + pthread_mutex_lock(&data_mutex_); + initialized_ = true; + pthread_mutex_unlock(&data_mutex_); +} + +bool BlockData::initialized() const { + pthread_mutex_lock(&data_mutex_); + bool initialized = initialized_; + pthread_mutex_unlock(&data_mutex_); + return initialized; +} + +// DiskBlockTable +DiskBlockTable::DiskBlockTable() : sector_size_(0), write_block_size_(0), + device_name_(""), device_sectors_(0), + segment_size_(0), size_(0) { pthread_mutex_init(&data_mutex_, NULL); pthread_mutex_init(¶meter_mutex_, NULL); pthread_cond_init(&data_condition_, NULL); } DiskBlockTable::~DiskBlockTable() { - CleanTable(); pthread_mutex_destroy(&data_mutex_); pthread_mutex_destroy(¶meter_mutex_); pthread_cond_destroy(&data_condition_); } -void DiskBlockTable::CleanTable() { - pthread_mutex_lock(&data_mutex_); - for (map::iterator it = - addr_to_block_.begin(); it != addr_to_block_.end(); ++it) { - delete it->second; - } - addr_to_block_.erase(addr_to_block_.begin(), addr_to_block_.end()); - nelems_ = 0; - pthread_cond_broadcast(&data_condition_); - pthread_mutex_unlock(&data_mutex_); -} - // 64-bit non-negative random number generator. 
Stolen from // depot/google3/base/tracecontext_unittest.cc. int64 DiskBlockTable::Random64() { @@ -58,28 +71,27 @@ int64 DiskBlockTable::Random64() { return -x; } -int64 DiskBlockTable::NumElems() { - unsigned int nelems; +uint64 DiskBlockTable::Size() { pthread_mutex_lock(&data_mutex_); - nelems = nelems_; + uint64 size = size_; pthread_mutex_unlock(&data_mutex_); - return nelems; + return size; } void DiskBlockTable::InsertOnStructure(BlockData *block) { - int64 address = block->GetAddress(); + int64 address = block->address(); StorageData *sd = new StorageData(); sd->block = block; - sd->pos = nelems_; + sd->pos = size_; // Creating new block ... pthread_mutex_lock(&data_mutex_); - if (pos_to_addr_.size() <= nelems_) { + if (pos_to_addr_.size() <= size_) { pos_to_addr_.insert(pos_to_addr_.end(), address); } else { - pos_to_addr_[nelems_] = address; + pos_to_addr_[size_] = address; } - addr_to_block_.insert(std::make_pair(address, sd)); - nelems_++; + addr_to_block_[address] = sd; + size_++; pthread_cond_broadcast(&data_condition_); pthread_mutex_unlock(&data_mutex_); } @@ -87,26 +99,28 @@ void DiskBlockTable::InsertOnStructure(BlockData *block) { int DiskBlockTable::RemoveBlock(BlockData *block) { // For write threads, check the reference counter and remove // it from the structure. - int64 address = block->GetAddress(); + int64 address = block->address(); AddrToBlockMap::iterator it = addr_to_block_.find(address); int ret = 1; if (it != addr_to_block_.end()) { int curr_pos = it->second->pos; - int last_pos = nelems_ - 1; + int last_pos = size_ - 1; AddrToBlockMap::iterator last_it = addr_to_block_.find( pos_to_addr_[last_pos]); - sat_assert(nelems_ > 0); + sat_assert(size_ > 0); sat_assert(last_it != addr_to_block_.end()); - // Everything is fine, updating ... + // Everything is fine, removing block from table. 
pthread_mutex_lock(&data_mutex_); pos_to_addr_[curr_pos] = pos_to_addr_[last_pos]; last_it->second->pos = curr_pos; delete it->second; addr_to_block_.erase(it); - nelems_--; + size_--; block->DecreaseReferenceCounter(); if (block->GetReferenceCounter() == 0) delete block; + else if (block->GetReferenceCounter() < 0) + ret = 0; pthread_cond_broadcast(&data_condition_); pthread_mutex_unlock(&data_mutex_); } else { @@ -116,18 +130,16 @@ int DiskBlockTable::RemoveBlock(BlockData *block) { } int DiskBlockTable::ReleaseBlock(BlockData *block) { - // If is a random thread, just check the reference counter. + // If caller is a random thread, just check the reference counter. int ret = 1; pthread_mutex_lock(&data_mutex_); int references = block->GetReferenceCounter(); - if (references > 0) { - if (references == 1) - delete block; - else - block->DecreaseReferenceCounter(); - } else { + if (references == 1) + delete block; + else if (references > 0) + block->DecreaseReferenceCounter(); + else ret = 0; - } pthread_mutex_unlock(&data_mutex_); return ret; } @@ -135,13 +147,13 @@ int DiskBlockTable::ReleaseBlock(BlockData *block) { BlockData *DiskBlockTable::GetRandomBlock() { struct timespec ts; struct timeval tp; - int result = 0; gettimeofday(&tp, NULL); ts.tv_sec = tp.tv_sec; ts.tv_nsec = tp.tv_usec * 1000; ts.tv_sec += 2; // Wait for 2 seconds. 
+ int result = 0; pthread_mutex_lock(&data_mutex_); - while (!nelems_ && result != ETIMEDOUT) { + while (!size_ && result != ETIMEDOUT) { result = pthread_cond_timedwait(&data_condition_, &data_mutex_, &ts); } if (result == ETIMEDOUT) { @@ -149,13 +161,13 @@ BlockData *DiskBlockTable::GetRandomBlock() { return NULL; } else { int64 random_number = Random64(); - int64 random_pos = random_number % nelems_; + int64 random_pos = random_number % size_; int64 address = pos_to_addr_[random_pos]; AddrToBlockMap::const_iterator it = addr_to_block_.find(address); sat_assert(it != addr_to_block_.end()); BlockData *b = it->second->block; // A block is returned only if its content is written on disk. - if (b->BlockIsInitialized()) { + if (b->initialized()) { b->IncreaseReferenceCounter(); } else { b = NULL; @@ -165,45 +177,38 @@ BlockData *DiskBlockTable::GetRandomBlock() { } } -void DiskBlockTable::SetParameters( - int sector_size, int write_block_size, int64 device_sectors, - int64 segment_size, string device_name) { +void DiskBlockTable::SetParameters(int sector_size, + int write_block_size, + int64 device_sectors, + int64 segment_size, + const string& device_name) { + sat_assert(size_ == 0); pthread_mutex_lock(¶meter_mutex_); sector_size_ = sector_size; write_block_size_ = write_block_size; device_sectors_ = device_sectors; segment_size_ = segment_size; device_name_ = device_name; - CleanTable(); pthread_mutex_unlock(¶meter_mutex_); } BlockData *DiskBlockTable::GetUnusedBlock(int64 segment) { int64 sector = 0; BlockData *block = new BlockData(); - bool good_sequence = false; - int num_sectors; - if (block == NULL) { logprintf(0, "Process Error: Unable to allocate memory " "for sector data for disk %s.\n", device_name_.c_str()); return NULL; } - pthread_mutex_lock(¶meter_mutex_); - sat_assert(device_sectors_ != 0); - // Align the first sector with the beginning of a write block - num_sectors = write_block_size_ / sector_size_; - + int num_sectors = write_block_size_ / 
sector_size_; for (int i = 0; i < kBlockRetry && !good_sequence; i++) { good_sequence = true; - // Use the entire disk or a small segment of the disk to allocate the first // sector in the block from. - if (segment_size_ == -1) { sector = (Random64() & 0x7FFFFFFFFFFFFFFFLL) % ( device_sectors_ / num_sectors); @@ -213,7 +218,6 @@ BlockData *DiskBlockTable::GetUnusedBlock(int64 segment) { segment_size_ / num_sectors); sector *= num_sectors; sector += segment * segment_size_; - // Make sure the block is within the segment. if (sector + num_sectors > (segment + 1) * segment_size_) { good_sequence = false; @@ -229,7 +233,6 @@ BlockData *DiskBlockTable::GetUnusedBlock(int64 segment) { // now aligned to the write_block_size, it is not necessary // to check each sector, just the first block (a sector // overlap will never occur). - pthread_mutex_lock(&data_mutex_); if (addr_to_block_.find(sector) != addr_to_block_.end()) { good_sequence = false; @@ -238,7 +241,8 @@ BlockData *DiskBlockTable::GetUnusedBlock(int64 segment) { } if (good_sequence) { - block->SetParameters(sector, write_block_size_); + block->set_address(sector); + block->set_size(write_block_size_); block->IncreaseReferenceCounter(); InsertOnStructure(block); } else { @@ -248,66 +252,5 @@ BlockData *DiskBlockTable::GetUnusedBlock(int64 segment) { block = NULL; } pthread_mutex_unlock(¶meter_mutex_); - return block; } - -// BlockData - -BlockData::BlockData() { - addr_ = 0; - size_ = 0; - references_ = 0; - initialized_ = false; - pthread_mutex_init(&data_mutex_, NULL); -} - -BlockData::~BlockData() { - pthread_mutex_destroy(&data_mutex_); -} - -void BlockData::SetParameters(int64 address, int64 size) { - addr_ = address; - size_ = size; -} - -void BlockData::IncreaseReferenceCounter() { - references_++; -} - -void BlockData::DecreaseReferenceCounter() { - references_--; -} - -int BlockData::GetReferenceCounter() { - return references_; -} - -void BlockData::SetBlockAsInitialized() { - 
pthread_mutex_lock(&data_mutex_); - initialized_ = true; - pthread_mutex_unlock(&data_mutex_); -} - -bool BlockData::BlockIsInitialized() { - pthread_mutex_lock(&data_mutex_); - bool initialized = initialized_; - pthread_mutex_unlock(&data_mutex_); - return initialized; -} - -int64 BlockData::GetAddress() { - return addr_; -} - -int64 BlockData::GetSize() { - return size_; -} - -Pattern *BlockData::GetPattern() { - return pattern_; -} - -void BlockData::SetPattern(Pattern *p) { - pattern_ = p; -} diff --git a/src/disk_blocks.h b/src/disk_blocks.h index cb634c9..638ee9f 100644 --- a/src/disk_blocks.h +++ b/src/disk_blocks.h @@ -25,87 +25,146 @@ #include #include #include -// This file must work with autoconf on its public version, -// so these includes are correct. -#include "pattern.h" + +#include "sattypes.h" + +class Pattern; // Data about a block written to disk so that it can be verified later. +// Thread-unsafe, must be used with locks on non-const methods, +// except for initialized accessor/mutator, which are thread-safe +// (and in fact, is the only method supposed to be accessed from +// someone which is not the thread-safe DiskBlockTable). class BlockData { public: BlockData(); ~BlockData(); - void SetParameters(int64 address, int64 size); - void IncreaseReferenceCounter(); - void DecreaseReferenceCounter(); - int GetReferenceCounter(); - void SetBlockAsInitialized(); - bool BlockIsInitialized(); - int64 GetAddress(); - int64 GetSize(); - void SetPattern(Pattern *p); - Pattern *GetPattern(); - protected: - int64 addr_; // address of first sector in block - int64 size_; // size of block - int references_; // reference counter - bool initialized_; // flag indicating the block was written on disk + + // These are reference counters used to control how many + // threads currently have a copy of this particular block. 
+ void IncreaseReferenceCounter() { references_++; } + void DecreaseReferenceCounter() { references_--; } + int GetReferenceCounter() const { return references_; } + + // Controls whether the block was written on disk or not. + // Once written, you cannot "un-written" then without destroying + // this object. + void set_initialized(); + bool initialized() const; + + // Accessor methods for some data related to blocks. + void set_address(uint64 address) { address_ = address; } + uint64 address() const { return address_; } + void set_size(uint64 size) { size_ = size; } + uint64 size() const { return size_; } + void set_pattern(Pattern *p) { pattern_ = p; } + Pattern *pattern() { return pattern_; } + private: + uint64 address_; // Address of first sector in block + uint64 size_; // Size of block + int references_; // Reference counter + bool initialized_; // Flag indicating the block was written on disk Pattern *pattern_; - pthread_mutex_t data_mutex_; + mutable pthread_mutex_t data_mutex_; DISALLOW_COPY_AND_ASSIGN(BlockData); }; -// Disk Block table - store data from blocks to be write / read by -// a DiskThread +// A thread-safe table used to store block data and control access +// to these blocks, letting several threads read and write blocks on +// disk. class DiskBlockTable { public: DiskBlockTable(); virtual ~DiskBlockTable(); - // Get Number of elements stored on table - int64 NumElems(); - // Clean all table data - void CleanTable(); - // Get a random block from the list. Only returns if a element - // is available (consider that other thread must have added them. - BlockData *GetRandomBlock(); - // Set all initial parameters. Assumes all existent data is + // Returns number of elements stored on table. + uint64 Size(); + + // Sets all initial parameters. Assumes all existent data is // invalid and, therefore, must be removed. 
void SetParameters(int sector_size, int write_block_size, int64 device_sectors, int64 segment_size, - string device_name); - // Return a new block in a unused address. + const string& device_name); + + // During the regular execution, there will be 2 types of threads: + // - Write thread: gets a large number of blocks using GetUnusedBlock, + // writes them on disk (if on destructive mode), + // reads block content ONCE from disk and them removes + // the block from queue with RemoveBlock. After a removal a + // block is not available for read threads, but it is + // only removed from memory if there is no reference for + // this block. Note that a write thread also counts as + // a reference. + // - Read threads: get one block at a time (if available) with + // GetRandomBlock, reads its content from disk, + // checking whether it is correct or not, and releases + // (Using ReleaseBlock) the block to be erased by the + // write threads. Since several read threads are allowed + // to read the same block, a reference counter is used to + // control when the block can be REALLY erased from + // memory, and all memory management is made by a + // DiskBlockTable instance. + + // Returns a new block in a unused address. Does not + // grant ownership of the pointer to the caller + // (use RemoveBlock to delete the block from memory instead). BlockData *GetUnusedBlock(int64 segment); - // Remove block from structure (called by write threads) + + // Removes block from structure (called by write threads). Returns + // 1 if successful, 0 otherwise. int RemoveBlock(BlockData *block); - // Release block to be erased (called by random threads) - int ReleaseBlock(BlockData *block); - protected: + // Gets a random block from the list. Only returns if an element + // is available (a write thread has got this block, written it on disk, + // and set this block as initialized). 
Does not grant ownership of the + // pointer to the caller (use RemoveBlock to delete the block from + // memory instead). + BlockData *GetRandomBlock(); - void InsertOnStructure(BlockData *block); - // Generate a random 64-bit integer (virtual so it could be - // override by the tests) - virtual int64 Random64(); + // Releases block to be erased (called by random threads). Returns + // 1 if successful, 0 otherwise. + int ReleaseBlock(BlockData *block); + protected: struct StorageData { BlockData *block; int pos; }; - - static const int kBlockRetry = 100; // Number of retries to allocate - // sectors. - typedef map AddrToBlockMap; typedef vector PosToAddrVector; + + // Inserts block in structure, used in tests and by other methods. + void InsertOnStructure(BlockData *block); + + // Generates a random 64-bit integer. + // Virtual method so it can be overridden by the tests. + virtual int64 Random64(); + + // Accessor methods for testing. + const PosToAddrVector& pos_to_addr() const { return pos_to_addr_; } + const AddrToBlockMap& addr_to_block() const { return addr_to_block_; } + + int sector_size() const { return sector_size_; } + int write_block_size() const { return write_block_size_; } + const string& device_name() const { return device_name_; } + int64 device_sectors() const { return device_sectors_; } + int64 segment_size() const { return segment_size_; } + + private: + // Number of retries to allocate sectors. + static const int kBlockRetry = 100; + // Actual tables. 
PosToAddrVector pos_to_addr_; AddrToBlockMap addr_to_block_; - uint64 nelems_; - int sector_size_; // Sector size, in bytes - int write_block_size_; // Block size, in bytes - string device_name_; // Device name - int64 device_sectors_; // Number of sectors in device - int64 segment_size_; // Segment size, in bytes + + // Configuration parameters for block selection + int sector_size_; // Sector size, in bytes + int write_block_size_; // Block size, in bytes + string device_name_; // Device name + int64 device_sectors_; // Number of sectors in device + int64 segment_size_; // Segment size in bytes + uint64 size_; // Number of elements on table pthread_mutex_t data_mutex_; pthread_cond_t data_condition_; pthread_mutex_t parameter_mutex_; diff --git a/src/findmask.c b/src/findmask.c index d8ec300..1b10988 100644 --- a/src/findmask.c +++ b/src/findmask.c @@ -38,6 +38,7 @@ * current progress. */ +#include #include #include #include @@ -106,7 +107,7 @@ void* thread_func(void* arg) { if (a < NOISE) b = a; if (b < NOISE) { - printf("Found mask with just %d deviations: 0x%llx\n", b, mask); + printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask); fflush(stdout); } @@ -118,7 +119,8 @@ void* thread_func(void* arg) { } void signal_handler(int signum) { - printf("Received signal... currently evaluating mask 0x%llx!\n", lastmask); + printf("Received signal... 
currently evaluating mask 0x%" PRIx64 "!\n", + lastmask); fflush(stdout); } diff --git a/src/logger.cc b/src/logger.cc index e4ecb03..f13e003 100644 --- a/src/logger.cc +++ b/src/logger.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -37,10 +38,20 @@ void Logger::VLogF(int priority, const char *format, va_list args) { return; } char buffer[4096]; - int length = vsnprintf(buffer, sizeof buffer, format, args); - if (static_cast(length) >= sizeof buffer) { - length = sizeof buffer; - buffer[sizeof buffer - 1] = '\n'; + size_t length = 0; + if (log_timestamps_) { + time_t raw_time; + time(&raw_time); + struct tm time_struct; + localtime_r(&raw_time, &time_struct); + length = strftime(buffer, sizeof(buffer), "%Y/%m/%d-%H:%M:%S(%Z) ", + &time_struct); + LOGGER_ASSERT(length); // Catch if the buffer is set too small. + } + length += vsnprintf(buffer + length, sizeof(buffer) - length, format, args); + if (length >= sizeof(buffer)) { + length = sizeof(buffer); + buffer[sizeof(buffer) - 1] = '\n'; } QueueLogLine(new string(buffer, length)); } @@ -52,19 +63,30 @@ void Logger::StartThread() { } void Logger::StopThread() { - LOGGER_ASSERT(thread_running_); + // Allow this to be called before the thread has started. 
+ if (!thread_running_) { + return; + } thread_running_ = false; - LOGGER_ASSERT(0 == pthread_mutex_lock(&queued_lines_mutex_)); + int retval = pthread_mutex_lock(&queued_lines_mutex_); + LOGGER_ASSERT(0 == retval); bool need_cond_signal = queued_lines_.empty(); queued_lines_.push_back(NULL); - LOGGER_ASSERT(0 == pthread_mutex_unlock(&queued_lines_mutex_)); + retval = pthread_mutex_unlock(&queued_lines_mutex_); + LOGGER_ASSERT(0 == retval); if (need_cond_signal) { - LOGGER_ASSERT(0 == pthread_cond_signal(&queued_lines_cond_)); + retval = pthread_cond_signal(&queued_lines_cond_); + LOGGER_ASSERT(0 == retval); } - LOGGER_ASSERT(0 == pthread_join(thread_, NULL)); + retval = pthread_join(thread_, NULL); + LOGGER_ASSERT(0 == retval); } -Logger::Logger() : verbosity_(20), log_fd_(-1), thread_running_(false) { +Logger::Logger() + : verbosity_(20), + log_fd_(-1), + thread_running_(false), + log_timestamps_(true) { LOGGER_ASSERT(0 == pthread_mutex_init(&queued_lines_mutex_, NULL)); LOGGER_ASSERT(0 == pthread_cond_init(&queued_lines_cond_, NULL)); LOGGER_ASSERT(0 == pthread_cond_init(&full_queue_cond_, NULL)); @@ -94,19 +116,15 @@ void Logger::QueueLogLine(string *line) { LOGGER_ASSERT(0 == pthread_mutex_unlock(&queued_lines_mutex_)); } -namespace { -void WriteToFile(const string& line, int fd) { - LOGGER_ASSERT(write(fd, line.data(), line.size()) == - static_cast(line.size())); -} -} - void Logger::WriteAndDeleteLogLine(string *line) { LOGGER_ASSERT(line != NULL); + ssize_t bytes_written; if (log_fd_ >= 0) { - WriteToFile(*line, log_fd_); + bytes_written = write(log_fd_, line->data(), line->size()); + LOGGER_ASSERT(bytes_written == static_cast(line->size())); } - WriteToFile(*line, 1); + bytes_written = write(STDOUT_FILENO, line->data(), line->size()); + LOGGER_ASSERT(bytes_written == static_cast(line->size())); delete line; } diff --git a/src/logger.h b/src/logger.h index 1d70107..21b3c6b 100644 --- a/src/logger.h +++ b/src/logger.h @@ -62,7 +62,7 @@ class Logger { // 
Lines with a priority numerically greater than this will not be logged. // May not be called while multiple threads are running. - void SetVerbosity(int verbosity) { + virtual void SetVerbosity(int verbosity) { verbosity_ = verbosity; } @@ -72,17 +72,22 @@ class Logger { // Args: // log_fd: The file descriptor to write to. Will not be closed by this // object. - void SetLogFd(int log_fd) { + virtual void SetLogFd(int log_fd) { LOGGER_ASSERT(log_fd >= 0); log_fd_ = log_fd; } // Set output to be written to stdout only. This is the default mode. May // not be called while multiple threads are running. - void SetStdoutOnly() { + virtual void SetStdoutOnly() { log_fd_ = -1; } + // Enable or disable logging of timestamps. + void SetTimestampLogging(bool log_ts_enabled) { + log_timestamps_ = log_ts_enabled; + } + // Logs a line, with a vprintf(3)-like interface. This will block on writing // the line to stdout/disk iff the dedicated logging thread is not running. // This will block on adding the line to the queue if doing so would exceed @@ -104,11 +109,12 @@ class Logger { // before this returns. Waits for the thread to finish before returning. void StopThread(); - private: + protected: Logger(); - ~Logger(); + virtual ~Logger(); + private: // Args: // line: Must be non-NULL. This function takes ownership of it. void QueueLogLine(string *line); @@ -127,6 +133,7 @@ class Logger { int verbosity_; int log_fd_; bool thread_running_; + bool log_timestamps_; vector queued_lines_; // This doubles as a mutex for log_fd_ when the logging thread is not running. pthread_mutex_t queued_lines_mutex_; diff --git a/src/os.cc b/src/os.cc index 7cae23b..6358398 100644 --- a/src/os.cc +++ b/src/os.cc @@ -48,6 +48,7 @@ // so these includes are correct. #include "sattypes.h" #include "error_diag.h" +#include "clock.h" // OsLayer initialization. 
OsLayer::OsLayer() { @@ -55,10 +56,12 @@ OsLayer::OsLayer() { testmemsize_ = 0; totalmemsize_ = 0; min_hugepages_bytes_ = 0; + reserve_mb_ = 0; normal_mem_ = true; use_hugepages_ = false; use_posix_shm_ = false; dynamic_mapped_shmem_ = false; + mmapped_allocation_ = false; shmid_ = 0; time_initialized_ = 0; @@ -79,17 +82,25 @@ OsLayer::OsLayer() { has_sse2_ = false; use_flush_page_cache_ = false; + + clock_ = NULL; } // OsLayer cleanup. OsLayer::~OsLayer() { if (error_diagnoser_) delete error_diagnoser_; + if (clock_) + delete clock_; } // OsLayer initialization. bool OsLayer::Initialize() { - time_initialized_ = time(NULL); + if (!clock_) { + clock_ = new Clock(); + } + + time_initialized_ = clock_->Now(); // Detect asm support. GetFeatures(); @@ -130,7 +141,7 @@ int OsLayer::AddressMode() { // Translates user virtual to physical address. uint64 OsLayer::VirtualToPhysical(void *vaddr) { uint64 frame, shift; - off64_t off = ((uintptr_t)vaddr) / getpagesize() * 8; + off64_t off = ((uintptr_t)vaddr) / sysconf(_SC_PAGESIZE) * 8; int fd = open(kPagemapPath, O_RDONLY); // /proc/self/pagemap is available in kernel >= 2.6.25 if (fd < 0) @@ -169,22 +180,10 @@ list OsLayer::FindFileDevices() { // Get HW core features from cpuid instruction. void OsLayer::GetFeatures() { #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) - // CPUID features documented at: - // http://www.sandpile.org/ia32/cpuid.htm - int ax, bx, cx, dx; - __asm__ __volatile__ ( -# if defined(STRESSAPPTEST_CPU_I686) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;" - : "=S" (bx), -# else - "cpuid;" - : "=b" (bx), -# endif - "=a" (ax), "=c" (cx), "=d" (dx) : "a" (1)); - has_clflush_ = (dx >> 19) & 1; - has_sse2_ = (dx >> 26) & 1; + unsigned int eax = 1, ebx, ecx, edx; + cpuid(&eax, &ebx, &ecx, &edx); + has_clflush_ = (edx >> 19) & 1; + has_sse2_ = (edx >> 26) & 1; logprintf(9, "Log: has clflush: %s, has sse2: %s\n", has_clflush_ ? 
"true" : "false", @@ -244,8 +243,9 @@ bool OsLayer::FlushPageCache(void) { void OsLayer::Flush(void *vaddr) { // Use the generic flush. This function is just so we can override // this if we are so inclined. - if (has_clflush_) - FastFlush(vaddr); + if (has_clflush_) { + OsLayer::FastFlush(vaddr); + } } @@ -266,15 +266,14 @@ bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem, // all address bits in the 'channel_hash' mask, with repeated 'channel_width_' // blocks with bits distributed from each chip in that channel. int OsLayer::FindDimm(uint64 addr, char *buf, int len) { - static const string unknown = "DIMM Unknown"; if (!channels_) { - snprintf(buf, len, "%s", unknown.c_str()); - return 0; + snprintf(buf, len, "DIMM Unknown"); + return -1; } // Find channel by XORing address bits in channel_hash mask. - uint32 low = (uint32)(addr & channel_hash_); - uint32 high = (uint32)((addr & channel_hash_) >> 32); + uint32 low = static_cast(addr & channel_hash_); + uint32 high = static_cast((addr & channel_hash_) >> 32); vector& channel = (*channels_)[ __builtin_parity(high) ^ __builtin_parity(low)]; @@ -342,9 +341,17 @@ string OsLayer::FindCoreMaskFormat(int32 region) { // Report an error in an easily parseable way. bool OsLayer::ErrorReport(const char *part, const char *symptom, int count) { - time_t now = time(NULL); + time_t now = clock_->Now(); int ttf = now - time_initialized_; - logprintf(0, "Report Error: %s : %s : %d : %ds\n", symptom, part, count, ttf); + if (strlen(symptom) && strlen(part)) { + logprintf(0, "Report Error: %s : %s : %d : %ds\n", + symptom, part, count, ttf); + } else { + // Log something so the error still shows up, but this won't break the + // parser. + logprintf(0, "Warning: Invalid Report Error: " + "%s : %s : %d : %ds\n", symptom, part, count, ttf); + } return true; } @@ -408,12 +415,31 @@ int64 OsLayer::FindFreeMemSize() { // // TODO(nsanders): is there a more correct way to determine target // memory size? 
- if (hugepagesize > 0 && min_hugepages_bytes_ > 0) { - minsize = min_hugepages_bytes_; - } else if (physsize < 2048LL * kMegabyte) { - minsize = ((pages * 85) / 100) * pagesize; + if (hugepagesize > 0) { + if (min_hugepages_bytes_ > 0) { + minsize = min_hugepages_bytes_; + } else { + minsize = hugepagesize; + } } else { - minsize = ((pages * 95) / 100) * pagesize - (192 * kMegabyte); + if (physsize < 2048LL * kMegabyte) { + minsize = ((pages * 85) / 100) * pagesize; + } else { + minsize = ((pages * 95) / 100) * pagesize - (192 * kMegabyte); + } + // Make sure that at least reserve_mb_ is left for the system. + if (reserve_mb_ > 0) { + int64 totalsize = pages * pagesize; + int64 reserve_kb = reserve_mb_ * kMegabyte; + if (reserve_kb > totalsize) { + logprintf(0, "Procedural Error: %lld is bigger than the total memory " + "available %lld\n", reserve_kb, totalsize); + } else if (reserve_kb > totalsize - minsize) { + logprintf(5, "Warning: Overriding memory to use: original %lld, " + "current %lld\n", minsize, totalsize - reserve_kb); + minsize = totalsize - reserve_kb; + } + } } // Use hugepage sizing if available. @@ -484,7 +510,7 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) { "'sudo mount -o remount,size=100\% /dev/shm.'\n"); } else if (hugepagesize >= length) { prefer_hugepages = true; - logprintf(3, "Log: Prefer using hugepace allocation.\n"); + logprintf(3, "Log: Prefer using hugepage allocation.\n"); } else { logprintf(3, "Log: Prefer plain malloc memory allocation.\n"); } @@ -507,7 +533,7 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) { break; } - shmaddr = shmat(shmid, NULL, NULL); + shmaddr = shmat(shmid, NULL, 0); if (shmaddr == reinterpret_cast(-1)) { int err = errno; string errtxt = ErrorString(err); @@ -564,7 +590,7 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) { // Do a full mapping here otherwise. 
shmaddr = mmap64(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE, - shm_object, NULL); + shm_object, 0); if (shmaddr == reinterpret_cast(-1)) { int err = errno; string errtxt = ErrorString(err); @@ -589,18 +615,32 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) { } while (0); shm_unlink("/stressapptest"); } -#endif // HAVE_SYS_SHM_H +#endif // HAVE_SYS_SHM_H if (!use_hugepages_ && !use_posix_shm_) { - // Use memalign to ensure that blocks are aligned enough for disk direct IO. - buf = static_cast(memalign(4096, length)); - if (buf) { - logprintf(0, "Log: Using memaligned allocation at %p.\n", buf); - } else { - logprintf(0, "Process Error: memalign returned 0\n"); - if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) { - logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 " - "bit process. Please setup shared memory.\n"); + // If the page size is what SAT is expecting explicitly perform mmap() + // allocation. + if (sysconf(_SC_PAGESIZE) >= 4096) { + void *map_buf = mmap(NULL, length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map_buf != MAP_FAILED) { + buf = map_buf; + mmapped_allocation_ = true; + logprintf(0, "Log: Using mmap() allocation at %p.\n", buf); + } + } + if (!mmapped_allocation_) { + // Use memalign to ensure that blocks are aligned enough for disk direct + // IO. + buf = static_cast(memalign(4096, length)); + if (buf) { + logprintf(0, "Log: Using memaligned allocation at %p.\n", buf); + } else { + logprintf(0, "Process Error: memalign returned 0\n"); + if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) { + logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 " + "bit process. 
Please setup shared memory.\n"); + } } } } @@ -628,6 +668,8 @@ void OsLayer::FreeTestMem() { munmap(testmem_, testmemsize_); } close(shmid_); + } else if (mmapped_allocation_) { + munmap(testmem_, testmemsize_); } else { free(testmem_); } @@ -849,7 +891,9 @@ uint32 OsLayer::GetBitField(uint32 val, uint32 n, uint32 len) { bool OsLayer::CpuStressWorkload() { double float_arr[100]; double sum = 0; +#ifdef HAVE_RAND_R unsigned int seed = 12345; +#endif // Initialize array with random numbers. for (int i = 0; i < 100; i++) { @@ -858,8 +902,9 @@ bool OsLayer::CpuStressWorkload() { if (rand_r(&seed) % 2) float_arr[i] *= -1.0; #else - float_arr[i] = rand(); - if (rand() % 2) + srand(time(NULL)); + float_arr[i] = rand(); // NOLINT + if (rand() % 2) // NOLINT float_arr[i] *= -1.0; #endif } @@ -877,82 +922,3 @@ bool OsLayer::CpuStressWorkload() { logprintf(12, "Log: I'm Feeling Lucky!\n"); return true; } - -PCIDevices OsLayer::GetPCIDevices() { - PCIDevices device_list; - DIR *dir; - struct dirent *buf = new struct dirent(); - struct dirent *entry; - dir = opendir(kSysfsPath); - if (!dir) - logprintf(0, "Process Error: Cannot open %s", kSysfsPath); - while (readdir_r(dir, buf, &entry) == 0 && entry) { - PCIDevice *device; - unsigned int dev, func; - // ".", ".." or a special non-device perhaps. 
- if (entry->d_name[0] == '.') - continue; - - device = new PCIDevice(); - if (sscanf(entry->d_name, "%04x:%02hx:%02x.%d", - &device->domain, &device->bus, &dev, &func) < 4) { - logprintf(0, "Process Error: Couldn't parse %s", entry->d_name); - free(device); - continue; - } - device->dev = dev; - device->func = func; - device->vendor_id = PCIGetValue(entry->d_name, "vendor"); - device->device_id = PCIGetValue(entry->d_name, "device"); - PCIGetResources(entry->d_name, device); - device_list.insert(device_list.end(), device); - } - closedir(dir); - delete buf; - return device_list; -} - -int OsLayer::PCIGetValue(string name, string object) { - int fd, len; - char filename[256]; - char buf[256]; - snprintf(filename, sizeof(filename), "%s/%s/%s", kSysfsPath, - name.c_str(), object.c_str()); - fd = open(filename, O_RDONLY); - if (fd < 0) - return 0; - len = read(fd, buf, 256); - close(fd); - buf[len] = '\0'; - return strtol(buf, NULL, 0); // NOLINT -} - -int OsLayer::PCIGetResources(string name, PCIDevice *device) { - char filename[256]; - char buf[256]; - FILE *file; - int64 start; - int64 end; - int64 size; - int i; - snprintf(filename, sizeof(filename), "%s/%s/%s", kSysfsPath, - name.c_str(), "resource"); - file = fopen(filename, "r"); - if (!file) { - logprintf(0, "Process Error: impossible to find resource file for %s", - filename); - return errno; - } - for (i = 0; i < 6; i++) { - if (!fgets(buf, 256, file)) - break; - sscanf(buf, "%llx %llx", &start, &end); // NOLINT - size = 0; - if (start) - size = end - start + 1; - device->base_addr[i] = start; - device->size[i] = size; - } - fclose(file); - return 0; -} diff --git a/src/os.h b/src/os.h index a928577..13660d8 100644 --- a/src/os.h +++ b/src/os.h @@ -17,6 +17,8 @@ #define STRESSAPPTEST_OS_H_ #include +#include + #include #include #include @@ -26,9 +28,9 @@ // so these includes are correct. 
#include "adler32memcpy.h" // NOLINT #include "sattypes.h" // NOLINT +#include "clock.h" // NOLINT const char kPagemapPath[] = "/proc/self/pagemap"; -const char kSysfsPath[] = "/sys/bus/pci/devices"; struct PCIDevice { int32 domain; @@ -45,6 +47,8 @@ typedef vector PCIDevices; class ErrorDiag; +class Clock; + // This class implements OS/Platform specific funtions. class OsLayer { public: @@ -57,6 +61,13 @@ class OsLayer { min_hugepages_bytes_ = min_bytes; } + // Set the minimum amount of memory that should not be allocated. This only + // has any effect if hugepages are not used. + // Must be set before Initialize(). + void SetReserveSize(int64 reserve_mb) { + reserve_mb_ = reserve_mb; + } + // Set parameters needed to translate physical address to memory module. void SetDramMappingParams(uintptr_t channel_hash, int channel_width, vector< vector > *channels) { @@ -77,13 +88,11 @@ class OsLayer { // Prints failed dimm. This implementation is optional for // subclasses to implement. // Takes a bus address and string, and prints the DIMM name - // into the string. Returns error status. + // into the string. Returns the DIMM number that corresponds to the + // address given, or -1 if unable to identify the DIMM number. + // Note that subclass implementations of FindDimm() MUST fill + // buf with at LEAST one non-whitespace character (provided len > 0). virtual int FindDimm(uint64 addr, char *buf, int len); - // Print dimm info, plus more available info. - virtual int FindDimmExtended(uint64 addr, char *buf, int len) { - return FindDimm(addr, buf, len); - } - // Classifies addresses according to "regions" // This may mean different things on different platforms. @@ -141,10 +150,95 @@ class OsLayer { // instruction. For example, software can use an MFENCE instruction to // insure that previous stores are included in the write-back. 
asm volatile("mfence"); - asm volatile("clflush (%0)" :: "r" (vaddr)); + asm volatile("clflush (%0)" : : "r" (vaddr)); + asm volatile("mfence"); +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + #warning "Unsupported CPU type ARMV7A: Using syscall to cache flush." + // ARMv7a cachelines are 8 words (32 bytes). + syscall(__ARM_NR_cacheflush, vaddr, reinterpret_cast(vaddr) + 32, 0); +#else + #warning "Unsupported CPU type: Unable to force cache flushes." +#endif + } + + // Fast flush, for use in performance critical code. + // This is bound at compile time, and will not pick up + // any runtime machine configuration info. Takes a NULL-terminated + // array of addresses to flush. + inline static void FastFlushList(void **vaddrs) { +#ifdef STRESSAPPTEST_CPU_PPC + while (*vaddrs) { + asm volatile("dcbf 0,%0" : : "r" (*vaddrs++)); + } + asm volatile("sync"); +#elif defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) + // Put mfence before and after clflush to make sure: + // 1. The write before the clflush is committed to memory bus; + // 2. The read after the clflush is hitting the memory bus. + // + // From Intel manual: + // CLFLUSH is only ordered by the MFENCE instruction. It is not guaranteed + // to be ordered by any other fencing, serializing or other CLFLUSH + // instruction. For example, software can use an MFENCE instruction to + // insure that previous stores are included in the write-back. + asm volatile("mfence"); + while (*vaddrs) { + asm volatile("clflush (%0)" : : "r" (*vaddrs++)); + } + asm volatile("mfence"); +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + while (*vaddrs) { + FastFlush(*vaddrs++); + } +#else + #warning "Unsupported CPU type: Unable to force cache flushes." +#endif + } + + // Fast flush hint, for use in performance critical code. + // This is bound at compile time, and will not pick up + // any runtime machine configuration info. 
Note that this + // will not guarantee that a flush happens, but will at least + // hint that it should. This is useful for speeding up + // parallel march algorithms. + inline static void FastFlushHint(void *vaddr) { +#ifdef STRESSAPPTEST_CPU_PPC + asm volatile("dcbf 0,%0" : : "r" (vaddr)); +#elif defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) + // From Intel manual: + // CLFLUSH is only ordered by the MFENCE instruction. It is not guaranteed + // to be ordered by any other fencing, serializing or other CLFLUSH + // instruction. For example, software can use an MFENCE instruction to + // insure that previous stores are included in the write-back. + asm volatile("clflush (%0)" : : "r" (vaddr)); +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + FastFlush(vaddr); +#else + #warning "Unsupported CPU type: Unable to force cache flushes." +#endif + } + + // Fast flush, for use in performance critical code. + // This is bound at compile time, and will not pick up + // any runtime machine configuration info. Sync's any + // transactions for ordering FastFlushHints. + inline static void FastFlushSync() { +#ifdef STRESSAPPTEST_CPU_PPC + asm volatile("sync"); +#elif defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) + // Put mfence before and after clflush to make sure: + // 1. The write before the clflush is committed to memory bus; + // 2. The read after the clflush is hitting the memory bus. + // + // From Intel manual: + // CLFLUSH is only ordered by the MFENCE instruction. It is not guaranteed + // to be ordered by any other fencing, serializing or other CLFLUSH + // instruction. For example, software can use an MFENCE instruction to + // insure that previous stores are included in the write-back. asm volatile("mfence"); #elif defined(STRESSAPPTEST_CPU_ARMV7A) - #warning "Unsupported CPU type ARMV7A: Unable to force cache flushes." 
+ // This is a NOP, FastFlushHint() always does a full flush, so there's + // nothing to do for FastFlushSync(). #else #warning "Unsupported CPU type: Unable to force cache flushes." #endif @@ -239,9 +333,6 @@ class OsLayer { // Handle to platform-specific error diagnoser. ErrorDiag *error_diagnoser_; - // Detect all PCI Devices. - virtual PCIDevices GetPCIDevices(); - // Disambiguate between different "warm" memcopies. virtual bool AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem, unsigned int size_in_bytes, @@ -258,16 +349,27 @@ class OsLayer { } ErrCallback get_err_log_callback() { return err_log_callback_; } + // Set a clock object that can be overridden for use with unit tests. + void SetClock(Clock *clock) { + if (clock_) { + delete clock_; + } + clock_ = clock; + time_initialized_ = clock_->Now(); + } + protected: void *testmem_; // Location of test memory. uint64 testmemsize_; // Size of test memory. int64 totalmemsize_; // Size of available memory. int64 min_hugepages_bytes_; // Minimum hugepages size. + int64 reserve_mb_; // Minimum amount of memory to reserve in MB. bool error_injection_; // Do error injection? bool normal_mem_; // Memory DMA capable? bool use_hugepages_; // Use hugepage shmem? bool use_posix_shm_; // Use 4k page shmem? bool dynamic_mapped_shmem_; // Conserve virtual address space. + bool mmapped_allocation_; // Was memory allocated using mmap()? int shmid_; // Handle to shmem vector< vector > *channels_; // Memory module names per channel. uint64 channel_hash_; // Mask of address bits XORed for channel. @@ -291,9 +393,6 @@ class OsLayer { // Get file descriptor for dev msr. virtual int OpenMSR(uint32 core, uint32 address); - // Auxiliary methods for PCI device configuration - int PCIGetValue(string name, string object); - int PCIGetResources(string name, PCIDevice *device); // Look up how many hugepages there are. virtual int64 FindHugePages(); @@ -301,6 +400,9 @@ class OsLayer { // Link to find last transaction at an error location. 
ErrCallback err_log_callback_; + // Object to wrap the time function. + Clock *clock_; + private: DISALLOW_COPY_AND_ASSIGN(OsLayer); }; diff --git a/src/sat.cc b/src/sat.cc index 4f4e684..57fd4fe 100644 --- a/src/sat.cc +++ b/src/sat.cc @@ -125,6 +125,26 @@ bool Sat::CheckEnvironment() { #error Build system regression - COPTS disregarded. #endif + // Check if the cpu frequency test is enabled and able to run. + if (cpu_freq_test_) { + if (!CpuFreqThread::CanRun()) { + logprintf(0, "Process Error: This platform does not support this " + "test.\n"); + bad_status(); + return false; + } else if (cpu_freq_threshold_ <= 0) { + logprintf(0, "Process Error: The cpu frequency test requires " + "--cpu_freq_threshold set to a value > 0\n"); + bad_status(); + return false; + } else if (cpu_freq_round_ < 0) { + logprintf(0, "Process Error: The --cpu_freq_round option must be greater" + " than or equal to zero. A value of zero means no rounding.\n"); + bad_status(); + return false; + } + } + // Use all CPUs if nothing is specified. if (memory_threads_ == -1) { memory_threads_ = os_->num_cpus(); @@ -491,12 +511,6 @@ bool Sat::InitializePages() { if (GetValid(&pe, kInvalidTag)) { int64 paddr = os_->VirtualToPhysical(pe.addr); int32 region = os_->FindRegion(paddr); - - if (i < 256) { - char buf[256]; - os_->FindDimm(paddr, buf, sizeof(buf)); - logprintf(12, "Log: address: %#llx, %s\n", paddr, buf); - } region_[region]++; pe.paddr = paddr; pe.tag = 1 << region; @@ -554,6 +568,7 @@ bool Sat::Initialize() { // Initializes sync'd log file to ensure output is saved. 
if (!InitializeLogfile()) return false; + Logger::GlobalLogger()->SetTimestampLogging(log_timestamps_); Logger::GlobalLogger()->StartThread(); logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str()); @@ -572,6 +587,10 @@ bool Sat::Initialize() { if (min_hugepages_mbytes_ > 0) os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte); + + if (reserve_mb_ > 0) + os_->SetReserveSize(reserve_mb_); + if (channels_.size() > 0) { logprintf(6, "Log: Decoding memory: %dx%d bit channels," "%d modules per channel (x%d), decoding hash 0x%x\n", @@ -647,6 +666,7 @@ Sat::Sat() { pages_ = 0; size_mb_ = 0; size_ = size_mb_ * kMegabyte; + reserve_mb_ = 0; min_hugepages_mbytes_ = 0; freepages_ = 0; paddr_base_ = 0; @@ -661,6 +681,7 @@ Sat::Sat() { run_on_anything_ = 0; use_logfile_ = 0; logfile_ = 0; + log_timestamps_ = true; // Detect 32/64 bit binary. void *pvoid = 0; address_mode_ = sizeof(pvoid) * 8; @@ -678,9 +699,15 @@ Sat::Sat() { // Cache coherency data initialization. cc_test_ = false; // Flag to trigger cc threads. cc_cacheline_count_ = 2; // Two datastructures of cache line size. + cc_cacheline_size_ = 0; // Size of a cacheline (0 for auto-detect). cc_inc_count_ = 1000; // Number of times to increment the shared variable. cc_cacheline_data_ = 0; // Cache Line size datastructure. + // Cpu frequency data initialization. + cpu_freq_test_ = false; // Flag to trigger cpu frequency thread. + cpu_freq_threshold_ = 0; // Threshold, in MHz, at which a cpu fails. + cpu_freq_round_ = 10; // Round the computed frequency to this value. + sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL)); file_threads_ = 0; net_threads_ = 0; @@ -774,6 +801,9 @@ bool Sat::ParseArgs(int argc, char **argv) { // Set number of megabyte to use. ARG_IVALUE("-M", size_mb_); + // Specify the amount of megabytes to be reserved for system. + ARG_IVALUE("--reserve_memory", reserve_mb_); + // Set minimum megabytes of hugepages to require. 
ARG_IVALUE("-H", min_hugepages_mbytes_); @@ -795,8 +825,21 @@ bool Sat::ParseArgs(int argc, char **argv) { // Set number of cache line size datastructures ARG_IVALUE("--cc_line_count", cc_cacheline_count_); + // Override the detected or assumed cache line size. + ARG_IVALUE("--cc_line_size", cc_cacheline_size_); + // Flag set when cache coherency tests need to be run - ARG_KVALUE("--cc_test", cc_test_, 1); + ARG_KVALUE("--cc_test", cc_test_, true); + + // Set when the cpu_frequency test needs to be run + ARG_KVALUE("--cpu_freq_test", cpu_freq_test_, true); + + // Set the threshold in MHz at which the cpu frequency test will fail. + ARG_IVALUE("--cpu_freq_threshold", cpu_freq_threshold_); + + // Set the rounding value for the cpu frequency test. The default is to + // round to the nearest 10s value. + ARG_IVALUE("--cpu_freq_round", cpu_freq_round_); // Set number of CPU stress threads. ARG_IVALUE("-C", cpu_stress_threads_); @@ -807,6 +850,9 @@ bool Sat::ParseArgs(int argc, char **argv) { // Verbosity level. ARG_IVALUE("-v", verbosity_); + // Turn off timestamps logging. + ARG_KVALUE("--no_timestamps", log_timestamps_, false); + // Set maximum number of errors to collect. Stop running after this many. 
ARG_IVALUE("--max_errors", max_errorcount_); @@ -1004,7 +1050,7 @@ bool Sat::ParseArgs(int argc, char **argv) { for (uint i = 0; i < channels_.size(); i++) if (channels_[i].size() != channels_[0].size()) { logprintf(6, "Process Error: " - "Channels 0 and %d have a different count of dram modules.\n",i); + "Channels 0 and %d have a different count of dram modules.\n", i); bad_status(); return false; } @@ -1043,6 +1089,8 @@ bool Sat::ParseArgs(int argc, char **argv) { void Sat::PrintHelp() { printf("Usage: ./sat(32|64) [options]\n" " -M mbytes megabytes of ram to test\n" + " --reserve_memory If not using hugepages, the amount of memory to " + " reserve for the system\n" " -H mbytes minimum megabytes of hugepages to require\n" " -s seconds number of seconds to run\n" " -m threads number of memory copy threads to run\n" @@ -1054,6 +1102,7 @@ void Sat::PrintHelp() { " -f filename add a disk thread with " "tempfile 'filename'\n" " -l logfile log output to file 'logfile'\n" + " --no_timestamps do not prefix timestamps to log messages\n" " --max_errors n exit early after finding 'n' errors\n" " -v level verbosity (0-20), default is 8\n" " -W Use more CPU-stressful memory copy\n" @@ -1091,6 +1140,13 @@ void Sat::PrintHelp() { "cacheline's member\n" " --cc_line_count number of cache line sized datastructures " "to allocate for the cache coherency threads to operate\n" + " --cc_line_size override the auto-detected cache line size\n" + " --cpu_freq_test enable the cpu frequency test (requires the " + "--cpu_freq_threshold argument to be set)\n" + " --cpu_freq_threshold fail the cpu frequency test if the frequency " + "goes below this value (specified in MHz)\n" + " --cpu_freq_round round the computed frequency to this value, if set" + " to zero, only round to the nearest MHz\n" " --paddr_base allocate memory starting from this address\n" " --pause_delay delay (in seconds) between power spikes\n" " --pause_duration duration (in seconds) of each pause\n" @@ -1098,12 +1154,12 @@ 
void Sat::PrintHelp() { "each CPU to be tested by that CPU\n" " --remote_numa choose memory regions not associated with " "each CPU to be tested by that CPU\n" - " --channel_hash mask of address bits XORed to determine channel.\n" - " Mask 0x40 interleaves cachelines between channels\n" + " --channel_hash mask of address bits XORed to determine channel. " + "Mask 0x40 interleaves cachelines between channels\n" " --channel_width bits width in bits of each memory channel\n" - " --memory_channel u1,u2 defines a comma-separated list of names\n" - " for dram packages in a memory channel.\n" - " Use multiple times to define multiple channels.\n"); + " --memory_channel u1,u2 defines a comma-separated list of names " + "for dram packages in a memory channel. Use multiple times to " + "define multiple channels.\n"); } bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) { @@ -1348,32 +1404,45 @@ void Sat::InitializeThreads() { sizeof(cc_cacheline_data) * cc_cacheline_count_); int num_cpus = CpuCount(); + char *num; + // Calculate the number of cache lines needed just to give each core + // its own counter. + int line_size = cc_cacheline_size_; + if (line_size <= 0) { + line_size = CacheLineSize(); + if (line_size < kCacheLineSize) + line_size = kCacheLineSize; + logprintf(12, "Log: Using %d as cache line size\n", line_size); + } + // The number of cache lines needed to hold an array of num_cpus. + // "num" must be the same type as cc_cacheline_data[X].num or the memory + // size calculations will fail. + int needed_lines = (sizeof(*num) * num_cpus + line_size - 1) / line_size; // Allocate all the nums once so that we get a single chunk // of contiguous memory. 
- int *num; #ifdef HAVE_POSIX_MEMALIGN int err_result = posix_memalign( reinterpret_cast(&num), - kCacheLineSize, sizeof(*num) * num_cpus * cc_cacheline_count_); + line_size, line_size * needed_lines * cc_cacheline_count_); #else - num = reinterpret_cast(memalign(kCacheLineSize, - sizeof(*num) * num_cpus * cc_cacheline_count_)); + num = reinterpret_cast(memalign( + line_size, line_size * needed_lines * cc_cacheline_count_)); int err_result = (num == 0); #endif sat_assert(err_result == 0); int cline; for (cline = 0; cline < cc_cacheline_count_; cline++) { - memset(num, 0, sizeof(num_cpus) * num_cpus); + memset(num, 0, sizeof(*num) * num_cpus); cc_cacheline_data_[cline].num = num; - num += num_cpus; + num += (line_size * needed_lines) / sizeof(*num); } int tnum; for (tnum = 0; tnum < num_cpus; tnum++) { CpuCacheCoherencyThread *thread = new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_, - tnum, cc_inc_count_); + tnum, num_cpus, cc_inc_count_); thread->InitThread(total_threads_++, this, os_, patternlist_, &continuous_status_); // Pin the thread to a particular core. @@ -1384,6 +1453,22 @@ void Sat::InitializeThreads() { } workers_map_.insert(make_pair(kCCType, cc_vector)); } + + if (cpu_freq_test_) { + // Create the frequency test thread. + logprintf(5, "Log: Running cpu frequency test: threshold set to %dMHz.\n", + cpu_freq_threshold_); + CpuFreqThread *thread = new CpuFreqThread(CpuCount(), cpu_freq_threshold_, + cpu_freq_round_); + // This thread should be paused when other threads are paused. 
+ thread->InitThread(total_threads_++, this, os_, NULL, + &power_spike_status_); + + WorkerVector *cpu_freq_vector = new WorkerVector(); + cpu_freq_vector->insert(cpu_freq_vector->end(), thread); + workers_map_.insert(make_pair(kCPUFreqType, cpu_freq_vector)); + } + ReleaseWorkerLock(); } @@ -1392,6 +1477,19 @@ int Sat::CpuCount() { return sysconf(_SC_NPROCESSORS_CONF); } +// Return the worst case (largest) cache line size of the various levels of +// cache actually present in the machine. +int Sat::CacheLineSize() { + int max_linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + int linesize = sysconf(_SC_LEVEL2_CACHE_LINESIZE); + if (linesize > max_linesize) max_linesize = linesize; + linesize = sysconf(_SC_LEVEL3_CACHE_LINESIZE); + if (linesize > max_linesize) max_linesize = linesize; + linesize = sysconf(_SC_LEVEL4_CACHE_LINESIZE); + if (linesize > max_linesize) max_linesize = linesize; + return max_linesize; +} + // Notify and reap worker threads. void Sat::JoinThreads() { logprintf(12, "Log: Joining worker threads\n"); @@ -1974,3 +2072,9 @@ void logprintf(int priority, const char *format, ...) { Logger::GlobalLogger()->VLogF(priority, format, args); va_end(args); } + +// Stop the logging thread and verify any pending data is written to the log. +void logstop() { + Logger::GlobalLogger()->StopThread(); +} + diff --git a/src/sat.h b/src/sat.h index 93d6b34..92396d8 100644 --- a/src/sat.h +++ b/src/sat.h @@ -134,6 +134,8 @@ class Sat { // Return the number of cpus in the system. int CpuCount(); + // Return the worst-case (largest) cache line size of the system. + int CacheLineSize(); // Collect error counts from threads. int64 GetTotalErrorCount(); @@ -147,13 +149,15 @@ class Sat { int64 pages_; // Number of memory blocks. int64 size_; // Size of memory tested, in bytes. int64 size_mb_; // Size of memory tested, in MB. + int64 reserve_mb_; // Reserve at least this amount of memory + // for the system, in MB. int64 min_hugepages_mbytes_; // Minimum hugepages size. 
int64 freepages_; // How many invalid pages we need. int disk_pages_; // Number of pages per temp file. uint64 paddr_base_; // Physical address base. - vector< vector > channels_; // Memory module names per channel. uint64 channel_hash_; // Mask of address bits XORed for channel. int channel_width_; // Channel width in bits. + vector< vector > channels_; // Memory module names per channel. // Control flags. volatile sig_atomic_t user_break_; // User has signalled early exit. Used as @@ -172,6 +176,7 @@ class Sat { int use_logfile_; // Log to a file. char logfilename_[255]; // Name of file to log to. int logfile_; // File handle to log to. + bool log_timestamps_; // Whether to add timestamps to log lines. // Disk thread options. int read_block_size_; // Size of block to read from disk. @@ -202,9 +207,18 @@ class Sat { bool cc_test_; // Flag to decide whether to start the // cache coherency threads. int cc_cacheline_count_; // Number of cache line size structures. + int cc_cacheline_size_; // Size of a cache line. int cc_inc_count_; // Number of times to increment the shared // cache lines structure members. + // Cpu Frequency Options. + bool cpu_freq_test_; // Flag to decide whether to start the + // cpu frequency thread. + int cpu_freq_threshold_; // The MHz threshold which will cause + // the test to fail. + int cpu_freq_round_; // Round the computed frequency to this + // value. + // Thread control. int file_threads_; // Threads of file IO. int net_threads_; // Threads of network IO. @@ -252,7 +266,8 @@ class Sat { kRandomDiskType = 7, kCPUType = 8, kErrorType = 9, - kCCType = 10 + kCCType = 10, + kCPUFreqType = 11, }; // Helper functions. 
diff --git a/src/sattypes.h b/src/sattypes.h index c9341d0..e51db31 100644 --- a/src/sattypes.h +++ b/src/sattypes.h @@ -27,11 +27,11 @@ #ifdef HAVE_CONFIG_H // Built using autoconf #ifdef __ANDROID__ -#include "stressapptest_config_android.h" +#include "stressapptest_config_android.h" // NOLINT #else -#include "stressapptest_config.h" -using namespace __gnu_cxx; -#endif +#include "stressapptest_config.h" // NOLINT +using namespace __gnu_cxx; //NOLINT +#endif // __ANDROID__ using namespace std; typedef signed long long int64; @@ -57,10 +57,10 @@ inline const char* BuildChangelist() { } static const bool kOpenSource = true; -#else +#else // !HAVE_CONFIG_H static const bool kOpenSource = false; - #include "googlesattypes.h" -#endif + #include "googlesattypes.h" // NOLINT +#endif // HAVE_CONFIG_H // Workaround to allow 32/64 bit conversion // without running into strict aliasing problems. union datacast_t { @@ -75,11 +75,15 @@ union datacast_t { // File sync'd print to console and log void logprintf(int priority, const char *format, ...); +// Stop the log and dump any queued lines. +void logstop(); + // We print to stderr ourselves first in case we're in such a bad state that the // logger can't work. #define sat_assert(x) \ {\ if (!(x)) {\ + logstop();\ fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\ logprintf(0, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\ exit(1);\ @@ -186,6 +190,46 @@ inline string ErrorString(int error_num) { #endif } +// Execute the cpuid instruction and pass back the contents of the registers. +// This only works on x86 based platforms. 
+inline void cpuid( + unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { + *ebx = 0; + *ecx = 0; + *edx = 0; + // CPUID features documented at: + // http://www.sandpile.org/ia32/cpuid.htm +#if defined(STRESSAPPTEST_CPU_I686) || defined(STRESSAPPTEST_CPU_X86_64) +#if defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686) + // In PIC compilations using the i686 cpu type, ebx contains the address + // of the global offset table. The compiler can't properly handle constraints + // using the ebx register for this compile, so preserve the register + // ourselves. + asm( + "mov %%ebx, %%edi;" + "cpuid;" + "xchg %%edi, %%ebx;" + // Output registers. + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) + // Input registers. + : "a" (*eax) + ); // Asm +#else + asm( + "cpuid;" + // Output registers. + : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) + // Input registers. + : "a" (*eax) + ); // Asm +#endif // defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686) +#elif defined(STRESSAPPTEST_CPU_PPC) + return; +#else +#warning "Unsupported CPU type." +#endif +} + // Define handy constants here static const int kTicksPerSec = 100; static const int kMegabyte = (1024LL*1024LL); diff --git a/src/worker.cc b/src/worker.cc index d24b5cd..dcffd4e 100644 --- a/src/worker.cc +++ b/src/worker.cc @@ -78,31 +78,6 @@ _syscall3(int, sched_setaffinity, pid_t, pid, #endif namespace { - // Get HW core ID from cpuid instruction. - inline int apicid(void) { - int cpu; -#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) - __asm__ __volatile__ ( -# if defined(STRESSAPPTEST_CPU_I686) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;" - : "=S" (cpu) -# else - "cpuid;" - : "=b" (cpu) -# endif - : "a" (1) : "cx", "dx"); -#elif defined(STRESSAPPTEST_CPU_ARMV7A) - #warning "Unsupported CPU type ARMV7A: unable to determine core ID." - cpu = 0; -#else - #warning "Unsupported CPU type: unable to determine core ID." 
- cpu = 0; -#endif - return (cpu >> 24); - } - // Work around the sad fact that there are two (gnu, xsi) incompatible // versions of strerror_r floating around google. Awesome. bool sat_strerror(int err, char *buf, int len) { @@ -124,7 +99,7 @@ namespace { inline uint64 addr_to_tag(void *address) { return reinterpret_cast(address); } -} +} // namespace #if !defined(O_DIRECT) // Sometimes this isn't available. @@ -183,10 +158,13 @@ void WorkerStatus::StopWorkers() { WaitOnPauseBarrier(); } -bool WorkerStatus::ContinueRunning() { +bool WorkerStatus::ContinueRunning(bool *paused) { // This loop is an optimization. We use it to immediately re-check the status // after resuming from a pause, instead of returning and waiting for the next // call to this function. + if (paused) { + *paused = false; + } for (;;) { switch (GetStatus()) { case RUN: @@ -197,6 +175,10 @@ bool WorkerStatus::ContinueRunning() { WaitOnPauseBarrier(); // Wait for ResumeWorkers() to be called. WaitOnPauseBarrier(); + // Indicate that a pause occurred. + if (paused) { + *paused = true; + } break; case STOP: return false; @@ -325,8 +307,8 @@ bool WorkerThread::InitPriority() { logprintf(11, "Log: Bind to %s failed.\n", cpuset_format(&cpu_mask_).c_str()); - logprintf(11, "Log: Thread %d running on apic ID %d mask %s (%s).\n", - thread_num_, apicid(), + logprintf(11, "Log: Thread %d running on core ID %d mask %s (%s).\n", + thread_num_, sched_getcpu(), CurrentCpusFormat().c_str(), cpuset_format(&cpu_mask_).c_str()); #if 0 @@ -590,7 +572,7 @@ void WorkerThread::ProcessError(struct ErrorRecord *error, const char *message) { char dimm_string[256] = ""; - int apic_id = apicid(); + int core_id = sched_getcpu(); // Determine if this is a write or read error. 
os_->Flush(error->vaddr); @@ -625,7 +607,7 @@ void WorkerThread::ProcessError(struct ErrorRecord *error, "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): " "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n", message, - apic_id, + core_id, CurrentCpusFormat().c_str(), error->vaddr, error->paddr, @@ -825,6 +807,9 @@ int WorkerThread::CheckRegion(void *addr, if ((state == kGoodAgain) || (state == kBad)) { unsigned int blockerrors = badend - badstart + 1; errormessage = "Block Error"; + // It's okay for the 1st entry to be corrected multiple times, + // it will simply be reported twice. Once here and once below + // when processing the error queue. ProcessError(&recorded[0], 0, errormessage.c_str()); logprintf(0, "Block Error: (%p) pattern %s instead of %s, " "%d bytes from offset 0x%x to 0x%x\n", @@ -833,8 +818,6 @@ int WorkerThread::CheckRegion(void *addr, blockerrors * wordsize_, offset + badstart * wordsize_, offset + badend * wordsize_); - errorcount_ += blockerrors; - return blockerrors; } } } @@ -850,7 +833,6 @@ int WorkerThread::CheckRegion(void *addr, if (page_error) { // For each word in the data region. - int error_recount = 0; for (int i = 0; i < length / wordsize_; i++) { uint64 actual = memblock[i]; uint64 expected; @@ -869,21 +851,16 @@ int WorkerThread::CheckRegion(void *addr, // If the value is incorrect, save an error record for later printing. if (actual != expected) { - if (error_recount < kErrorLimit) { - // We already reported these. - error_recount++; - } else { - // If we have overflowed the error queue, print the errors now. - struct ErrorRecord er; - er.actual = actual; - er.expected = expected; - er.vaddr = &memblock[i]; - - // Do the error printout. This will take a long time and - // likely change the machine state. - ProcessError(&er, 12, errormessage.c_str()); - overflowerrors++; - } + // If we have overflowed the error queue, print the errors now. 
+ struct ErrorRecord er; + er.actual = actual; + er.expected = expected; + er.vaddr = &memblock[i]; + + // Do the error printout. This will take a long time and + // likely change the machine state. + ProcessError(&er, 12, errormessage.c_str()); + overflowerrors++; } } } @@ -958,7 +935,7 @@ void WorkerThread::ProcessTagError(struct ErrorRecord *error, char tag_dimm_string[256] = ""; bool read_error = false; - int apic_id = apicid(); + int core_id = sched_getcpu(); // Determine if this is a write or read error. os_->Flush(error->vaddr); @@ -992,7 +969,7 @@ void WorkerThread::ProcessTagError(struct ErrorRecord *error, error->tagvaddr, error->tagpaddr, tag_dimm_string, read_error ? "read error" : "write error", - apic_id, + core_id, CurrentCpusFormat().c_str(), error->vaddr, error->paddr, @@ -1110,12 +1087,18 @@ bool WorkerThread::AdlerAddrMemcpyWarm(uint64 *dstmem64, AdlerChecksum ignored_checksum; os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum); - // Force cache flush. - int length = size_in_bytes / sizeof(*dstmem64); - for (int i = 0; i < length; i += sizeof(*dstmem64)) { - os_->FastFlush(dstmem64 + i); - os_->FastFlush(srcmem64 + i); + // Force cache flush of both the source and destination addresses. + // length - length of block to flush in cachelines. + // mem_increment - number of dstmem/srcmem values per cacheline. + int length = size_in_bytes / kCacheLineSize; + int mem_increment = kCacheLineSize / sizeof(*dstmem64); + OsLayer::FastFlushSync(); + for (int i = 0; i < length; ++i) { + OsLayer::FastFlushHint(dstmem64 + (i * mem_increment)); + OsLayer::FastFlushHint(srcmem64 + (i * mem_increment)); } + OsLayer::FastFlushSync(); + // Check results. AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe); // Patch up address tags. 
@@ -1246,11 +1229,11 @@ int WorkerThread::CrcCopyPage(struct page_entry *dstpe, blocksize, currentblock * blocksize, 0); if (errorcount == 0) { - int apic_id = apicid(); + int core_id = sched_getcpu(); logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage " "CRC mismatch %s != %s, " "but no miscompares found on second pass.\n", - apic_id, CurrentCpusFormat().c_str(), + core_id, CurrentCpusFormat().c_str(), crc.ToHexString().c_str(), expectedcrc->ToHexString().c_str()); struct ErrorRecord er; @@ -1390,11 +1373,11 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe, blocksize, currentblock * blocksize, 0); if (errorcount == 0) { - int apic_id = apicid(); + int core_id = sched_getcpu(); logprintf(0, "Process Error: CPU %d(0x%s) CrciWarmCopyPage " "CRC mismatch %s != %s, " "but no miscompares found on second pass.\n", - apic_id, CurrentCpusFormat().c_str(), + core_id, CurrentCpusFormat().c_str(), crc.ToHexString().c_str(), expectedcrc->ToHexString().c_str()); struct ErrorRecord er; @@ -1610,12 +1593,11 @@ void FileThread::SetFile(const char *filename_init) { // Open the file for access. 
bool FileThread::OpenFile(int *pfile) { - bool no_O_DIRECT = false; int flags = O_RDWR | O_CREAT | O_SYNC; int fd = open(filename_.c_str(), flags | O_DIRECT, 0644); if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) { - no_O_DIRECT = true; - fd = open(filename_.c_str(), flags, 0644); // Try without O_DIRECT + fd = open(filename_.c_str(), flags, 0644); // Try without O_DIRECT + os_->ActivateFlushPageCache(); // Not using O_DIRECT fixed EINVAL } if (fd < 0) { logprintf(0, "Process Error: Failed to create file %s!!\n", @@ -1623,8 +1605,6 @@ bool FileThread::OpenFile(int *pfile) { pages_copied_ = 0; return false; } - if (no_O_DIRECT) - os_->ActivateFlushPageCache(); // Not using O_DIRECT fixed EINVAL *pfile = fd; return true; } @@ -1695,7 +1675,7 @@ bool FileThread::WritePages(int fd) { if (!result) return false; } - return os_->FlushPageCache(); // If O_DIRECT worked, this will be a NOP. + return os_->FlushPageCache(); // If O_DIRECT worked, this will be a NOP. } // Copy data from file into memory block. @@ -2475,13 +2455,22 @@ bool CpuStressThread::Work() { CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data, int cacheline_count, int thread_num, + int thread_count, int inc_count) { cc_cacheline_data_ = data; cc_cacheline_count_ = cacheline_count; cc_thread_num_ = thread_num; + cc_thread_count_ = thread_count; cc_inc_count_ = inc_count; } +// A very simple psuedorandom generator. Since the random number is based +// on only a few simple logic operations, it can be done quickly in registers +// and the compiler can inline it. +uint64 CpuCacheCoherencyThread::SimpleRandom(uint64 seed) { + return (seed >> 1) ^ (-(seed & 1) & kRandomPolynomial); +} + // Worked thread to test the cache coherency of the CPUs // Return false on fatal sw error. 
bool CpuCacheCoherencyThread::Work() { @@ -2490,7 +2479,19 @@ bool CpuCacheCoherencyThread::Work() { uint64 time_start, time_end; struct timeval tv; + // Use a slightly more robust random number for the initial + // value, so the random sequences from the simple generator will + // be more divergent. +#ifdef HAVE_RAND_R unsigned int seed = static_cast(gettid()); + uint64 r = static_cast(rand_r(&seed)); + r |= static_cast(rand_r(&seed)) << 32; +#else + srand(time(NULL)); + uint64 r = static_cast(rand()); // NOLINT + r |= static_cast(rand()) << 32; // NOLINT +#endif + gettimeofday(&tv, NULL); // Get the timestamp before increments. time_start = tv.tv_sec * 1000000ULL + tv.tv_usec; @@ -2500,14 +2501,19 @@ bool CpuCacheCoherencyThread::Work() { // Choose a datastructure in random and increment the appropriate // member in that according to the offset (which is the same as the // thread number. -#ifdef HAVE_RAND_R - int r = rand_r(&seed); -#else - int r = rand(); -#endif - r = cc_cacheline_count_ * (r / (RAND_MAX + 1.0)); + r = SimpleRandom(r); + int cline_num = r % cc_cacheline_count_; + int offset; + // Reverse the order for odd numbered threads in odd numbered cache + // lines. This is designed for massively multi-core systems where the + // number of cores exceeds the bytes in a cache line, so "distant" cores + // get a chance to exercize cache coherency between them. + if (cline_num & cc_thread_num_ & 1) + offset = (cc_thread_count_ & ~1) - cc_thread_num_; + else + offset = cc_thread_num_; // Increment the member of the randomely selected structure. - (cc_cacheline_data_[r].num[cc_thread_num_])++; + (cc_cacheline_data_[cline_num].num[offset])++; } total_inc += cc_inc_count_; @@ -2516,14 +2522,26 @@ bool CpuCacheCoherencyThread::Work() { // in all the cache line structures for this particular thread. 
int cc_global_num = 0; for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) { - cc_global_num += cc_cacheline_data_[cline_num].num[cc_thread_num_]; + int offset; + // Perform the same offset calculation from above. + if (cline_num & cc_thread_num_ & 1) + offset = (cc_thread_count_ & ~1) - cc_thread_num_; + else + offset = cc_thread_num_; + cc_global_num += cc_cacheline_data_[cline_num].num[offset]; // Reset the cachline member's value for the next run. - cc_cacheline_data_[cline_num].num[cc_thread_num_] = 0; + cc_cacheline_data_[cline_num].num[offset] = 0; } if (sat_->error_injection()) cc_global_num = -1; - if (cc_global_num != cc_inc_count_) { + // Since the count is only stored in a byte, to squeeze more into a + // single cache line, only compare it as a byte. In the event that there + // is something detected, the chance that it would be missed by a single + // thread is 1 in 256. If it affects all cores, that makes the chance + // of it being missed terribly minute. It seems unlikely any failure + // case would be off by more than a small number. + if ((cc_global_num & 0xff) != (cc_inc_count_ & 0xff)) { errorcount_++; logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n", cc_global_num, cc_inc_count_); @@ -2707,20 +2725,17 @@ bool DiskThread::SetParameters(int read_block_size, // Open a device, return false on failure. 
bool DiskThread::OpenDevice(int *pfile) { - bool no_O_DIRECT = false; int flags = O_RDWR | O_SYNC | O_LARGEFILE; int fd = open(device_name_.c_str(), flags | O_DIRECT, 0); if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) { - no_O_DIRECT = true; - fd = open(device_name_.c_str(), flags, 0); // Try without O_DIRECT + fd = open(device_name_.c_str(), flags, 0); // Try without O_DIRECT + os_->ActivateFlushPageCache(); } if (fd < 0) { logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n", device_name_.c_str(), thread_num_); return false; } - if (no_O_DIRECT) - os_->ActivateFlushPageCache(); *pfile = fd; return GetDiskSize(fd); @@ -2876,11 +2891,11 @@ bool DiskThread::DoWork(int fd) { // Block is either initialized by writing, or in nondestructive case, // initialized by being added into the datastructure for later reading. - block->SetBlockAsInitialized(); + block->initialized(); in_flight_sectors_.push(block); } - if (!os_->FlushPageCache()) // If O_DIRECT worked, this will be a NOP. + if (!os_->FlushPageCache()) // If O_DIRECT worked, this will be a NOP. return false; // Verify blocks on disk. @@ -2989,8 +3004,9 @@ bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size, errorcount_++; os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1); - if (event.res < 0) { - switch (event.res) { + int64 result = static_cast(event.res); + if (result < 0) { + switch (result) { case -EIO: logprintf(0, "Hardware Error: Low-level I/O error while doing %s to " "sectors starting at %lld on disk %s (thread %d).\n", @@ -3013,7 +3029,7 @@ bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size, } return true; -#else // !HAVE_LIBAIO_H +#else // !HAVE_LIBAIO_H return false; #endif } @@ -3021,7 +3037,7 @@ bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size, // Write a block to disk. // Return false if the block is not written. 
bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) { - memset(block_buffer_, 0, block->GetSize()); + memset(block_buffer_, 0, block->size()); // Fill block buffer with a pattern struct page_entry pe; @@ -3029,30 +3045,30 @@ bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) { // Even though a valid page could not be obatined, it is not an error // since we can always fill in a pattern directly, albeit slower. unsigned int *memblock = static_cast(block_buffer_); - block->SetPattern(patternlist_->GetRandomPattern()); + block->set_pattern(patternlist_->GetRandomPattern()); logprintf(11, "Log: Warning, using pattern fill fallback in " "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n", device_name_.c_str(), thread_num_); - for (int i = 0; i < block->GetSize()/wordsize_; i++) { - memblock[i] = block->GetPattern()->pattern(i); + for (unsigned int i = 0; i < block->size()/wordsize_; i++) { + memblock[i] = block->pattern()->pattern(i); } } else { - memcpy(block_buffer_, pe.addr, block->GetSize()); - block->SetPattern(pe.pattern); + memcpy(block_buffer_, pe.addr, block->size()); + block->set_pattern(pe.pattern); sat_->PutValid(&pe); } logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s" " (thread %d).\n", - block->GetSize()/kSectorSize, block->GetAddress(), + block->size()/kSectorSize, block->address(), device_name_.c_str(), thread_num_); int64 start_time = GetTime(); - if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->GetSize(), - block->GetAddress() * kSectorSize, write_timeout_)) { + if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->size(), + block->address() * kSectorSize, write_timeout_)) { return false; } @@ -3073,11 +3089,11 @@ bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) { // Return true if the block was read, also increment errorcount // if the block had data errors or performance problems. 
bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) { - int64 blocks = block->GetSize() / read_block_size_; + int64 blocks = block->size() / read_block_size_; int64 bytes_read = 0; int64 current_blocks; int64 current_bytes; - uint64 address = block->GetAddress(); + uint64 address = block->address(); logprintf(20, "Log: Reading sectors starting at %lld on disk %s " "(thread %d).\n", @@ -3129,7 +3145,7 @@ bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) { // In non-destructive mode, don't compare the block to the pattern since // the block was never written to disk in the first place. if (!non_destructive_) { - if (CheckRegion(block_buffer_, block->GetPattern(), current_bytes, + if (CheckRegion(block_buffer_, block->pattern(), current_bytes, 0, bytes_read)) { os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1); errorcount_ += 1; @@ -3166,7 +3182,7 @@ bool DiskThread::Work() { // when using direct IO. #ifdef HAVE_POSIX_MEMALIGN int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment, - sat_->page_length()); + sat_->page_length()); #else block_buffer_ = memalign(kBufferAlignment, sat_->page_length()); int memalign_result = (block_buffer_ == 0); @@ -3410,3 +3426,224 @@ bool MemoryRegionThread::Work() { "pages checked\n", thread_num_, status_, pages_copied_); return result; } + +// The list of MSRs to read from each cpu. +const CpuFreqThread::CpuRegisterType CpuFreqThread::kCpuRegisters[] = { + { kMsrTscAddr, "TSC" }, + { kMsrAperfAddr, "APERF" }, + { kMsrMperfAddr, "MPERF" }, +}; + +CpuFreqThread::CpuFreqThread(int num_cpus, int freq_threshold, int round) + : num_cpus_(num_cpus), + freq_threshold_(freq_threshold), + round_(round) { + sat_assert(round >= 0); + if (round == 0) { + // If rounding is off, force rounding to the nearest MHz. 
+ round_ = 1; + round_value_ = 0.5; + } else { + round_value_ = round/2.0; + } +} + +CpuFreqThread::~CpuFreqThread() { +} + +// Compute the difference between the currently read MSR values and the +// previously read values and store the results in delta. If any of the +// values did not increase, or the TSC value is too small, returns false. +// Otherwise, returns true. +bool CpuFreqThread::ComputeDelta(CpuDataType *current, CpuDataType *previous, + CpuDataType *delta) { + // Loop through the msrs. + for (int msr = 0; msr < kMsrLast; msr++) { + if (previous->msrs[msr] > current->msrs[msr]) { + logprintf(0, "Log: Register %s went backwards 0x%llx to 0x%llx " + "skipping interval\n", kCpuRegisters[msr], previous->msrs[msr], + current->msrs[msr]); + return false; + } else { + delta->msrs[msr] = current->msrs[msr] - previous->msrs[msr]; + } + } + + // Check for TSC < 1 Mcycles over interval. + if (delta->msrs[kMsrTsc] < (1000 * 1000)) { + logprintf(0, "Log: Insanely slow TSC rate, TSC stops in idle?\n"); + return false; + } + timersub(¤t->tv, &previous->tv, &delta->tv); + + return true; +} + +// Compute the change in values of the MSRs between current and previous, +// set the frequency in MHz of the cpu. If there is an error computing +// the delta, return false. Othewise, return true. +bool CpuFreqThread::ComputeFrequency(CpuDataType *current, + CpuDataType *previous, int *freq) { + CpuDataType delta; + if (!ComputeDelta(current, previous, &delta)) { + return false; + } + + double interval = delta.tv.tv_sec + delta.tv.tv_usec / 1000000.0; + double frequency = 1.0 * delta.msrs[kMsrTsc] / 1000000 + * delta.msrs[kMsrAperf] / delta.msrs[kMsrMperf] / interval; + + // Use the rounding value to round up properly. + int computed = static_cast(frequency + round_value_); + *freq = computed - (computed % round_); + return true; +} + +// This is the task function that the thread executes. 
+bool CpuFreqThread::Work() { + cpu_set_t cpuset; + if (!AvailableCpus(&cpuset)) { + logprintf(0, "Process Error: Cannot get information about the cpus.\n"); + return false; + } + + // Start off indicating the test is passing. + status_ = true; + + int curr = 0; + int prev = 1; + uint32 num_intervals = 0; + bool paused = false; + bool valid; + bool pass = true; + + vector data[2]; + data[0].resize(num_cpus_); + data[1].resize(num_cpus_); + while (IsReadyToRun(&paused)) { + if (paused) { + // Reset the intervals and restart logic after the pause. + num_intervals = 0; + } + if (num_intervals == 0) { + // If this is the first interval, then always wait a bit before + // starting to collect data. + sat_sleep(kStartupDelay); + } + + // Get the per cpu counters. + valid = true; + for (int cpu = 0; cpu < num_cpus_; cpu++) { + if (CPU_ISSET(cpu, &cpuset)) { + if (!GetMsrs(cpu, &data[curr][cpu])) { + logprintf(0, "Failed to get msrs on cpu %d.\n", cpu); + valid = false; + break; + } + } + } + if (!valid) { + // Reset the number of collected intervals since something bad happened. + num_intervals = 0; + continue; + } + + num_intervals++; + + // Only compute a delta when we have at least two intervals worth of data. + if (num_intervals > 2) { + for (int cpu = 0; cpu < num_cpus_; cpu++) { + if (CPU_ISSET(cpu, &cpuset)) { + int freq; + if (!ComputeFrequency(&data[curr][cpu], &data[prev][cpu], + &freq)) { + // Reset the number of collected intervals since an unknown + // error occurred. + logprintf(0, "Log: Cannot get frequency of cpu %d.\n", cpu); + num_intervals = 0; + break; + } + logprintf(15, "Cpu %d Freq %d\n", cpu, freq); + if (freq < freq_threshold_) { + errorcount_++; + pass = false; + logprintf(0, "Log: Cpu %d frequency is too low, frequency %d MHz " + "threshold %d MHz.\n", cpu, freq, freq_threshold_); + } + } + } + } + + sat_sleep(kIntervalPause); + + // Swap the values in curr and prev (these values flip between 0 and 1). 
+ curr ^= 1; + prev ^= 1; + } + + return pass; +} + + +// Get the MSR values for this particular cpu and save them in data. If +// any error is encountered, returns false. Otherwise, returns true. +bool CpuFreqThread::GetMsrs(int cpu, CpuDataType *data) { + for (int msr = 0; msr < kMsrLast; msr++) { + if (!os_->ReadMSR(cpu, kCpuRegisters[msr].msr, &data->msrs[msr])) { + return false; + } + } + // Save the time at which we acquired these values. + gettimeofday(&data->tv, NULL); + + return true; +} + +// Returns true if this test can run on the current machine. Otherwise, +// returns false. +bool CpuFreqThread::CanRun() { +#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) + unsigned int eax, ebx, ecx, edx; + + // Check that the TSC feature is supported. + // This check is valid for both Intel and AMD. + eax = 1; + cpuid(&eax, &ebx, &ecx, &edx); + if (!(edx & (1 << 5))) { + logprintf(0, "Process Error: No TSC support.\n"); + return false; + } + + // Check the highest extended function level supported. + // This check is valid for both Intel and AMD. + eax = 0x80000000; + cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x80000007) { + logprintf(0, "Process Error: No invariant TSC support.\n"); + return false; + } + + // Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8 + // This check is valid for both Intel and AMD. + eax = 0x80000007; + cpuid(&eax, &ebx, &ecx, &edx); + if ((edx & (1 << 8)) == 0) { + logprintf(0, "Process Error: No non-stop TSC support.\n"); + return false; + } + + // APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0 + // This check is valid for both Intel and AMD. 
+ eax = 0x6; + cpuid(&eax, &ebx, &ecx, &edx); + if ((ecx & 1) == 0) { + logprintf(0, "Process Error: No APERF MSR support.\n"); + return false; + } + return true; +#else + logprintf(0, "Process Error: " + "cpu_freq_test is only supported on X86 processors.\n"); + return false; +#endif +} diff --git a/src/worker.h b/src/worker.h index 31e0225..6f9fde7 100644 --- a/src/worker.h +++ b/src/worker.h @@ -44,7 +44,7 @@ // Global Datastruture shared by the Cache Coherency Worker Threads. struct cc_cacheline_data { - int *num; + char *num; }; // Typical usage: @@ -127,10 +127,8 @@ class WorkerStatus { // ResumeWorkers() or StopWorkers() has been called. Number of distinct // calling threads must match the worker count (see AddWorkers() and // RemoveSelf()). - bool ContinueRunning(); + bool ContinueRunning(bool *paused); - // TODO(matthewb): Is this functionality really necessary? Remove it if not. - // // This is a hack! It's like ContinueRunning(), except it won't pause. If // any worker threads use this exclusively in place of ContinueRunning() then // PauseWorkers() should never be used! @@ -304,9 +302,10 @@ class WorkerThread { // do { // // work. // } while (IsReadyToRun()); - virtual bool IsReadyToRun() { return worker_status_->ContinueRunning(); } - // TODO(matthewb): Is this function really necessary? Remove it if not. - // + virtual bool IsReadyToRun(bool *paused = NULL) { + return worker_status_->ContinueRunning(paused); + } + // Like IsReadyToRun(), except it won't pause. 
virtual bool IsReadyToRunNoPause() { return worker_status_->ContinueRunningNoPause(); @@ -641,16 +640,27 @@ class CpuCacheCoherencyThread : public WorkerThread { CpuCacheCoherencyThread(cc_cacheline_data *cc_data, int cc_cacheline_count_, int cc_thread_num_, + int cc_thread_count_, int cc_inc_count_); virtual bool Work(); protected: + // Used by the simple random number generator as a shift feedback; + // this polynomial (x^64 + x^63 + x^61 + x^60 + 1) will produce a + // psuedorandom cycle of period 2^64-1. + static const uint64 kRandomPolynomial = 0xD800000000000000ULL; + // A very simple psuedorandom generator that can be inlined and use + // registers, to keep the CC test loop tight and focused. + static uint64 SimpleRandom(uint64 seed); + cc_cacheline_data *cc_cacheline_data_; // Datstructure for each cacheline. int cc_local_num_; // Local counter for each thread. int cc_cacheline_count_; // Number of cache lines to operate on. int cc_thread_num_; // The integer id of the thread which is // used as an index into the integer array // of the cacheline datastructure. + int cc_thread_count_; // Total number of threads being run, for + // calculations mixing up cache line access. int cc_inc_count_; // Number of times to increment the counter. private: @@ -809,4 +819,80 @@ class MemoryRegionThread : public WorkerThread { DISALLOW_COPY_AND_ASSIGN(MemoryRegionThread); }; +// Worker thread to check that the frequency of every cpu does not go below a +// certain threshold. +class CpuFreqThread : public WorkerThread { + public: + CpuFreqThread(int num_cpus, int freq_threshold, int round); + ~CpuFreqThread(); + + // This is the task function that the thread executes. + virtual bool Work(); + + // Returns true if this test can run on the current machine. Otherwise, + // returns false. + static bool CanRun(); + + private: + static const int kIntervalPause = 10; // The number of seconds to pause + // between acquiring the MSR data. 
+ static const int kStartupDelay = 5; // The number of seconds to wait + // before acquiring MSR data. + static const int kMsrTscAddr = 0x10; // The address of the TSC MSR. + static const int kMsrAperfAddr = 0xE8; // The address of the APERF MSR. + static const int kMsrMperfAddr = 0xE7; // The address of the MPERF MSR. + + // The index values into the CpuDataType.msr[] array. + enum MsrValues { + kMsrTsc = 0, // MSR index 0 = TSC. + kMsrAperf = 1, // MSR index 1 = APERF. + kMsrMperf = 2, // MSR index 2 = MPERF. + kMsrLast, // Last MSR index. + }; + + typedef struct { + uint32 msr; // The address of the MSR. + const char *name; // A human readable string for the MSR. + } CpuRegisterType; + + typedef struct { + uint64 msrs[kMsrLast]; // The values of the MSRs. + struct timeval tv; // The time at which the MSRs were read. + } CpuDataType; + + // The set of MSR addresses and register names. + static const CpuRegisterType kCpuRegisters[kMsrLast]; + + // Compute the change in values of the MSRs between current and previous, + // set the frequency in MHz of the cpu. If there is an error computing + // the delta, return false. Othewise, return true. + bool ComputeFrequency(CpuDataType *current, CpuDataType *previous, + int *frequency); + + // Get the MSR values for this particular cpu and save them in data. If + // any error is encountered, returns false. Otherwise, returns true. + bool GetMsrs(int cpu, CpuDataType *data); + + // Compute the difference between the currently read MSR values and the + // previously read values and store the results in delta. If any of the + // values did not increase, or the TSC value is too small, returns false. + // Otherwise, returns true. + bool ComputeDelta(CpuDataType *current, CpuDataType *previous, + CpuDataType *delta); + + // The total number of cpus on the system. + int num_cpus_; + + // The minimum frequency that each cpu must operate at (in MHz). + int freq_threshold_; + + // The value to round the computed frequency to. 
+ int round_; + + // Precomputed value to add to the frequency to do the rounding. + double round_value_; + + DISALLOW_COPY_AND_ASSIGN(CpuFreqThread); +}; + #endif // STRESSAPPTEST_WORKER_H_ diff --git a/stressapptest.1 b/stressapptest.1 index 695f9ee..2c91478 100644 --- a/stressapptest.1 +++ b/stressapptest.1 @@ -86,9 +86,14 @@ Number of times to increment the cacheline's member. .TP .B \-\-cc_line_count -Mumber of cache line sized datastructures to allocate for the cache coherency +Number of cache line sized datastructures to allocate for the cache coherency threads to operate. +.TP +.B \-\-cc_line_size +Size of cache line to use as the basis for cache coherency test data +structures. + .TP .B \-\-cc_test Do the cache coherency testing.