bin_PROGRAMS = stressapptest
+# findmask is listed under noinst_PROGRAMS: automake builds it with the rest
+# of the tree but does not install it.
+noinst_PROGRAMS = findmask
AM_DEFAULT_SOURCE_EXT=.cc
HFILES += logger.h
stressapptest_SOURCES = $(MAINFILES) $(CFILES) $(HFILES)
+# findmask.inc is #included by findmask.c (it holds the body of the address
+# array); it is listed as a source alongside findmask.c.
+findmask_SOURCES = findmask.c findmask.inc
host_triplet = @host@
target_triplet = @target@
bin_PROGRAMS = stressapptest$(EXEEXT)
+noinst_PROGRAMS = findmask$(EXEEXT)
subdir = src
DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
$(srcdir)/stressapptest_config.h.in
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
am__installdirs = "$(DESTDIR)$(bindir)"
-PROGRAMS = $(bin_PROGRAMS)
+PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS)
+am_findmask_OBJECTS = findmask.$(OBJEXT)
+findmask_OBJECTS = $(am_findmask_OBJECTS)
+findmask_LDADD = $(LDADD)
am__objects_1 = main.$(OBJEXT)
am__objects_2 = os.$(OBJEXT) os_factory.$(OBJEXT) pattern.$(OBJEXT) \
queue.$(OBJEXT) sat.$(OBJEXT) sat_factory.$(OBJEXT) \
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__depfiles_maybe = depfiles
am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
CXXLD = $(CXX)
CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
-o $@
-COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
- $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-CCLD = $(CC)
-LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-SOURCES = $(stressapptest_SOURCES)
-DIST_SOURCES = $(stressapptest_SOURCES)
+SOURCES = $(findmask_SOURCES) $(stressapptest_SOURCES)
+DIST_SOURCES = $(findmask_SOURCES) $(stressapptest_SOURCES)
ETAGS = etags
CTAGS = ctags
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
finelock_queue.h error_diag.h disk_blocks.h adler32memcpy.h \
logger.h
stressapptest_SOURCES = $(MAINFILES) $(CFILES) $(HFILES)
+findmask_SOURCES = findmask.c findmask.inc
all: stressapptest_config.h
$(MAKE) $(AM_MAKEFLAGS) all-am
.SUFFIXES:
-.SUFFIXES: .cc .o .obj
+.SUFFIXES: .c .cc .o .obj
$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
clean-binPROGRAMS:
-test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
+
+clean-noinstPROGRAMS:
+ -test -z "$(noinst_PROGRAMS)" || rm -f $(noinst_PROGRAMS)
+findmask$(EXEEXT): $(findmask_OBJECTS) $(findmask_DEPENDENCIES)
+ @rm -f findmask$(EXEEXT)
+ $(LINK) $(findmask_OBJECTS) $(findmask_LDADD) $(LIBS)
stressapptest$(EXEEXT): $(stressapptest_OBJECTS) $(stressapptest_DEPENDENCIES)
@rm -f stressapptest$(EXEEXT)
$(CXXLINK) $(stressapptest_OBJECTS) $(stressapptest_LDADD) $(LIBS)
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/adler32memcpy.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/disk_blocks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error_diag.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/findmask.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/finelock_queue.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/logger.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sat_factory.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/worker.Po@am__quote@
+.c.o:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'`
+
.cc.o:
@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@echo "it deletes files that may require special tools to rebuild."
clean: clean-am
-clean-am: clean-binPROGRAMS clean-generic mostlyclean-am
+clean-am: clean-binPROGRAMS clean-generic clean-noinstPROGRAMS \
+ mostlyclean-am
distclean: distclean-am
-rm -rf ./$(DEPDIR)
.MAKE: all install-am install-strip
.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \
- clean-generic ctags distclean distclean-compile \
- distclean-generic distclean-hdr distclean-tags distdir dvi \
- dvi-am html html-am info info-am install install-am \
- install-binPROGRAMS install-data install-data-am install-dvi \
- install-dvi-am install-exec install-exec-am install-html \
- install-html-am install-info install-info-am install-man \
- install-pdf install-pdf-am install-ps install-ps-am \
- install-strip installcheck installcheck-am installdirs \
- maintainer-clean maintainer-clean-generic mostlyclean \
- mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \
- tags uninstall uninstall-am uninstall-binPROGRAMS
+ clean-generic clean-noinstPROGRAMS ctags distclean \
+ distclean-compile distclean-generic distclean-hdr \
+ distclean-tags distdir dvi dvi-am html html-am info info-am \
+ install install-am install-binPROGRAMS install-data \
+ install-data-am install-dvi install-dvi-am install-exec \
+ install-exec-am install-html install-html-am install-info \
+ install-info-am install-man install-pdf install-pdf-am \
+ install-ps install-ps-am install-strip installcheck \
+ installcheck-am installdirs maintainer-clean \
+ maintainer-clean-generic mostlyclean mostlyclean-compile \
+ mostlyclean-generic pdf pdf-am ps ps-am tags uninstall \
+ uninstall-am uninstall-binPROGRAMS
# Tell versions [3.59,3.63) of GNU make to not export all variables.
--- /dev/null
+/* Copyright 2013 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * This "tool" can be used to brute force the XOR bitmask that a memory
+ * controller uses to interleave addresses onto its two channels. To use it,
+ * you need to have a bunch of addresses that are known to go to only one
+ * of the memory channels... easiest way to get these is to run stressapptest on
+ * a machine while holding a soldering iron close to the chips of one channel.
+ * Generate about a thousand failures and extract their physical addresses
+ * from the output. Write them to findmask.inc in a way that forms a valid
+ * definition for the addrs array. Make and run on a big machine.
+ *
+ * The program iterates over all possible bitmasks within the first NUM_BITS,
+ * parallelizing execution over NUM_THREADS threads. Every candidate mask is
+ * ANDed with each supplied address, counting how many times the result has
+ * an odd or an even number of set bits. If fewer than NOISE addresses fall on
+ * one of the two sides, it will print that mask to stdout. Note that the
+ * program will always "find" the mask 0x0, and may also report masks such as
+ * 0x100000000 depending on your test machine's memory size... you will need
+ * to use your own judgement to interpret the results.
+ *
+ * As the program might run for a long time, you can send SIGUSR1 to it to
+ * output the last mask that was processed and get a rough idea of the
+ * current progress.
+ */
+
+#include <pthread.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+// A mask is reported when fewer than NOISE addresses land on the minority
+// side of the parity split.
+#define NOISE 20
+// Candidate masks are enumerated from 0 up to (1 << (NUM_BITS + 1)) - 1.
+#define NUM_BITS 32
+// Number of worker threads; also the stride between masks of one worker.
+#define NUM_THREADS 128  // keep this a power of two
+
+// Physical addresses known to belong to a single memory channel.
+static uint64_t addrs[] = {
+#include "findmask.inc"
+};
+// Progress marker: written occasionally by a worker thread and read from the
+// SIGUSR1 handler. Declared volatile so the compiler cannot defer or drop the
+// store inside the optimized search loop.
+// NOTE(review): a 64-bit volatile is not guaranteed to be accessed atomically
+// on every target; this is only used for a rough progress report.
+static volatile uint64_t lastmask;
+
+/*
+ * Worker thread body. 'arg' carries the worker index (0..NUM_THREADS-1) cast
+ * to a pointer. The worker evaluates every candidate mask congruent to its
+ * index modulo NUM_THREADS: for each mask it ANDs the mask onto every entry
+ * of 'addrs' and counts how many results have even vs. odd bit parity. A mask
+ * is printed when fewer than NOISE addresses fall on one of the two sides.
+ * Always returns 0.
+ */
+__attribute__((optimize(3, "unroll-loops")))
+void* thread_func(void* arg) {
+  register uint64_t mask;
+  register uintptr_t num = (uintptr_t)arg;
+
+  for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) {
+    register const uint64_t* cur;
+    register int a = 0;  // addresses counted on one parity side
+    register int b = 0;  // addresses counted on the other parity side
+
+    for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) {
+#ifdef __x86_64__
+      register uint64_t addr asm("rdx") = *cur & mask;
+      register uint32_t tmp asm("ebx");
+
+      // Behold: the dark bit counting magic!
+      // 'addr' is declared as a read-write operand ("+d") because the
+      // sequence below shifts and overwrites rdx; an input-only operand must
+      // not be modified by the asm body.
+      asm (
+        // Fold high and low 32 bits onto each other
+        "MOVl %%edx, %%ebx\n\t"
+        "SHRq $32, %%rdx\n\t"
+        "XORl %%ebx, %%edx\n\t"
+        // Fold high and low 16 bits onto each other
+        "MOVl %%edx, %%ebx\n\t"
+        "SHRl $16, %%edx\n\t"
+        "XORw %%bx, %%dx\n\t"
+        // Fold high and low 8 bits onto each other
+        "XORb %%dh, %%dl\n\t"
+        // Invoke ancient 8086 parity flag (only counts lowest byte)
+        "SETnp %%bl\n\t"
+        "SETp %%dl\n\t"
+        // Stupid SET instruction can only affect the lowest byte...
+        "ANDl $1, %%ebx\n\t"
+        "ANDl $1, %%edx\n\t"
+        // Increment either 'a' or 'b' without needing another branch
+        "ADDl %%ebx, %2\n\t"
+        "ADDl %%edx, %1\n\t"
+        : "=b" (tmp), "+r"(a), "+r"(b), "+d"(addr) : : "cc");
+
+#else  // generic processor
+      register uint64_t addr = *cur & mask;
+      register uint32_t low = (uint32_t)addr;
+      register uint32_t high = (uint32_t)(addr >> 32);
+
+      // Takes about twice as long as the version above... take that GCC!
+      __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++;
+#endif
+
+      // Early abort: probably still the most valuable optimization in here.
+      // Once both sides hold at least NOISE addresses the mask cannot match.
+      if (a >= NOISE && b >= NOISE) break;
+    }
+
+    // Report using the smaller of the two counters as the deviation count.
+    if (a < NOISE) b = a;
+    if (b < NOISE) {
+      // uint64_t is not necessarily 'unsigned long long'; cast to match %llx.
+      printf("Found mask with just %d deviations: 0x%llx\n", b,
+             (unsigned long long)mask);
+      fflush(stdout);
+    }
+
+    // I'm a little paranoid about performance: don't write to memory too often
+    if (!(mask & 0x7ff)) lastmask = mask;
+  }
+
+  return 0;
+}
+
+/*
+ * Signal handler (installed for SIGUSR1 by main): prints the most recently
+ * recorded value of 'lastmask' as a rough progress indicator.
+ */
+void signal_handler(int signum) {
+  (void)signum;  // the handler does not need the signal number
+
+  // NOTE(review): printf/fflush are not async-signal-safe; this is tolerated
+  // here as a best-effort debugging aid.
+  // uint64_t is not necessarily 'unsigned long long'; cast to match %llx.
+  printf("Received signal... currently evaluating mask 0x%llx!\n",
+         (unsigned long long)lastmask);
+  fflush(stdout);
+}
+
+/*
+ * Entry point: installs the SIGUSR1 progress handler, starts NUM_THREADS
+ * workers (each receives its index as the thread argument) and waits for all
+ * of them to finish. Returns 0 on success, 1 if a worker could not be started.
+ */
+int main(int argc, char** argv) {
+  uintptr_t i;
+  pthread_t threads[NUM_THREADS];
+
+  (void)argc;  // command-line arguments are not used
+  (void)argv;
+
+  signal(SIGUSR1, signal_handler);
+
+  for (i = 0; i < NUM_THREADS; i++) {
+    int err = pthread_create(&threads[i], 0, thread_func, (void*)i);
+    if (err != 0) {
+      // A missing worker would silently skip its share of the masks and the
+      // join loop below would touch an uninitialized pthread_t, so give up.
+      // Returning from main ends the process, including any running workers.
+      fprintf(stderr, "Could not create worker thread %lu (error %d)\n",
+              (unsigned long)i, err);
+      return 1;
+    }
+  }
+
+  for (i = 0; i < NUM_THREADS; i++)
+    pthread_join(threads[i], 0);
+
+  return 0;
+}
--- /dev/null
+// This is the body of a uint64_t array definition (the 'addrs' array in findmask.c). Fill in your own addresses.
+0x116bb312c, // example values (can be >32 bit)
+0x38d3c5ad, // replace with your own
+0x77c1e96d // don't forget: no comma after the last one
}
-// Translate physical address to memory module name.
-// Assumes simple round-robin interleaving between memory channels of
-// 'interleave_size_' sized chunks, with repeated 'channel_width_'
+// Translate physical address to memory module/chip name.
+// Assumes interleaving between two memory channels based on the XOR of
+// all address bits in the 'channel_hash' mask, with repeated 'channel_width_'
// blocks with bits distributed from each chip in that channel.
int OsLayer::FindDimm(uint64 addr, char *buf, int len) {
static const string unknown = "DIMM Unknown";
- if (!modules_) {
+ if (!channels_) {
snprintf(buf, len, "%s", unknown.c_str());
return 0;
}
- // Find channel by counting interleave units (typically cachelines),
- // and mod by number of channels.
- vector<string>& channel = (*modules_)[
- (addr / interleave_size_) % modules_->size()];
+ // Find channel by XORing address bits in channel_hash mask.
+ uint32 low = (uint32)(addr & channel_hash_);
+ uint32 high = (uint32)((addr & channel_hash_) >> 32);
+ vector<string>& channel = (*channels_)[
+ __builtin_parity(high) ^ __builtin_parity(low)];
// Find dram chip by finding which byte within the channel
// by address mod channel width, then divide the channel
}
// Set parameters needed to translate physical address to memory module.
+ // 'channel_hash' is stored in channel_hash_ (a uint64 mask of address bits),
+ // so it is taken as uint64 to avoid truncation where uintptr_t is 32 bits.
+ // 'channels' is stored as a pointer, not copied.
- void SetDramMappingParams(int interleave_size, int channel_width,
- vector< vector<string> > *modules) {
- interleave_size_ = interleave_size;
+ void SetDramMappingParams(uint64 channel_hash, int channel_width,
+ vector< vector<string> > *channels) {
+ channel_hash_ = channel_hash;
channel_width_ = channel_width;
- modules_ = modules;
+ channels_ = channels;
}
// Initializes data strctures and open files.
bool use_posix_shm_; // Use 4k page shmem?
bool dynamic_mapped_shmem_; // Conserve virtual address space.
int shmid_; // Handle to shmem
- vector< vector<string> > *modules_; // Memory module names per channel.
- int interleave_size_; // Channel interleaving chunk size.
+ vector< vector<string> > *channels_; // Memory module names per channel.
+ uint64 channel_hash_; // Mask of address bits XORed for channel.
int channel_width_; // Channel width in bits.
int64 regionsize_; // Size of memory "regions"
if (min_hugepages_mbytes_ > 0)
os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte);
- if (modules_.size() > 0) {
+ // Log the configured memory channel layout and hand it to the OS layer.
+ // NOTE(review): assumes logprintf follows printf formatting rules; the
+ // arguments are cast so every value matches its conversion specifier
+ // (size_t for %d and a 64-bit hash for %x were mismatched).
+ if (channels_.size() > 0) {
logprintf(6, "Log: Decoding memory: %dx%d bit channels,"
- " %d byte burst size, %d modules per channel (x%d)\n",
- modules_.size(), channel_width_, interleave_size_, modules_[0].size(),
- channel_width_/modules_[0].size());
- os_->SetDramMappingParams(interleave_size_, channel_width_, &modules_);
+ " %d modules per channel (x%d), decoding hash 0x%llx\n",
+ static_cast<int>(channels_.size()), channel_width_,
+ static_cast<int>(channels_[0].size()),
+ static_cast<int>(channel_width_ / channels_[0].size()),
+ static_cast<unsigned long long>(channel_hash_));
+ os_->SetDramMappingParams(channel_hash_, channel_width_, &channels_);
}
if (!os_->Initialize()) {
min_hugepages_mbytes_ = 0;
freepages_ = 0;
paddr_base_ = 0;
- interleave_size_ = kCacheLineSize;
+ channel_hash_ = kCacheLineSize;
channel_width_ = 64;
user_break_ = false;
continue;
}
- ARG_IVALUE("--interleave_size", interleave_size_);
+ ARG_IVALUE("--channel_hash", channel_hash_);
ARG_IVALUE("--channel_width", channel_width_);
if (!strcmp(argv[i], "--memory_channel")) {
i++;
if (i < argc) {
- char *module = argv[i];
- modules_.push_back(vector<string>());
- while (char* next = strchr(module, ',')) {
- modules_.back().push_back(string(module, next - module));
- module = next + 1;
+ char *channel = argv[i];
+ channels_.push_back(vector<string>());
+ while (char* next = strchr(channel, ',')) {
+ channels_.back().push_back(string(channel, next - channel));
+ channel = next + 1;
}
- modules_.back().push_back(string(module));
+ channels_.back().push_back(string(channel));
}
continue;
}
}
// Validate memory channel parameters if supplied
- if (modules_.size()) {
- if (interleave_size_ <= 0 ||
- interleave_size_ & (interleave_size_ - 1)) {
+ if (channels_.size()) {
+ if (channels_.size() == 1) {
+ channel_hash_ = 0;
+ logprintf(7, "Log: "
+ "Only one memory channel...deactivating interleave decoding.\n");
+ } else if (channels_.size() > 2) {
logprintf(6, "Process Error: "
- "Interleave size %d is not a power of 2.\n", interleave_size_);
+ "Triple-channel mode not yet supported... sorry.\n");
bad_status();
return false;
}
- for (uint i = 0; i < modules_.size(); i++)
- if (modules_[i].size() != modules_[0].size()) {
+ for (uint i = 0; i < channels_.size(); i++)
+ if (channels_[i].size() != channels_[0].size()) {
logprintf(6, "Process Error: "
- "Channels 0 and %d have a different amount of modules.\n",i);
+ "Channels 0 and %d have a different count of dram modules.\n",i);
bad_status();
return false;
}
- if (modules_[0].size() & (modules_[0].size() - 1)) {
+ if (channels_[0].size() & (channels_[0].size() - 1)) {
logprintf(6, "Process Error: "
"Amount of modules per memory channel is not a power of 2.\n");
bad_status();
bad_status();
return false;
}
- if (channel_width_ / modules_[0].size() < 8) {
- logprintf(6, "Process Error: "
- "Chip width x%d must be x8 or greater.\n", channel_width_ / modules_[0].size());
+ if (channel_width_ / channels_[0].size() < 8) {
+ logprintf(6, "Process Error: Chip width x%d must be x8 or greater.\n",
+ channel_width_ / channels_[0].size());
bad_status();
return false;
}
"each CPU to be tested by that CPU\n"
" --remote_numa choose memory regions not associated with "
"each CPU to be tested by that CPU\n"
- " --interleave_size bytes size in bytes of each channel's data as interleaved "
- "between memory channels\n"
+ " --channel_hash mask of address bits XORed to determine channel.\n"
+ " Mask 0x40 interleaves cachelines between channels\n"
" --channel_width bits width in bits of each memory channel\n"
" --memory_channel u1,u2 defines a comma-separated list of names\n"
" for dram packages in a memory channel.\n"
int64 freepages_; // How many invalid pages we need.
int disk_pages_; // Number of pages per temp file.
uint64 paddr_base_; // Physical address base.
- vector< vector<string> > modules_; // Memory module names per channel.
- int interleave_size_; // Channel interleaving chunk size in bytes.
- // Usually cacheline sized.
+ vector< vector<string> > channels_; // Memory module names per channel.
+ uint64 channel_hash_; // Mask of address bits XORed for channel.
int channel_width_; // Channel width in bits.
// Control flags.