1 /* Copyright 2013 Google Inc. All Rights Reserved.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
18 * This "tool" can be used to brute force the XOR bitmask that a memory
19 * controller uses to interleave addresses onto its two channels. To use it,
20 * you need to have a bunch of addresses that are known to go to only one
21 * of the memory channels... easiest way to get these is to run stressapptest on
22 * a machine while holding a soldering iron close to the chips of one channel.
23 * Generate about a thousand failures and extract their physical addresses
24 * from the output. Write them to findmask.inc in a way that forms a valid
25 * definition for the addrs array. Make and run on a big machine.
27 * The program iterates over all possible bitmasks within the first NUM_BITS
28 * bits, parallelizing execution over NUM_THREADS threads. Every candidate mask
29 * is ANDed with each supplied address, counting how many times the result has
30 * an odd or an even number of set bits. If all but NOISE addresses fall on one
31 * side, it will print that mask to stdout. Note that the program will always
32 * "find" the mask 0x0, and may also report masks such as 0x100000000 depending
33 * on your test machine's memory size... you will need to use your own
34 * judgement to interpret the results.
36 * As the program might run for a long time, you can send SIGUSR1 to it to
37 * output the last mask that was processed and get a rough idea of its progress.
// Number of worker threads started by main(); thread_func() also uses it as
// the stride between the candidate masks a single thread examines.
50 #define NUM_THREADS 128 // keep this a power of two
// Table of input addresses that every candidate mask is tested against.
// The initializer list is not written here: it is pulled in textually from
// findmask.inc, which must therefore expand to valid array-initializer content.
52 static uint64_t addrs[] = {
53 #include "findmask.inc"
// Progress checkpoint: thread_func() stores a mask here from time to time.
// NOTE(review): presumably this is the value reported by signal_handler();
// the argument of its printf is not visible in this excerpt - confirm.
55 static uint64_t lastmask;
// Worker thread body: tests every candidate mask assigned to this thread
// against all entries of addrs, tallying for each mask how many masked
// addresses have odd parity ('a') versus even parity ('b'), and prints a
// "Found mask" line for reported masks.
//
// arg: the thread's index, passed as an integer cast to void*; it is cast
//      back to uintptr_t below and never dereferenced.
//
// NOTE(review): several lines of this function are not visible in this
// excerpt (e.g. the declarations of 'a' and 'b' and the opening of the asm
// statement); the comments below describe only what is shown.
//
// The attribute is GCC-specific: it requests optimization level 3 and loop
// unrolling for this one function.
57 __attribute__((optimize(3, "unroll-loops")))
58 void* thread_func(void* arg) {
59 register uint64_t mask;
60 register uintptr_t num = (uintptr_t)arg;
// Thread 'num' visits masks num, num + NUM_THREADS, num + 2 * NUM_THREADS, ...
// main() starts one thread per index below NUM_THREADS, so together the
// workers cover every value below the bound.
// NOTE(review): the bound spans NUM_BITS + 1 bits, while the file header
// speaks of "the first NUM_BITS" - confirm which is intended.
62 for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) {
63 register const uint64_t* cur;
// Walk addrs from its first element; the end of the array is computed in
// bytes from sizeof(addrs).
67 for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) {
// Hand-written asm variant (the preprocessor condition selecting it is not
// visible here). addr and tmp are pinned to rdx/ebx because the instruction
// text below names those registers directly.
69 register uint64_t addr asm("rdx") = *cur & mask;
70 register uint32_t tmp asm("ebx");
72 // Behold: the dark bit counting magic!
74 // Fold high and low 32 bits onto each other
75 "MOVl %%edx, %%ebx\n\t"
77 "XORl %%ebx, %%edx\n\t"
78 // Fold high and low 16 bits onto each other
79 "MOVl %%edx, %%ebx\n\t"
82 // Fold high and low 8 bits onto each other
84 // Invoke ancient 8086 parity flag (only counts lowest byte)
87 // Stupid SET instruction can only affect the lowest byte...
90 // Increment either 'a' or 'b' without needing another branch
// Operands: tmp is an output bound to ebx ("=b"); a and b are read-write
// register operands ("+r"); addr is an input bound to rdx ("d"). The "cc"
// clobber tells the compiler that the condition flags are modified.
93 : "=b" (tmp), "+r"(a), "+r"(b) : "d"(addr) : "cc");
95 #else // generic processor
96 register uint64_t addr = *cur & mask;
97 register uint32_t low = (uint32_t)addr;
98 register uint32_t high = (uint32_t)(addr >> 32);
100 // Takes about twice as long as the version above... take that GCC!
// Parity of the full 64-bit value is parity(low) XOR parity(high):
// an odd number of set bits increments a, an even number increments b.
101 __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++;
104 // Early abort: probably still the most valuable optimization in here
// In the visible code the counters only ever grow, so once both have reached
// NOISE neither side can end up below NOISE and the remaining addresses need
// not be examined.
105 if (a >= NOISE && b >= NOISE) break;
// If the odd-parity side is the small one, report its count; otherwise b
// (the even-parity count) is reported as-is. b is the value printed as
// "deviations" below.
// NOTE(review): the condition guarding the printf is not visible in this
// excerpt - presumably b < NOISE; confirm.
108 if (a < NOISE) b = a;
110 printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask);
114 // I'm a little paranoid about performance: don't write to memory too often
// Only masks whose low 11 bits are all zero (multiples of 2048) are
// checkpointed. With NUM_THREADS = 128 as the stride, only the thread started
// with index 0 ever reaches such a mask.
// NOTE(review): lastmask is a plain file-scope uint64_t written here without
// any synchronization - confirm that is acceptable for its readers.
115 if (!(mask & 0x7ff)) lastmask = mask;
// Signal handler that main() installs for SIGUSR1: prints a progress message
// containing a 64-bit mask value (formatted with PRIx64) to stdout.
//
// signum: the signal number supplied by the system when the handler runs.
//
// NOTE(review): the argument list of the printf continues on lines that are
// not visible in this excerpt - presumably the value printed is lastmask.
// NOTE(review): printf is not among the functions POSIX lists as
// async-signal-safe, so calling it from a handler can misbehave if the signal
// interrupts another stdio call (worker threads also call printf).
121 void signal_handler(int signum) {
122 printf("Received signal... currently evaluating mask 0x%" PRIx64 "!\n",
// Program entry point: installs the SIGUSR1 progress handler, starts
// NUM_THREADS worker threads running thread_func(), and waits for all of them
// to finish. argc and argv are not used in the visible lines.
//
// NOTE(review): the declaration of the loop counter 'i' and the end of the
// function are not visible in this excerpt.
127 int main(int argc, char** argv) {
// One handle per worker, filled in by pthread_create() and consumed by
// pthread_join() below.
129 pthread_t threads[NUM_THREADS];
// Install the handler before any worker is started.
131 signal(SIGUSR1, signal_handler);
// Each worker receives its own index through the void* argument;
// thread_func() casts it back to uintptr_t and uses it as its first mask.
// NOTE(review): the result of pthread_create() is not checked; if a thread
// fails to start, threads[i] is left unset and is still passed to
// pthread_join() below.
133 for (i = 0; i < NUM_THREADS; i++)
134 pthread_create(&threads[i], 0, thread_func, (void*)i);
// Block until every worker has returned; their return values are discarded
// (second argument is 0).
136 for (i = 0; i < NUM_THREADS; i++)
137 pthread_join(threads[i], 0);