1 // Copyright 2006 Google Inc. All Rights Reserved.
2 // Author: nsanders, menderico
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
16 // os.cc : os and machine specific implementation
17 // This file includes an abstracted interface
18 // for linux-distro specific and HW specific
25 #include <linux/types.h>
31 #include <sys/ioctl.h>
33 #include <sys/types.h>
39 #define SHM_HUGETLB 04000 // remove when glibc defines it
45 // This file must work with autoconf on its public version,
46 // so these includes are correct.
48 #include "error_diag.h"
50 // OsLayer initialization.
55 min_hugepages_bytes_ = 0;
57 use_hugepages_ = false;
58 use_posix_shm_ = false;
59 dynamic_mapped_shmem_ = false;
62 time_initialized_ = 0;
68 num_cpus_per_node_ = 0;
70 err_log_callback_ = 0;
71 error_injection_ = false;
74 address_mode_ = sizeof(pvoid) * 8;
83 delete error_diagnoser_;
86 // OsLayer initialization.
87 bool OsLayer::Initialize() {
88 time_initialized_ = time(NULL);
89 // Detect asm support.
94 num_cpus_ = sysconf(_SC_NPROCESSORS_ONLN);
95 num_cpus_per_node_ = num_cpus_ / num_nodes_;
97 logprintf(5, "Log: %d nodes, %d cpus.\n", num_nodes_, num_cpus_);
98 sat_assert(CPU_SETSIZE >= num_cpus_);
99 cpu_sets_.resize(num_nodes_);
100 cpu_sets_valid_.resize(num_nodes_);
101 // Create error diagnoser.
102 error_diagnoser_ = new ErrorDiag();
103 if (!error_diagnoser_->set_os(this))
108 // Machine type detected. Can we implement all these functions correctly?
109 bool OsLayer::IsSupported() {
111 // There are no explicitly supported systems in open source version.
115 // This is the default empty implementation.
116 // SAT won't report full error information.
120 int OsLayer::AddressMode() {
121 // Detect 32/64 bit binary.
123 return sizeof(pvoid) * 8;
126 // Translates user virtual to physical address.
127 uint64 OsLayer::VirtualToPhysical(void *vaddr) {
128 // Needs platform specific implementation.
132 // Returns the HD device that contains this file.
133 string OsLayer::FindFileDevice(string filename) {
137 // Returns a list of locations corresponding to HD devices.
138 list<string> OsLayer::FindFileDevices() {
139 // No autodetection on unknown systems.
140 list<string> locations;
145 // Get HW core features from cpuid instruction.
146 void OsLayer::GetFeatures() {
147 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
148 // CPUID features documented at:
149 // http://www.sandpile.org/ia32/cpuid.htm
151 __asm__ __volatile__ (
152 "cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (1));
153 has_clflush_ = (dx >> 19) & 1;
154 has_sse2_ = (dx >> 26) & 1;
156 logprintf(9, "Log: has clflush: %s, has sse2: %s\n",
157 has_clflush_ ? "true" : "false",
158 has_sse2_ ? "true" : "false");
159 #elif defined(STRESSAPPTEST_CPU_PPC)
160 // All PPC implementations have cache flush instructions.
162 #elif defined(STRESSAPPTEST_CPU_ARMV7A)
163 #warning "Unsupported CPU type ARMV7A: unable to determine feature set."
165 #warning "Unsupported CPU type: unable to determine feature set."
170 // We need to flush the cacheline here.
171 void OsLayer::Flush(void *vaddr) {
172 // Use the generic flush. This function is just so we can override
173 // this if we are so inclined.
179 // Run C or ASM copy as appropriate.
180 bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
181 unsigned int size_in_bytes,
182 AdlerChecksum *checksum) {
184 return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum);
186 return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);
191 // Describe the DIMM for an address; this version always reports "DIMM Unknown".
192 int OsLayer::FindDimm(uint64 addr, char *buf, int len) {
194 snprintf(tmpbuf, sizeof(tmpbuf), "DIMM Unknown");
195 snprintf(buf, len, "%s", tmpbuf);
200 // Classifies addresses according to "regions"
201 // This isn't really implemented meaningfully here.
202 int32 OsLayer::FindRegion(uint64 addr) {
203 static bool warned = false;
205 if (regionsize_ == 0) {
206 regionsize_ = totalmemsize_ / 8;
207 if (regionsize_ < 512 * kMegabyte)
208 regionsize_ = 512 * kMegabyte;
209 regioncount_ = totalmemsize_ / regionsize_;
210 if (regioncount_ < 1) regioncount_ = 1;
213 int32 region_num = addr / regionsize_;
214 if (region_num >= regioncount_) {
216 logprintf(0, "Log: region number %d exceeds region count %d\n",
217 region_num, regioncount_);
220 region_num = region_num % regioncount_;
225 // Report which cores are associated with a given region.
226 cpu_set_t *OsLayer::FindCoreMask(int32 region) {
227 sat_assert(region >= 0);
228 region %= num_nodes_;
229 if (!cpu_sets_valid_[region]) {
230 CPU_ZERO(&cpu_sets_[region]);
231 for (int i = 0; i < num_cpus_per_node_; ++i) {
232 CPU_SET(i + region * num_cpus_per_node_, &cpu_sets_[region]);
234 cpu_sets_valid_[region] = true;
235 logprintf(5, "Log: Region %d mask 0x%s\n",
236 region, FindCoreMaskFormat(region).c_str());
238 return &cpu_sets_[region];
241 // Return cores associated with a given region as a hex string.
242 string OsLayer::FindCoreMaskFormat(int32 region) {
243 cpu_set_t* mask = FindCoreMask(region);
244 string format = cpuset_format(mask);
245 if (format.size() < 8)
246 format = string(8 - format.size(), '0') + format;
250 // Report an error in an easily parseable way.
251 bool OsLayer::ErrorReport(const char *part, const char *symptom, int count) {
252 time_t now = time(NULL);
253 int ttf = now - time_initialized_;
254 logprintf(0, "Report Error: %s : %s : %d : %ds\n", symptom, part, count, ttf);
258 // Read the number of hugepages out of the kernel interface in proc.
259 int64 OsLayer::FindHugePages() {
262 // This is a kernel interface to query the number of hugepages
263 // available in the system.
264 static const char *hugepages_info_file = "/proc/sys/vm/nr_hugepages";
265 int hpfile = open(hugepages_info_file, O_RDONLY);
267 ssize_t bytes_read = read(hpfile, buf, 64);
270 if (bytes_read <= 0) {
271 logprintf(12, "Log: /proc/sys/vm/nr_hugepages "
272 "read did not provide data\n");
276 if (bytes_read == 64) {
277 logprintf(0, "Process Error: /proc/sys/vm/nr_hugepages "
278 "is surprisingly large\n");
282 // Add a null terminator to be string safe.
283 buf[bytes_read] = '\0';
284 // Read the page count.
285 int64 pages = strtoull(buf, NULL, 10); // NOLINT
290 int64 OsLayer::FindFreeMemSize() {
293 if (totalmemsize_ > 0)
294 return totalmemsize_;
296 int64 pages = sysconf(_SC_PHYS_PAGES);
297 int64 avpages = sysconf(_SC_AVPHYS_PAGES);
298 int64 pagesize = sysconf(_SC_PAGESIZE);
299 int64 physsize = pages * pagesize;
300 int64 avphyssize = avpages * pagesize;
302 // Assume 2MB hugepages.
303 int64 hugepagesize = FindHugePages() * 2 * kMegabyte;
305 if ((pages == -1) || (pagesize == -1)) {
306 logprintf(0, "Process Error: sysconf could not determine memory size.\n");
310 // We want to leave enough stuff for things to run.
311 // If the user specified a minimum amount of memory to expect, require that.
312 // Otherwise, if more than 2GB is present, leave 192M + 5% for other stuff.
313 // If less than 2GB is present use 85% of what's available.
314 // These are fairly arbitrary numbers that seem to work OK.
316 // TODO(nsanders): is there a more correct way to determine target
318 if (hugepagesize > 0 && min_hugepages_bytes_ > 0) {
319 minsize = min_hugepages_bytes_;
320 } else if (physsize < 2048LL * kMegabyte) {
321 minsize = ((pages * 85) / 100) * pagesize;
323 minsize = ((pages * 95) / 100) * pagesize - (192 * kMegabyte);
326 // Use hugepage sizing if available.
327 if (hugepagesize > 0) {
328 if (hugepagesize < minsize) {
329 logprintf(0, "Procedural Error: Not enough hugepages. "
330 "%lldMB available < %lldMB required.\n",
331 hugepagesize / kMegabyte,
332 minsize / kMegabyte);
333 // Require the calculated minimum amount of memory.
336 // Require that we get all hugepages.
340 // Require the calculated minimum amount of memory.
344 logprintf(5, "Log: Total %lld MB. Free %lld MB. Hugepages %lld MB. "
345 "Targeting %lld MB (%lld%%)\n",
346 physsize / kMegabyte,
347 avphyssize / kMegabyte,
348 hugepagesize / kMegabyte,
350 size * 100 / physsize);
352 totalmemsize_ = size;
356 // Allocates all memory available.
357 int64 OsLayer::AllocateAllMem() {
358 int64 length = FindFreeMemSize();
359 bool retval = AllocateTestMem(length, 0);
366 // Allocate the target memory. This may be from malloc, hugepage pool
367 // or other platform specific sources.
368 bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) {
369 // Try hugepages first.
372 sat_assert(length >= 0);
375 logprintf(0, "Process Error: non zero paddr_base %#llx is not supported,"
376 " ignore.\n", paddr_base);
378 // Determine optimal memory allocation path.
379 bool prefer_hugepages = false;
380 bool prefer_posix_shm = false;
381 bool prefer_dynamic_mapping = false;
383 // Are there enough hugepages?
384 int64 hugepagesize = FindHugePages() * 2 * kMegabyte;
385 // TODO(nsanders): Is there enough /dev/shm? Is there enough free memory?
386 if ((length >= 1400LL * kMegabyte) && (address_mode_ == 32)) {
387 prefer_dynamic_mapping = true;
388 prefer_posix_shm = true;
389 logprintf(3, "Log: Prefer POSIX shared memory allocation.\n");
390 logprintf(3, "Log: You may need to run "
391 "'sudo mount -o remount,size=100\% /dev/shm.'\n");
392 } else if (hugepagesize >= length) {
393 prefer_hugepages = true;
394 logprintf(3, "Log: Prefer using hugepace allocation.\n");
396 logprintf(3, "Log: Prefer plain malloc memory allocation.\n");
399 // Allocate hugepage mapped memory.
400 if (prefer_hugepages) {
401 do { // Allow break statement.
405 if ((shmid = shmget(2, length,
406 SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
408 string errtxt = ErrorString(err);
409 logprintf(3, "Log: failed to allocate shared hugepage "
410 "object - err %d (%s)\n",
411 err, errtxt.c_str());
412 logprintf(3, "Log: sysctl -w vm.nr_hugepages=XXX allows hugepages.\n");
416 shmaddr = shmat(shmid, NULL, NULL);
417 if (shmaddr == reinterpret_cast<void*>(-1)) {
419 string errtxt = ErrorString(err);
420 logprintf(0, "Log: failed to attach shared "
421 "hugepage object - err %d (%s).\n",
422 err, errtxt.c_str());
423 if (shmctl(shmid, IPC_RMID, NULL) < 0) {
425 string errtxt = ErrorString(err);
426 logprintf(0, "Log: failed to remove shared "
427 "hugepage object - err %d (%s).\n",
428 err, errtxt.c_str());
432 use_hugepages_ = true;
435 logprintf(0, "Log: Using shared hugepage object 0x%x at %p.\n",
440 if ((!use_hugepages_) && prefer_posix_shm) {
443 void *shmaddr = NULL;
445 shm_object = shm_open("/stressapptest", O_CREAT | O_RDWR, S_IRWXU);
446 if (shm_object < 0) {
448 string errtxt = ErrorString(err);
449 logprintf(3, "Log: failed to allocate shared "
450 "smallpage object - err %d (%s)\n",
451 err, errtxt.c_str());
455 if (0 > ftruncate(shm_object, length)) {
457 string errtxt = ErrorString(err);
458 logprintf(3, "Log: failed to ftruncate shared "
459 "smallpage object - err %d (%s)\n",
460 err, errtxt.c_str());
464 // 32 bit linux apps can only use ~1.4G of address space.
465 // Use dynamic mapping for allocations larger than that.
466 // Currently perf hit is ~10% for this.
467 if (prefer_dynamic_mapping) {
468 dynamic_mapped_shmem_ = true;
470 // Do a full mapping here otherwise.
471 shmaddr = mmap64(NULL, length, PROT_READ | PROT_WRITE,
472 MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
474 if (shmaddr == reinterpret_cast<void*>(-1)) {
476 string errtxt = ErrorString(err);
477 logprintf(0, "Log: failed to map shared "
478 "smallpage object - err %d (%s).\n",
479 err, errtxt.c_str());
484 use_posix_shm_ = true;
487 char location_message[256] = "";
488 if (dynamic_mapped_shmem_) {
489 sprintf(location_message, "mapped as needed");
491 sprintf(location_message, "at %p", shmaddr);
493 logprintf(0, "Log: Using posix shared memory object 0x%x %s.\n",
494 shm_object, location_message);
496 shm_unlink("/stressapptest");
499 if (!use_hugepages_ && !use_posix_shm_) {
500 // Use memalign to ensure that blocks are aligned enough for disk direct IO.
501 buf = static_cast<char*>(memalign(4096, length));
503 logprintf(0, "Log: Using memaligned allocation at %p.\n", buf);
505 logprintf(0, "Process Error: memalign returned 0\n");
506 if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) {
507 logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 "
508 "bit process. Please setup shared memory.\n");
514 if (buf || dynamic_mapped_shmem_) {
515 testmemsize_ = length;
520 return (buf != 0) || dynamic_mapped_shmem_;
523 // Free the test memory.
524 void OsLayer::FreeTestMem() {
526 if (use_hugepages_) {
528 shmctl(shmid_, IPC_RMID, NULL);
529 } else if (use_posix_shm_) {
530 if (!dynamic_mapped_shmem_) {
531 munmap(testmem_, testmemsize_);
543 // Prepare the target memory. It may require mapping in, or this may be a no-op.
544 void *OsLayer::PrepareTestMem(uint64 offset, uint64 length) {
545 sat_assert((offset + length) <= testmemsize_);
546 if (dynamic_mapped_shmem_) {
547 // TODO(nsanders): Check if we can support MAP_NONBLOCK,
548 // and evaluate performance hit from not using it.
549 void * mapping = mmap64(NULL, length, PROT_READ | PROT_WRITE,
550 MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
552 if (mapping == MAP_FAILED) {
553 string errtxt = ErrorString(errno);
554 logprintf(0, "Process Error: PrepareTestMem mmap64(%llx, %llx) failed. "
556 offset, length, errtxt.c_str());
562 return reinterpret_cast<void*>(reinterpret_cast<char*>(testmem_) + offset);
565 // Release the test memory resources, if any.
566 void OsLayer::ReleaseTestMem(void *addr, uint64 offset, uint64 length) {
567 if (dynamic_mapped_shmem_) {
568 int retval = munmap(addr, length);
570 string errtxt = ErrorString(errno);
571 logprintf(0, "Process Error: ReleaseTestMem munmap(%p, %llx) failed. "
573 addr, length, errtxt.c_str());
579 // No error polling on unknown systems.
580 int OsLayer::ErrorPoll() {
584 // Generally, poll for errors once per second.
585 void OsLayer::ErrorWait() {
590 // Open a PCI bus-dev-func as a file and return its file descriptor.
591 // Error is indicated by return value less than zero.
592 int OsLayer::PciOpen(int bus, int device, int function) {
595 snprintf(dev_file, sizeof(dev_file), "/proc/bus/pci/%02x/%02x.%x",
596 bus, device, function);
598 int fd = open(dev_file, O_RDWR);
600 logprintf(0, "Process Error: Unable to open PCI bus %d, device %d, "
601 "function %d (errno %d).\n",
602 bus, device, function, errno);
610 // Read and write functions to access PCI config.
611 uint32 OsLayer::PciRead(int fd, uint32 offset, int width) {
612 // Strict aliasing rules lawyers will cause data corruption
613 // on cast pointers in some gccs.
620 uint32 size = width / 8;
622 sat_assert((width == 32) || (width == 16) || (width == 8));
623 sat_assert(offset <= (256 - size));
625 if (lseek(fd, offset, SEEK_SET) < 0) {
626 logprintf(0, "Process Error: Can't seek %x\n", offset);
629 if (read(fd, &datacast, size) != static_cast<ssize_t>(size)) {
630 logprintf(0, "Process Error: Can't read %x\n", offset);
637 sat_assert(&(datacast.l8) == reinterpret_cast<uint8*>(&datacast));
640 sat_assert(&(datacast.l16) == reinterpret_cast<uint16*>(&datacast));
648 void OsLayer::PciWrite(int fd, uint32 offset, uint32 value, int width) {
649 // Strict aliasing rules lawyers will cause data corruption
650 // on cast pointers in some gccs.
657 uint32 size = width / 8;
659 sat_assert((width == 32) || (width == 16) || (width == 8));
660 sat_assert(offset <= (256 - size));
662 // Cram the data into the right alignment.
665 sat_assert(&(datacast.l8) == reinterpret_cast<uint8*>(&datacast));
668 sat_assert(&(datacast.l16) == reinterpret_cast<uint16*>(&datacast));
669 datacast.l16 = value;
671 datacast.l32 = value;
674 if (lseek(fd, offset, SEEK_SET) < 0) {
675 logprintf(0, "Process Error: Can't seek %x\n", offset);
678 if (write(fd, &datacast, size) != static_cast<ssize_t>(size)) {
679 logprintf(0, "Process Error: Can't write %x to %x\n", datacast.l32, offset);
689 int OsLayer::OpenMSR(uint32 core, uint32 address) {
691 snprintf(buf, sizeof(buf), "/dev/cpu/%d/msr", core);
692 int fd = open(buf, O_RDWR);
696 uint32 pos = lseek(fd, address, SEEK_SET);
697 if (pos != address) {
699 logprintf(5, "Log: can't seek to msr %x, cpu %d\n", address, core);
706 bool OsLayer::ReadMSR(uint32 core, uint32 address, uint64 *data) {
707 int fd = OpenMSR(core, address);
711 // Read from the msr.
712 bool res = (sizeof(*data) == read(fd, data, sizeof(*data)));
715 logprintf(5, "Log: Failed to read msr %x core %d\n", address, core);
722 bool OsLayer::WriteMSR(uint32 core, uint32 address, uint64 *data) {
723 int fd = OpenMSR(core, address);
728 bool res = (sizeof(*data) == write(fd, data, sizeof(*data)));
731 logprintf(5, "Log: Failed to write msr %x core %d\n", address, core);
738 // Extract bits [n+len-1, n] from a 32 bit word,
739 // so GetBitField(0x0f00, 8, 4) == 0xf.
740 uint32 OsLayer::GetBitField(uint32 val, uint32 n, uint32 len) {
741 return (val >> n) & ((1<<len) - 1);
744 // Generic CPU stress workload that would work on any CPU/Platform.
745 // Floating-point array moving average calculation.
746 bool OsLayer::CpuStressWorkload() {
747 double float_arr[100];
749 unsigned int seed = 12345;
751 // Initialize array with random numbers.
752 for (int i = 0; i < 100; i++) {
753 float_arr[i] = rand_r(&seed);
754 if (rand_r(&seed) % 2)
755 float_arr[i] *= -1.0;
758 // Calculate moving average.
759 for (int i = 0; i < 100000000; i++) {
761 (float_arr[i % 100] + float_arr[(i + 1) % 100] +
762 float_arr[(i + 99) % 100]) / 3;
763 sum += float_arr[i % 100];
766 // Artificial printf so the loops do not get optimized away.
768 logprintf(12, "Log: I'm Feeling Lucky!\n");
772 PCIDevices OsLayer::GetPCIDevices() {
773 PCIDevices device_list;
775 struct dirent *buf = new struct dirent();
776 struct dirent *entry;
777 dir = opendir(kSysfsPath);
779 logprintf(0, "Process Error: Cannot open %s", kSysfsPath);
780 while (readdir_r(dir, buf, &entry) == 0 && entry) {
782 unsigned int dev, func;
783 // ".", ".." or a special non-device perhaps.
784 if (entry->d_name[0] == '.')
787 device = new PCIDevice();
788 if (sscanf(entry->d_name, "%04x:%02hx:%02x.%d",
789 &device->domain, &device->bus, &dev, &func) < 4) {
790 logprintf(0, "Process Error: Couldn't parse %s", entry->d_name);
796 device->vendor_id = PCIGetValue(entry->d_name, "vendor");
797 device->device_id = PCIGetValue(entry->d_name, "device");
798 PCIGetResources(entry->d_name, device);
799 device_list.insert(device_list.end(), device);
806 int OsLayer::PCIGetValue(string name, string object) {
810 snprintf(filename, sizeof(filename), "%s/%s/%s", kSysfsPath,
811 name.c_str(), object.c_str());
812 fd = open(filename, O_RDONLY);
815 len = read(fd, buf, 256);
818 return strtol(buf, NULL, 0); // NOLINT
821 int OsLayer::PCIGetResources(string name, PCIDevice *device) {
829 snprintf(filename, sizeof(filename), "%s/%s/%s", kSysfsPath,
830 name.c_str(), "resource");
831 file = fopen(filename, "r");
833 logprintf(0, "Process Error: impossible to find resource file for %s",
837 for (i = 0; i < 6; i++) {
838 if (!fgets(buf, 256, file))
840 sscanf(buf, "%llx %llx", &start, &end); // NOLINT
843 size = end - start + 1;
844 device->base_addr[i] = start;
845 device->size[i] = size;