1 // Copyright 2008 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 // error_diag.cc: Collects device errors for analysis, to pinpoint the
16 // failed component more accurately.
22 // This file must work with autoconf on its public version,
23 // so these includes are correct.
24 #include "error_diag.h"
28 // DeviceTree constructor.
// Creates a node called `name`. parent_ starts as 0 (no parent recorded) and
// device_tree_mutex_ is initialized with default attributes (NULL attr).
29 DeviceTree::DeviceTree(string name)
30 : parent_(0), name_(name) {
31 pthread_mutex_init(&device_tree_mutex_, NULL);
34 // DeviceTree destructor.
// Iterates over subdevices_ and then errors_ to release them, and finally
// destroys device_tree_mutex_.
// NOTE(review): presumably each map value and each list entry is deleted
// inside the loops; confirm against the loop bodies.
35 DeviceTree::~DeviceTree() {
36 // Deallocate subtree devices.
37 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
38 itr != subdevices_.end();
42 // Deallocate device errors.
43 for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
48 pthread_mutex_destroy(&device_tree_mutex_);
51 // Atomically find named device in sub device tree.
52 // Returns 0 if not found
// Holds device_tree_mutex_ for the duration of UnlockedFindInSubTree(name).
// Only this node's mutex is taken; the unlocked helper recurses into sub
// devices without locking them.
53 DeviceTree *DeviceTree::FindInSubTree(string name) {
55 pthread_mutex_lock(&device_tree_mutex_);
56 ret = UnlockedFindInSubTree(name);
57 pthread_mutex_unlock(&device_tree_mutex_);
61 // Find named device in sub device tree (Non-atomic).
62 // Returns 0 if not found
// Looks `name` up among this node's direct sub devices first, then searches
// each sub device's own subtree recursively. No mutex is taken here; the
// callers in this file (FindInSubTree, InsertSubDevice) lock
// device_tree_mutex_ before calling.
63 DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
64 std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
65 if (itr != subdevices_.end()) {
// Recursive search of every sub device's subtree. This loop declares its own
// `itr`, distinct from the lookup iterator above.
69 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
70 itr != subdevices_.end();
72 DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
80 // Atomically add error instance to device.
// Appends the pointer to errors_ while holding device_tree_mutex_.
// NOTE(review): the destructor has a "Deallocate device errors" pass over
// errors_, so this node presumably takes ownership of error_instance;
// confirm before freeing it elsewhere.
81 void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
82 pthread_mutex_lock(&device_tree_mutex_);
83 errors_.push_back(error_instance);
84 pthread_mutex_unlock(&device_tree_mutex_);
87 // Find or add queried device as necessary.
// Returns whatever FindInSubTree(name) yields after InsertSubDevice(name) has
// run. Because InsertSubDevice treats a match anywhere in the subtree as
// "already exists", the returned node may be a nested sub device rather than
// a direct child. The insert and the lookup each take the mutex separately,
// so the pair is not a single atomic step.
88 DeviceTree *DeviceTree::FindOrAddDevice(string name) {
89 // Assume named device does not exist and try to insert the device anyway.
90 // No-op if named device already exists.
91 InsertSubDevice(name);
92 // Find and return sub device pointer.
93 return FindInSubTree(name);
96 // Pretty prints device tree.
// Writes each sub device name on its own line to stdout, prefixed with
// `spacer`, then recurses into that sub device with spacer+spacer, so the
// prefix doubles in length at every level. Only sub device names are
// printed, and device_tree_mutex_ is not taken in the visible code.
97 void DeviceTree::PrettyPrint(string spacer) {
98 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
99 itr != subdevices_.end();
101 printf("%s%s\n", spacer.c_str(), itr->first.c_str());
102 itr->second->PrettyPrint(spacer+spacer);
106 // Atomically add sub device.
107 // No-op if named device already exists.
// "Already exists" is checked with UnlockedFindInSubTree, which also searches
// nested sub devices, so a name present anywhere below this node counts.
// The whole check-and-insert runs under device_tree_mutex_.
108 void DeviceTree::InsertSubDevice(string name) {
109 pthread_mutex_lock(&device_tree_mutex_);
110 if (UnlockedFindInSubTree(name) != 0) {
111 pthread_mutex_unlock(&device_tree_mutex_);
// Create the node as a direct child and record this node as its parent.
114 subdevices_[name] = new DeviceTree(name);
115 subdevices_[name]->parent_ = this;
116 pthread_mutex_unlock(&device_tree_mutex_);
120 // Returns true if any error associated with this device is fatal.
// Scans errors_ under device_tree_mutex_ for an entry whose severity_ equals
// SAT_ERROR_FATAL. Only this device's own errors_ list is examined, not the
// lists of its sub devices.
121 bool DeviceTree::KnownBad() {
122 pthread_mutex_lock(&device_tree_mutex_);
123 for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
124 itr != errors_.end();
126 if ((*itr)->severity_ == SAT_ERROR_FATAL) {
// Release the lock on the match path as well as on the fall-through path.
127 pthread_mutex_unlock(&device_tree_mutex_);
131 pthread_mutex_unlock(&device_tree_mutex_);
136 // ErrorDiag constructor.
// Starts without a device tree: system_tree_root_ stays 0 until
// InitializeDeviceTree() allocates the root.
// NOTE(review): presumably os_ is also reset here; confirm.
137 ErrorDiag::ErrorDiag() {
139 system_tree_root_ = 0;
142 // ErrorDiag destructor.
// Deletes the device tree root if one was created, which runs the DeviceTree
// destructor on it.
143 ErrorDiag::~ErrorDiag() {
144 if (system_tree_root_)
145 delete system_tree_root_;
148 // Set platform specific handle and initialize device tree.
149 // Returns false on error. true otherwise.
// The return value is exactly what InitializeDeviceTree() returns.
// NOTE(review): presumably `os` is stored in os_, which the Add*Error methods
// dereference; confirm the assignment precedes the call below.
150 bool ErrorDiag::set_os(OsLayer *os) {
152 return(InitializeDeviceTree());
155 // Create and initialize system device tree.
156 // Returns false on error. true otherwise.
// Allocates a root DeviceTree named "system_root" and stores it in
// system_tree_root_.
// NOTE(review): the previous value of system_tree_root_ is overwritten
// without being deleted, so calling this (or set_os) more than once would
// leak the earlier tree; confirm it is only invoked once per ErrorDiag.
157 bool ErrorDiag::InitializeDeviceTree() {
158 system_tree_root_ = new DeviceTree("system_root");
// NOTE(review): a standard throwing `new` never yields 0, so this check is
// likely unreachable unless operator new is replaced; confirm.
159 if (!system_tree_root_)
164 // Logs info about a CECC.
165 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
// Finds or creates the device node for `dimm_string` under system_tree_root_
// and attaches a new ECCErrorInstance with severity SAT_ERROR_CORRECTABLE
// (CECC presumably stands for correctable ECC error; confirm).
// system_tree_root_ is dereferenced without a null check, so the tree must
// have been initialized (see set_os) before this is called.
166 int ErrorDiag::AddCeccError(string dimm_string) {
167 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
168 ECCErrorInstance *error = new ECCErrorInstance;
171 error->severity_ = SAT_ERROR_CORRECTABLE;
172 dimm_device->AddErrorInstance(error);
176 // Logs info about a UECC.
177 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
// Finds or creates the device node for `dimm_string` under system_tree_root_
// and attaches a new ECCErrorInstance with severity SAT_ERROR_FATAL, which
// makes KnownBad() true for that device afterwards (UECC presumably stands
// for uncorrectable ECC error; confirm).
// system_tree_root_ is dereferenced without a null check, so the tree must
// have been initialized (see set_os) before this is called.
178 int ErrorDiag::AddUeccError(string dimm_string) {
179 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
180 ECCErrorInstance *error = new ECCErrorInstance;
183 error->severity_ = SAT_ERROR_FATAL;
184 dimm_device->AddErrorInstance(error);
188 // Logs info about a miscompare.
189 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
// Attaches a new MiscompareErrorInstance with severity SAT_ERROR_FATAL to the
// device node for `dimm_string`, then reports it externally through
// os_->ErrorReport with the label "miscompare" and `count`.
// Both system_tree_root_ and os_ are dereferenced without null checks.
// NOTE(review): presumably `addr` is recorded on the error instance; confirm.
190 int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
191 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
192 MiscompareErrorInstance *error = new MiscompareErrorInstance;
195 error->severity_ = SAT_ERROR_FATAL;
197 dimm_device->AddErrorInstance(error);
198 os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
202 // Utility Function to translate a virtual address to DIMM number.
203 // Returns the DIMM name that os->FindDimm() wrote, as a string.
// `offset` is a byte offset added to `addr` before the virtual-to-physical
// translation. The name buffer starts out empty, so the result is "" if
// FindDimm() writes nothing.
// NOTE(review): callers compare the result against "DIMM Unknown",
// presumably the name FindDimm() produces when the lookup fails; confirm.
204 string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
205 char dimm_string[256] = "";
206 char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
207 uint64 paddr = os->VirtualToPhysical(vbyteaddr);
208 os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
209 return string(dimm_string);
212 // Info about a miscompare from a drive.
213 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
// Records an HDDMiscompareErrorInstance on the device node for `devicename`.
// The DIMMs backing src_addr+offset and dst_addr+offset are looked up; each
// one that was identified is added to the error's causes_. If either DIMM is
// already KnownBad(), the drive is not reported (the miscompare is attributed
// to the bad DIMM instead). Otherwise the drive is reported through
// os_->ErrorReport and the error is marked SAT_ERROR_FATAL.
214 int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
215 void *src_addr, void *dst_addr) {
216 bool mask_hdd_error = false;
218 HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
// Keep the raw source/destination buffer addresses alongside block/offset.
222 error->addr_ = reinterpret_cast<uint64>(src_addr);
223 error->addr2_ = reinterpret_cast<uint64>(dst_addr);
224 error->offset_ = offset;
225 error->block_ = block;
227 string src_dimm = AddressToDimmString(os_, src_addr, offset);
228 string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
230 // DIMM name look up success (compare() is non-zero when the names differ)
231 if (src_dimm.compare("DIMM Unknown")) {
232 // Add src DIMM as possible miscompare cause.
233 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
234 error->causes_.insert(src_dimm_dev);
235 if (src_dimm_dev->KnownBad()) {
236 mask_hdd_error = true;
237 logprintf(5, "Log: supressed %s miscompare report: "
238 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
241 if (dst_dimm.compare("DIMM Unknown")) {
242 // Add dst DIMM as possible miscompare cause.
243 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
244 error->causes_.insert(dst_dimm_dev);
245 if (dst_dimm_dev->KnownBad()) {
246 mask_hdd_error = true;
247 logprintf(5, "Log: supressed %s miscompare report: "
248 "known bad destination: %s\n", devicename.c_str(),
253 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
254 hdd_dev->AddErrorInstance(error);
256 // HDD error was not masked by bad DIMMs: report bad HDD.
257 if (!mask_hdd_error) {
258 os_->ErrorReport(devicename.c_str(), "miscompare", 1);
// NOTE(review): severity_ is written after the instance was already handed to
// AddErrorInstance(), i.e. without hdd_dev's mutex held; confirm no other
// thread can read it through KnownBad() in between.
259 error->severity_ = SAT_ERROR_FATAL;
265 // Info about a sector tag miscompare from a drive.
266 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
267 int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
268 int sector, void *src_addr,
270 bool mask_hdd_error = false;
272 HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
276 error->addr_ = reinterpret_cast<uint64>(src_addr);
277 error->addr2_ = reinterpret_cast<uint64>(dst_addr);
278 error->sector_ = sector;
279 error->block_ = block;
281 string src_dimm = AddressToDimmString(os_, src_addr, offset);
282 string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
284 // DIMM name look up success
285 if (src_dimm.compare("DIMM Unknown")) {
286 // Add src DIMM as possible miscompare cause.
287 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
288 error->causes_.insert(src_dimm_dev);
289 if (src_dimm_dev->KnownBad()) {
290 mask_hdd_error = true;
291 logprintf(5, "Log: supressed %s sector tag error report: "
292 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
295 if (dst_dimm.compare("DIMM Unknown")) {
296 // Add dst DIMM as possible miscompare cause.
297 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
298 error->causes_.insert(dst_dimm_dev);
299 if (dst_dimm_dev->KnownBad()) {
300 mask_hdd_error = true;
301 logprintf(5, "Log: supressed %s sector tag error report: "
302 "known bad destination: %s\n", devicename.c_str(),
307 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
308 hdd_dev->AddErrorInstance(error);
310 // HDD error was not masked by bad DIMMs: report bad HDD.
311 if (!mask_hdd_error) {
312 os_->ErrorReport(devicename.c_str(), "sector", 1);
313 error->severity_ = SAT_ERROR_FATAL;