1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
|
@Service public class DiskHealthCheckService {
@Autowired private DiskMonitorService diskMonitorService; @Autowired private AlertService alertService;
@Scheduled(fixedRate = 3600000) public void checkDiskHealth() { try { String hostname = getHostname(); List<DiskMonitorData> diskDataList = diskMonitorService.getRealTimeDiskData(); for (DiskMonitorData diskData : diskDataList) { checkSingleDiskHealth(diskData); } } catch (Exception e) { log.error("硬盘健康检查失败: {}", e.getMessage(), e); } }
private void checkSingleDiskHealth(DiskMonitorData diskData) { try { checkDiskTemperature(diskData); checkDiskIoPerformance(diskData); checkDiskSpaceUsage(diskData); checkDiskErrors(diskData); predictDiskFailure(diskData); } catch (Exception e) { log.error("检查硬盘健康状态失败: deviceName={}, error={}", diskData.getDeviceName(), e.getMessage(), e); } }
private void checkDiskTemperature(DiskMonitorData diskData) { if (diskData.getTemperature() != null) { if (diskData.getTemperature() > 70) { sendHealthAlert(diskData, "DISK_TEMPERATURE_HIGH", "WARNING", String.format("硬盘温度过高: %d°C", diskData.getTemperature())); } else if (diskData.getTemperature() > 60) { sendHealthAlert(diskData, "DISK_TEMPERATURE_WARNING", "INFO", String.format("硬盘温度较高: %d°C", diskData.getTemperature())); } } }
private void checkDiskIoPerformance(DiskMonitorData diskData) { if (diskData.getIoUtilization() > 95) { sendHealthAlert(diskData, "DISK_IO_CRITICAL", "CRITICAL", String.format("硬盘IO利用率过高: %.2f%%", diskData.getIoUtilization())); } else if (diskData.getIoUtilization() > 80) { sendHealthAlert(diskData, "DISK_IO_HIGH", "WARNING", String.format("硬盘IO利用率较高: %.2f%%", diskData.getIoUtilization())); } if (diskData.getReadLatency() > 100) { sendHealthAlert(diskData, "DISK_READ_LATENCY_HIGH", "WARNING", String.format("硬盘读取延迟过高: %.2fms", diskData.getReadLatency())); } if (diskData.getWriteLatency() > 100) { sendHealthAlert(diskData, "DISK_WRITE_LATENCY_HIGH", "WARNING", String.format("硬盘写入延迟过高: %.2fms", diskData.getWriteLatency())); } }
private void checkDiskSpaceUsage(DiskMonitorData diskData) { if (diskData.getSpaceUsage() > 95) { sendHealthAlert(diskData, "DISK_SPACE_CRITICAL", "CRITICAL", String.format("硬盘空间严重不足: %.2f%%", diskData.getSpaceUsage())); } else if (diskData.getSpaceUsage() > 85) { sendHealthAlert(diskData, "DISK_SPACE_HIGH", "WARNING", String.format("硬盘空间不足: %.2f%%", diskData.getSpaceUsage())); } }
private void checkDiskErrors(DiskMonitorData diskData) { try { checkSmartInfo(diskData); checkDiskErrorLogs(diskData); } catch (Exception e) { log.error("检查硬盘错误失败: {}", e.getMessage(), e); } }
private void checkSmartInfo(DiskMonitorData diskData) { try { ProcessBuilder pb = new ProcessBuilder("smartctl", "-H", diskData.getDeviceName()); Process process = pb.start(); int exitCode = process.waitFor(); if (exitCode != 0) { sendHealthAlert(diskData, "DISK_SMART_ERROR", "WARNING", "硬盘SMART检查失败"); } } catch (Exception e) { log.error("检查SMART信息失败: {}", e.getMessage(), e); } }
private void checkDiskErrorLogs(DiskMonitorData diskData) { try { ProcessBuilder pb = new ProcessBuilder("dmesg", "|", "grep", "-i", "error"); Process process = pb.start(); BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream())); String line; int errorCount = 0; while ((line = reader.readLine()) != null) { if (line.contains(diskData.getDeviceName()) && line.contains("error")) { errorCount++; } } if (errorCount > 10) { sendHealthAlert(diskData, "DISK_ERROR_LOG_HIGH", "WARNING", String.format("硬盘错误日志过多: %d条", errorCount)); } } catch (Exception e) { log.error("检查硬盘错误日志失败: {}", e.getMessage(), e); } }
private void predictDiskFailure(DiskMonitorData diskData) { try { Date endTime = new Date(); Date startTime = new Date(endTime.getTime() - 7 * 24 * 60 * 60 * 1000); List<DiskMonitorData> historyData = diskMonitorService.getDiskHistoryData( diskData.getHostname(), diskData.getDeviceName(), startTime, endTime); analyzeFailureTrend(diskData, historyData); } catch (Exception e) { log.error("预测硬盘故障失败: {}", e.getMessage(), e); } }
private void analyzeFailureTrend(DiskMonitorData diskData, List<DiskMonitorData> historyData) { try { double avgIoUtilization = historyData.stream() .mapToDouble(DiskMonitorData::getIoUtilization) .average() .orElse(0.0); double avgTemperature = historyData.stream() .filter(data -> data.getTemperature() != null) .mapToInt(DiskMonitorData::getTemperature) .average() .orElse(0.0); double spaceUsageTrend = calculateSpaceUsageTrend(historyData); int riskLevel = evaluateFailureRisk(avgIoUtilization, avgTemperature, spaceUsageTrend); if (riskLevel > 7) { sendHealthAlert(diskData, "DISK_FAILURE_RISK_HIGH", "WARNING", String.format("硬盘故障风险较高: 风险等级%d", riskLevel)); } else if (riskLevel > 5) { sendHealthAlert(diskData, "DISK_FAILURE_RISK_MEDIUM", "INFO", String.format("硬盘故障风险中等: 风险等级%d", riskLevel)); } } catch (Exception e) { log.error("分析故障趋势失败: {}", e.getMessage(), e); } }
private double calculateSpaceUsageTrend(List<DiskMonitorData> historyData) { if (historyData.size() < 2) { return 0.0; } double firstUsage = historyData.get(0).getSpaceUsage(); double lastUsage = historyData.get(historyData.size() - 1).getSpaceUsage(); return lastUsage - firstUsage; }
private int evaluateFailureRisk(double avgIoUtilization, double avgTemperature, double spaceUsageTrend) { int riskLevel = 0; if (avgIoUtilization > 90) { riskLevel += 3; } else if (avgIoUtilization > 70) { riskLevel += 2; } else if (avgIoUtilization > 50) { riskLevel += 1; } if (avgTemperature > 65) { riskLevel += 3; } else if (avgTemperature > 55) { riskLevel += 2; } else if (avgTemperature > 45) { riskLevel += 1; } if (spaceUsageTrend > 5) { riskLevel += 2; } else if (spaceUsageTrend > 2) { riskLevel += 1; } return Math.min(riskLevel, 10); }
private void sendHealthAlert(DiskMonitorData diskData, String alertType, String alertLevel, String alertMessage) { try { DiskAlertRecord alertRecord = new DiskAlertRecord(); alertRecord.setHostname(diskData.getHostname()); alertRecord.setDeviceName(diskData.getDeviceName()); alertRecord.setAlertType(alertType); alertRecord.setAlertLevel(alertLevel); alertRecord.setAlertMessage(alertMessage); alertRecord.setAlertStatus("ACTIVE"); alertRecord.setAlertTime(new Date()); alertService.sendDiskAlert(alertRecord); log.warn("发送硬盘健康告警: hostname={}, deviceName={}, type={}, level={}", diskData.getHostname(), diskData.getDeviceName(), alertType, alertLevel); } catch (Exception e) { log.error("发送健康告警失败: {}", e.getMessage(), e); } }
private String getHostname() { try { return InetAddress.getLocalHost().getHostName(); } catch (UnknownHostException e) { return "unknown"; } } }
|