第294集Java业务自动巡检架构实战:智能巡检、异常检测与企业级业务监控解决方案 | 字数总计: 7.2k | 阅读时长: 35分钟 | 阅读量:
前言 Java业务自动巡检是企业级应用运维的核心能力之一,直接影响业务的稳定性和用户体验。借助智能的巡检策略和完善的异常检测机制,运维团队能够及时发现业务异常、预防潜在问题,保障企业级应用的高可用性。本文从巡检策略设计到异常检测、从基础原理到企业级实践,系统梳理Java业务自动巡检的完整解决方案。
一、业务自动巡检架构设计 1.1 巡检系统整体架构
1.2 巡检核心组件
/**
 * Coordinates the inspection subsystem: registers one task per loaded inspection
 * configuration with the scheduler, then starts or stops the scheduler, the executor and
 * the exception detector as a group.
 */
@Component
public class BusinessInspectionEngine {

    @Autowired
    private InspectionTaskScheduler taskScheduler;

    @Autowired
    private InspectionExecutor executor;

    @Autowired
    private ExceptionDetector exceptionDetector;

    // Injected but not referenced anywhere in this class.
    @Autowired
    private AlertNotifier alertNotifier;

    // Injected but not referenced anywhere in this class.
    @Autowired
    private InspectionResultStorage resultStorage;

    /**
     * Registers the configured tasks and starts the scheduler, executor and detector, in that
     * order.
     *
     * @throws InspectionEngineException if any step fails; the original exception is the cause
     */
    public void startInspectionEngine() {
        try {
            initializeInspectionTasks();
            taskScheduler.startScheduler();
            executor.startExecutor();
            exceptionDetector.startDetector();

            log.info("业务巡检引擎启动成功");
        } catch (Exception e) {
            log.error("业务巡检引擎启动失败", e);
            throw new InspectionEngineException("巡检引擎启动失败", e);
        }
    }

    /**
     * Stops the detector, executor and scheduler (reverse of the start order).
     *
     * <p>Each component is stopped in its own try block so that a failure in one does not
     * leave the remaining components running. Failures are logged, never rethrown; the
     * success message is logged only when every component stopped without an exception.
     */
    public void stopInspectionEngine() {
        boolean allStopped = true;

        try {
            exceptionDetector.stopDetector();
        } catch (Exception e) {
            allStopped = false;
            log.error("业务巡检引擎停止失败", e);
        }

        try {
            executor.stopExecutor();
        } catch (Exception e) {
            allStopped = false;
            log.error("业务巡检引擎停止失败", e);
        }

        try {
            taskScheduler.stopScheduler();
        } catch (Exception e) {
            allStopped = false;
            log.error("业务巡检引擎停止失败", e);
        }

        if (allStopped) {
            log.info("业务巡检引擎停止成功");
        }
    }

    /** Builds a task from every loaded configuration and hands it to the scheduler. */
    private void initializeInspectionTasks() {
        List<InspectionConfig> configs = loadInspectionConfigs();

        for (InspectionConfig config : configs) {
            InspectionTask task = createInspectionTask(config);
            taskScheduler.scheduleTask(task);
        }

        log.info("初始化巡检任务完成,任务数量: {}", configs.size());
    }

    /**
     * Copies id, name, type, cron expression, check points and the enabled flag from the
     * configuration into a new task.
     *
     * @param config source configuration
     * @return the populated task
     */
    private InspectionTask createInspectionTask(InspectionConfig config) {
        InspectionTask task = new InspectionTask();
        task.setTaskId(config.getTaskId());
        task.setTaskName(config.getTaskName());
        task.setTaskType(config.getTaskType());
        task.setCronExpression(config.getCronExpression());
        task.setCheckPoints(config.getCheckPoints());
        task.setEnabled(config.isEnabled());
        return task;
    }

    /**
     * Placeholder: currently always returns an empty, mutable list, so no task is registered.
     *
     * @return the inspection configurations to schedule
     */
    private List<InspectionConfig> loadInspectionConfigs() {
        return new ArrayList<>();
    }
}
二、巡检任务调度与执行 2.1 巡检任务调度器
/**
 * Schedules inspection tasks on a 10-thread {@link ScheduledExecutorService} and runs their
 * check points.
 *
 * <p>NOTE(review): a cron expression is approximated by a fixed-rate schedule whose period is
 * the gap between the next two fire times (see {@link #calculatePeriod(String)}). That only
 * matches the cron expression when its fire times are evenly spaced — confirm this is
 * acceptable for the expressions in use.
 */
@Component
public class InspectionTaskScheduler {

    /** Period used when the gap between two cron fire times cannot be computed. */
    private static final long FALLBACK_PERIOD_MILLIS = 60000;

    private final ScheduledExecutorService scheduler;
    private final Map<String, ScheduledFuture<?>> scheduledTasks;
    private final Map<String, InspectionTask> taskMap;

    // The per-type check methods live on CheckPointExecutor; this class delegates to it.
    @Autowired
    private CheckPointExecutor checkPointExecutor;

    public InspectionTaskScheduler() {
        this.scheduler = Executors.newScheduledThreadPool(10);
        this.scheduledTasks = new ConcurrentHashMap<>();
        this.taskMap = new ConcurrentHashMap<>();
    }

    /** Only logs; the thread pool is already created by the constructor. */
    public void startScheduler() {
        log.info("巡检任务调度器启动");
    }

    /**
     * Cancels every scheduled task and shuts the pool down, waiting up to 30 seconds before
     * forcing termination. Failures are logged, not rethrown.
     */
    public void stopScheduler() {
        try {
            scheduledTasks.values().forEach(future -> future.cancel(true));
            scheduledTasks.clear();

            scheduler.shutdown();
            if (!scheduler.awaitTermination(30, TimeUnit.SECONDS)) {
                scheduler.shutdownNow();
            }

            log.info("巡检任务调度器停止");
        } catch (InterruptedException e) {
            // Stop waiting, force termination and keep the interrupt visible to the caller.
            scheduler.shutdownNow();
            Thread.currentThread().interrupt();
            log.error("巡检任务调度器停止失败", e);
        } catch (Exception e) {
            log.error("巡检任务调度器停止失败", e);
        }
    }

    /**
     * Schedules an enabled task: first run at the next cron fire time, then repeated at the
     * period returned by {@link #calculatePeriod(String)}. Disabled tasks and tasks whose cron
     * expression has no future fire time are skipped. Failures are logged, not rethrown.
     *
     * <p>If a task with the same id is already scheduled, its previous schedule is cancelled
     * so the task does not run twice.
     *
     * @param task the task to schedule
     */
    public void scheduleTask(InspectionTask task) {
        try {
            if (!task.isEnabled()) {
                log.info("任务已禁用,跳过调度: {}", task.getTaskName());
                return;
            }

            CronExpression cronExpression = new CronExpression(task.getCronExpression());
            Date nextExecutionTime = cronExpression.getNextValidTimeAfter(new Date());

            if (nextExecutionTime == null) {
                log.warn("无法计算下次执行时间: {}", task.getTaskName());
                return;
            }

            long delay = nextExecutionTime.getTime() - System.currentTimeMillis();

            ScheduledFuture<?> future = scheduler.scheduleAtFixedRate(
                () -> executeInspectionTask(task),
                delay,
                calculatePeriod(task.getCronExpression()),
                TimeUnit.MILLISECONDS
            );

            ScheduledFuture<?> previous = scheduledTasks.put(task.getTaskId(), future);
            if (previous != null) {
                // Let an in-flight run finish, but prevent any further runs of the old schedule.
                previous.cancel(false);
            }
            taskMap.put(task.getTaskId(), task);

            log.info("巡检任务调度成功: {}, 下次执行时间: {}", task.getTaskName(), nextExecutionTime);
        } catch (Exception e) {
            log.error("巡检任务调度失败: {}", task.getTaskName(), e);
        }
    }

    /**
     * Runs one inspection of the task. Every exception is caught and logged so that it does
     * not propagate into the periodic schedule.
     */
    private void executeInspectionTask(InspectionTask task) {
        try {
            log.info("开始执行巡检任务: {}", task.getTaskName());

            InspectionContext context = createInspectionContext(task);
            InspectionResult result = executeInspection(context);

            // NOTE(review): handleInspectionResult is not defined in this class as shown —
            // confirm where it is supposed to come from.
            handleInspectionResult(result);

            log.info("巡检任务执行完成: {}", task.getTaskName());
        } catch (Exception e) {
            log.error("巡检任务执行失败: {}", task.getTaskName(), e);
        }
    }

    /** Copies the task's identity and check points into a context stamped with the current time. */
    private InspectionContext createInspectionContext(InspectionTask task) {
        InspectionContext context = new InspectionContext();
        context.setTaskId(task.getTaskId());
        context.setTaskName(task.getTaskName());
        context.setTaskType(task.getTaskType());
        context.setStartTime(System.currentTimeMillis());
        context.setCheckPoints(task.getCheckPoints());
        return context;
    }

    /**
     * Executes every check point of the context in order. A check point that throws is
     * recorded as a FAILED result carrying the exception message, and the loop continues.
     *
     * @param context the inspection context
     * @return the aggregated result with start time, end time and duration set
     */
    private InspectionResult executeInspection(InspectionContext context) {
        InspectionResult result = new InspectionResult();
        result.setTaskId(context.getTaskId());
        result.setTaskName(context.getTaskName());
        result.setStartTime(context.getStartTime());

        List<CheckPointResult> checkPointResults = new ArrayList<>();

        for (CheckPoint checkPoint : context.getCheckPoints()) {
            try {
                CheckPointResult checkPointResult = executeCheckPoint(checkPoint);
                checkPointResults.add(checkPointResult);
            } catch (Exception e) {
                log.error("检查点执行失败: {}", checkPoint.getName(), e);

                CheckPointResult errorResult = new CheckPointResult();
                errorResult.setCheckPointName(checkPoint.getName());
                errorResult.setStatus(CheckPointStatus.FAILED);
                errorResult.setErrorMessage(e.getMessage());
                checkPointResults.add(errorResult);
            }
        }

        result.setCheckPointResults(checkPointResults);
        result.setEndTime(System.currentTimeMillis());
        result.setDuration(result.getEndTime() - result.getStartTime());

        return result;
    }

    /**
     * Dispatches the check point to the matching method of {@link CheckPointExecutor}.
     *
     * <p>An unsupported type, an exception from the delegate, or a {@code null} delegate
     * result all yield a FAILED result. End time and duration are (re)stamped here on
     * whichever result object is returned.
     *
     * @param checkPoint the check point to run
     * @return the check point result, never {@code null}
     */
    private CheckPointResult executeCheckPoint(CheckPoint checkPoint) {
        CheckPointResult result = new CheckPointResult();
        result.setCheckPointName(checkPoint.getName());
        result.setStartTime(System.currentTimeMillis());

        try {
            CheckPointResult delegated;
            switch (checkPoint.getType()) {
                case DATA_CHECK:
                    delegated = checkPointExecutor.executeDataCheck(checkPoint);
                    break;
                case INTERFACE_CHECK:
                    delegated = checkPointExecutor.executeInterfaceCheck(checkPoint);
                    break;
                case PERFORMANCE_CHECK:
                    delegated = checkPointExecutor.executePerformanceCheck(checkPoint);
                    break;
                case BUSINESS_LOGIC_CHECK:
                    delegated = checkPointExecutor.executeBusinessLogicCheck(checkPoint);
                    break;
                default:
                    throw new UnsupportedOperationException("不支持的检查点类型: " + checkPoint.getType());
            }

            if (delegated != null) {
                result = delegated;
            } else {
                // Keep the locally created result so the timing code below cannot hit null.
                result.setStatus(CheckPointStatus.FAILED);
                result.setErrorMessage("检查点未返回结果: " + checkPoint.getName());
            }
        } catch (Exception e) {
            result.setStatus(CheckPointStatus.FAILED);
            result.setErrorMessage(e.getMessage());
        }

        result.setEndTime(System.currentTimeMillis());
        result.setDuration(result.getEndTime() - result.getStartTime());

        return result;
    }

    /**
     * Returns the gap in milliseconds between the next two fire times of the expression, or
     * 60000 when the expression cannot be parsed or has fewer than two future fire times.
     *
     * @param cronExpression the cron expression text
     * @return the repeat period in milliseconds
     */
    private long calculatePeriod(String cronExpression) {
        try {
            CronExpression cron = new CronExpression(cronExpression);
            Date now = new Date();
            Date next1 = cron.getNextValidTimeAfter(now);
            Date next2 = next1 == null ? null : cron.getNextValidTimeAfter(next1);

            if (next2 == null) {
                return FALLBACK_PERIOD_MILLIS;
            }
            return next2.getTime() - next1.getTime();
        } catch (Exception e) {
            log.error("计算执行周期失败", e);
            return FALLBACK_PERIOD_MILLIS;
        }
    }
}
2.2 检查点执行器
/**
 * Facade over the four type-specific check executors. Each method forwards the check point
 * to the executor for that type and returns whatever that executor returns.
 */
@Component
public class CheckPointExecutor {

    @Autowired
    private DataCheckExecutor dataCheckExecutor;

    @Autowired
    private InterfaceCheckExecutor interfaceCheckExecutor;

    @Autowired
    private PerformanceCheckExecutor performanceCheckExecutor;

    @Autowired
    private BusinessLogicCheckExecutor businessLogicCheckExecutor;

    /**
     * Runs a data check.
     *
     * @param checkPoint the check point to run
     * @return the result produced by the data check executor
     */
    public CheckPointResult executeDataCheck(CheckPoint checkPoint) {
        final CheckPointResult outcome = dataCheckExecutor.execute(checkPoint);
        return outcome;
    }

    /**
     * Runs an interface check.
     *
     * @param checkPoint the check point to run
     * @return the result produced by the interface check executor
     */
    public CheckPointResult executeInterfaceCheck(CheckPoint checkPoint) {
        final CheckPointResult outcome = interfaceCheckExecutor.execute(checkPoint);
        return outcome;
    }

    /**
     * Runs a performance check.
     *
     * @param checkPoint the check point to run
     * @return the result produced by the performance check executor
     */
    public CheckPointResult executePerformanceCheck(CheckPoint checkPoint) {
        final CheckPointResult outcome = performanceCheckExecutor.execute(checkPoint);
        return outcome;
    }

    /**
     * Runs a business-logic check.
     *
     * @param checkPoint the check point to run
     * @return the result produced by the business-logic check executor
     */
    public CheckPointResult executeBusinessLogicCheck(CheckPoint checkPoint) {
        final CheckPointResult outcome = businessLogicCheckExecutor.execute(checkPoint);
        return outcome;
    }
}
2.3 数据检查执行器
/**
 * Executes a data check: parses the check point's JSON configuration, runs the configured
 * query and validates the returned rows against the configured rules.
 */
@Component
public class DataCheckExecutor {

    /** Shared mapper; built once instead of on every check execution. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    @Autowired
    private DataSource dataSource;

    @Autowired
    private DataValidator dataValidator;

    /**
     * Runs the data check described by the check point.
     *
     * <p>Never throws: configuration, query and validation errors are all reported as a
     * FAILED result. On every failure path the error message is populated, so consumers that
     * read {@code getErrorMessage()} of FAILED results do not see {@code null}.
     *
     * @param checkPoint the check point whose config is a JSON {@link DataCheckConfig}
     * @return the result with status, messages, queried data (when the query ran), end time
     *         and duration set
     */
    public CheckPointResult execute(CheckPoint checkPoint) {
        CheckPointResult result = new CheckPointResult();
        result.setCheckPointName(checkPoint.getName());
        result.setStartTime(System.currentTimeMillis());

        try {
            DataCheckConfig config = parseDataCheckConfig(checkPoint.getConfig());

            List<Map<String, Object>> data = executeDataQuery(config);

            ValidationResult validationResult = dataValidator.validate(data, config.getValidationRules());

            if (validationResult.isValid()) {
                result.setStatus(CheckPointStatus.PASSED);
                result.setMessage("数据检查通过");
            } else {
                String failureMessage = "数据检查失败: " + validationResult.getErrorMessage();
                result.setStatus(CheckPointStatus.FAILED);
                result.setMessage(failureMessage);
                // Mirror the failure into errorMessage, as the exception path below does.
                result.setErrorMessage(failureMessage);
                result.setErrorDetails(validationResult.getErrorDetails());
            }

            result.setCheckData(data);
        } catch (Exception e) {
            result.setStatus(CheckPointStatus.FAILED);
            result.setErrorMessage("数据检查异常: " + e.getMessage());
            log.error("数据检查执行失败: {}", checkPoint.getName(), e);
        }

        result.setEndTime(System.currentTimeMillis());
        result.setDuration(result.getEndTime() - result.getStartTime());

        return result;
    }

    /**
     * Deserializes the JSON configuration.
     *
     * @param configJson JSON text of a {@link DataCheckConfig}
     * @return the parsed configuration
     * @throws ConfigurationException if the text cannot be parsed; the cause is preserved
     */
    private DataCheckConfig parseDataCheckConfig(String configJson) {
        try {
            return OBJECT_MAPPER.readValue(configJson, DataCheckConfig.class);
        } catch (Exception e) {
            throw new ConfigurationException("数据检查配置解析失败", e);
        }
    }

    /**
     * Runs the configured SQL, passing the configured parameters when there are any.
     *
     * <p>NOTE(review): {@code queryForList} is called on the injected {@code dataSource};
     * confirm that the injected type actually offers this method.
     *
     * @param config the parsed check configuration
     * @return the rows returned by the query
     * @throws DataQueryException if the query fails; the cause is preserved
     */
    private List<Map<String, Object>> executeDataQuery(DataCheckConfig config) {
        try {
            String sql = config.getSql();
            List<Object> parameters = config.getParameters();

            if (parameters == null || parameters.isEmpty()) {
                return dataSource.queryForList(sql);
            } else {
                return dataSource.queryForList(sql, parameters.toArray());
            }
        } catch (Exception e) {
            throw new DataQueryException("数据查询失败", e);
        }
    }

    /** JSON-bound configuration of a data check: the SQL, its parameters and the validation rules. */
    public static class DataCheckConfig {
        private String sql;
        private List<Object> parameters;
        private List<ValidationRule> validationRules;

        public String getSql() { return sql; }
        public void setSql(String sql) { this.sql = sql; }

        public List<Object> getParameters() { return parameters; }
        public void setParameters(List<Object> parameters) { this.parameters = parameters; }

        public List<ValidationRule> getValidationRules() { return validationRules; }
        public void setValidationRules(List<ValidationRule> validationRules) { this.validationRules = validationRules; }
    }
}
2.4 接口检查执行器
/**
 * Executes an interface (HTTP) check: parses the check point's JSON configuration, calls the
 * configured endpoint through {@code RestTemplate} and validates the response against the
 * configured rules.
 */
@Component
public class InterfaceCheckExecutor {

    /** Shared mapper; built once instead of on every check execution. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    @Autowired
    private RestTemplate restTemplate;

    @Autowired
    private InterfaceValidator interfaceValidator;

    /**
     * Runs the interface check described by the check point.
     *
     * <p>Never throws: configuration, call and validation errors are all reported as a
     * FAILED result. On every failure path the error message is populated, so consumers that
     * read {@code getErrorMessage()} of FAILED results do not see {@code null}.
     *
     * @param checkPoint the check point whose config is a JSON {@link InterfaceCheckConfig}
     * @return the result with status, messages, response data (when the call succeeded), end
     *         time and duration set
     */
    public CheckPointResult execute(CheckPoint checkPoint) {
        CheckPointResult result = new CheckPointResult();
        result.setCheckPointName(checkPoint.getName());
        result.setStartTime(System.currentTimeMillis());

        try {
            InterfaceCheckConfig config = parseInterfaceCheckConfig(checkPoint.getConfig());

            InterfaceResponse response = executeInterfaceCall(config);

            ValidationResult validationResult = interfaceValidator.validate(response, config.getValidationRules());

            if (validationResult.isValid()) {
                result.setStatus(CheckPointStatus.PASSED);
                result.setMessage("接口检查通过");
            } else {
                String failureMessage = "接口检查失败: " + validationResult.getErrorMessage();
                result.setStatus(CheckPointStatus.FAILED);
                result.setMessage(failureMessage);
                // Mirror the failure into errorMessage, as the exception path below does.
                result.setErrorMessage(failureMessage);
                result.setErrorDetails(validationResult.getErrorDetails());
            }

            result.setResponseData(response);
        } catch (Exception e) {
            result.setStatus(CheckPointStatus.FAILED);
            result.setErrorMessage("接口检查异常: " + e.getMessage());
            log.error("接口检查执行失败: {}", checkPoint.getName(), e);
        }

        result.setEndTime(System.currentTimeMillis());
        result.setDuration(result.getEndTime() - result.getStartTime());

        return result;
    }

    /**
     * Deserializes the JSON configuration.
     *
     * @param configJson JSON text of an {@link InterfaceCheckConfig}
     * @return the parsed configuration
     * @throws ConfigurationException if the text cannot be parsed; the cause is preserved
     */
    private InterfaceCheckConfig parseInterfaceCheckConfig(String configJson) {
        try {
            return OBJECT_MAPPER.readValue(configJson, InterfaceCheckConfig.class);
        } catch (Exception e) {
            throw new ConfigurationException("接口检查配置解析失败", e);
        }
    }

    /**
     * Calls the configured endpoint and captures status code, headers, body and the elapsed
     * call time in milliseconds.
     *
     * @param config the parsed check configuration
     * @return the captured response
     * @throws InterfaceCallException if the call fails; the cause is preserved
     */
    private InterfaceResponse executeInterfaceCall(InterfaceCheckConfig config) {
        try {
            String url = config.getUrl();
            HttpMethod method = config.getMethod();
            Map<String, String> headers = config.getHeaders();
            Object requestBody = config.getRequestBody();
            int timeout = config.getTimeout();

            HttpHeaders httpHeaders = new HttpHeaders();
            if (headers != null) {
                headers.forEach(httpHeaders::set);
            }

            HttpEntity<?> requestEntity = new HttpEntity<>(requestBody, httpHeaders);

            // NOTE(review): this reconfigures the request factory of the injected RestTemplate
            // on every call. If that instance is shared, concurrent checks with different
            // timeouts would overwrite each other's settings — confirm, and confirm that the
            // factory type in use exposes these setters.
            restTemplate.getRequestFactory().setConnectTimeout(timeout);
            restTemplate.getRequestFactory().setReadTimeout(timeout);

            long callStartedAt = System.currentTimeMillis();
            ResponseEntity<String> responseEntity = restTemplate.exchange(
                url, method, requestEntity, String.class
            );
            long elapsedMillis = System.currentTimeMillis() - callStartedAt;

            InterfaceResponse response = new InterfaceResponse();
            response.setStatusCode(responseEntity.getStatusCodeValue());
            response.setHeaders(responseEntity.getHeaders().toSingleValueMap());
            response.setBody(responseEntity.getBody());
            // Elapsed latency of the call, not the wall-clock time at which it finished.
            response.setResponseTime(elapsedMillis);

            return response;
        } catch (Exception e) {
            throw new InterfaceCallException("接口调用失败", e);
        }
    }

    /**
     * JSON-bound configuration of an interface check: target URL, HTTP method, headers,
     * request body, timeout and the validation rules.
     */
    public static class InterfaceCheckConfig {
        private String url;
        private HttpMethod method;
        private Map<String, String> headers;
        private Object requestBody;
        private int timeout;
        private List<ValidationRule> validationRules;

        public String getUrl() { return url; }
        public void setUrl(String url) { this.url = url; }

        public HttpMethod getMethod() { return method; }
        public void setMethod(HttpMethod method) { this.method = method; }

        public Map<String, String> getHeaders() { return headers; }
        public void setHeaders(Map<String, String> headers) { this.headers = headers; }

        public Object getRequestBody() { return requestBody; }
        public void setRequestBody(Object requestBody) { this.requestBody = requestBody; }

        public int getTimeout() { return timeout; }
        public void setTimeout(int timeout) { this.timeout = timeout; }

        public List<ValidationRule> getValidationRules() { return validationRules; }
        public void setValidationRules(List<ValidationRule> validationRules) { this.validationRules = validationRules; }
    }
}
三、异常检测与告警 3.1 异常检测器
/**
 * Periodically analyzes recent inspection results, classifies the detected exceptions,
 * evaluates the alert rules against each detection and emits one alert per triggered rule.
 */
@Component
public class ExceptionDetector {

    @Autowired
    private InspectionResultAnalyzer resultAnalyzer;

    @Autowired
    private ExceptionClassifier exceptionClassifier;

    @Autowired
    private AlertRuleEngine alertRuleEngine;

    private final ScheduledExecutorService detectorScheduler;

    /** Disambiguates alert ids generated within the same millisecond on the same thread. */
    private final java.util.concurrent.atomic.AtomicLong alertSequence =
        new java.util.concurrent.atomic.AtomicLong();

    public ExceptionDetector() {
        this.detectorScheduler = Executors.newScheduledThreadPool(5);
    }

    /** Schedules {@link #detectExceptions()} to run immediately and then every 30 seconds. */
    public void startDetector() {
        detectorScheduler.scheduleAtFixedRate(
            this::detectExceptions,
            0,
            30,
            TimeUnit.SECONDS
        );

        log.info("异常检测器启动成功");
    }

    /**
     * Shuts the detector pool down, waiting up to 30 seconds before forcing termination.
     * Failures are logged, not rethrown.
     */
    public void stopDetector() {
        try {
            detectorScheduler.shutdown();
            if (!detectorScheduler.awaitTermination(30, TimeUnit.SECONDS)) {
                detectorScheduler.shutdownNow();
            }
            log.info("异常检测器停止成功");
        } catch (InterruptedException e) {
            // Stop waiting, force termination and keep the interrupt visible to the caller.
            detectorScheduler.shutdownNow();
            Thread.currentThread().interrupt();
            log.error("异常检测器停止失败", e);
        } catch (Exception e) {
            log.error("异常检测器停止失败", e);
        }
    }

    /**
     * One detection pass: analyze, classify, evaluate rules, trigger alerts. Every exception
     * is caught and logged so that it does not propagate into the periodic schedule.
     */
    private void detectExceptions() {
        try {
            List<InspectionResult> recentResults = getRecentInspectionResults();

            List<ExceptionDetection> detections = resultAnalyzer.analyzeResults(recentResults);

            for (ExceptionDetection detection : detections) {
                ExceptionType type = exceptionClassifier.classify(detection);
                detection.setExceptionType(type);
            }

            for (ExceptionDetection detection : detections) {
                List<AlertRule> triggeredRules = alertRuleEngine.checkRules(detection);
                if (!triggeredRules.isEmpty()) {
                    triggerAlerts(detection, triggeredRules);
                }
            }
        } catch (Exception e) {
            log.error("异常检测失败", e);
        }
    }

    /**
     * Placeholder: currently always returns an empty, mutable list.
     *
     * @return the inspection results to analyze in this pass
     */
    private List<InspectionResult> getRecentInspectionResults() {
        return new ArrayList<>();
    }

    /**
     * Creates and sends one alert per triggered rule. Each rule is handled in its own try
     * block so that a failure for one rule does not suppress the alerts of the others.
     */
    private void triggerAlerts(ExceptionDetection detection, List<AlertRule> triggeredRules) {
        for (AlertRule rule : triggeredRules) {
            try {
                Alert alert = createAlert(detection, rule);
                sendAlert(alert);
            } catch (Exception e) {
                log.error("告警触发失败", e);
            }
        }
    }

    /** Builds an alert from the rule's type, severity and title plus the detection details. */
    private Alert createAlert(ExceptionDetection detection, AlertRule rule) {
        Alert alert = new Alert();
        alert.setAlertId(generateAlertId());
        alert.setAlertType(rule.getAlertType());
        alert.setSeverity(rule.getSeverity());
        alert.setTitle(rule.getTitle());
        alert.setMessage(buildAlertMessage(detection, rule));
        alert.setDetection(detection);
        alert.setRule(rule);
        alert.setTimestamp(System.currentTimeMillis());
        return alert;
    }

    /** Currently only logs the alert title at WARN level. */
    private void sendAlert(Alert alert) {
        log.warn("发送告警: {}", alert.getTitle());
    }

    /**
     * Returns an id of the form {@code alert-<millis>-<threadId>-<sequence>}. The sequence
     * suffix keeps ids distinct when several alerts are created back to back in one pass.
     */
    private String generateAlertId() {
        return "alert-" + System.currentTimeMillis()
            + "-" + Thread.currentThread().getId()
            + "-" + alertSequence.incrementAndGet();
    }

    /** Builds the multi-line alert text from the detection and the rule name. */
    private String buildAlertMessage(ExceptionDetection detection, AlertRule rule) {
        StringBuilder message = new StringBuilder();
        message.append("检测到异常: ").append(detection.getDescription()).append("\n");
        message.append("异常类型: ").append(detection.getExceptionType()).append("\n");
        message.append("检查点: ").append(detection.getCheckPointName()).append("\n");
        message.append("检测时间: ").append(new Date(detection.getTimestamp())).append("\n");
        message.append("告警规则: ").append(rule.getName());
        return message.toString();
    }
}
3.2 巡检结果分析器
/**
 * Turns inspection results into exception detections using three analyses: individual
 * failed check points, per-task trends, and repeated error messages.
 */
@Component
public class InspectionResultAnalyzer {

    // Injected but not referenced anywhere in this class.
    @Autowired
    private InspectionResultStorage resultStorage;

    @Autowired
    private TrendAnalyzer trendAnalyzer;

    /** Disambiguates detection ids generated within the same millisecond on the same thread. */
    private final java.util.concurrent.atomic.AtomicLong detectionSequence =
        new java.util.concurrent.atomic.AtomicLong();

    /**
     * Runs all three analyses over the given results.
     *
     * @param results the inspection results to analyze
     * @return the detections collected so far; an exception during analysis is logged and
     *         the detections gathered before it are still returned
     */
    public List<ExceptionDetection> analyzeResults(List<InspectionResult> results) {
        List<ExceptionDetection> detections = new ArrayList<>();

        try {
            for (InspectionResult result : results) {
                List<ExceptionDetection> resultDetections = analyzeSingleResult(result);
                detections.addAll(resultDetections);
            }

            List<ExceptionDetection> trendDetections = analyzeTrends(results);
            detections.addAll(trendDetections);

            List<ExceptionDetection> patternDetections = analyzePatterns(results);
            detections.addAll(patternDetections);
        } catch (Exception e) {
            log.error("巡检结果分析失败", e);
        }

        return detections;
    }

    /** Creates one SINGLE_FAILURE detection for every FAILED check point of the result. */
    private List<ExceptionDetection> analyzeSingleResult(InspectionResult result) {
        List<ExceptionDetection> detections = new ArrayList<>();

        for (CheckPointResult checkPointResult : checkPointResultsOf(result)) {
            if (checkPointResult.getStatus() == CheckPointStatus.FAILED) {
                ExceptionDetection detection = new ExceptionDetection();
                detection.setDetectionId(generateDetectionId());
                detection.setTaskId(result.getTaskId());
                detection.setTaskName(result.getTaskName());
                detection.setCheckPointName(checkPointResult.getCheckPointName());
                detection.setDescription(checkPointResult.getErrorMessage());
                detection.setSeverity(calculateSeverity(checkPointResult));
                detection.setTimestamp(result.getStartTime());
                detection.setDetectionType(DetectionType.SINGLE_FAILURE);

                detections.add(detection);
            }
        }

        return detections;
    }

    /**
     * Groups results by task id and raises a TREND_ANALYSIS detection when the failure rate
     * rises by more than 0.2 (HIGH) or the response time rises by more than 0.3 (MEDIUM).
     * Exceptions are logged and the detections gathered so far are returned.
     */
    private List<ExceptionDetection> analyzeTrends(List<InspectionResult> results) {
        List<ExceptionDetection> detections = new ArrayList<>();

        try {
            Map<String, List<InspectionResult>> groupedResults = results.stream()
                .collect(Collectors.groupingBy(InspectionResult::getTaskId));

            for (Map.Entry<String, List<InspectionResult>> entry : groupedResults.entrySet()) {
                String taskId = entry.getKey();
                List<InspectionResult> taskResults = entry.getValue();

                TrendAnalysis failureRateTrend = trendAnalyzer.analyzeFailureRateTrend(taskResults);
                if (failureRateTrend.isIncreasing() && failureRateTrend.getChangeRate() > 0.2) {
                    ExceptionDetection detection = new ExceptionDetection();
                    detection.setDetectionId(generateDetectionId());
                    detection.setTaskId(taskId);
                    detection.setTaskName(taskResults.get(0).getTaskName());
                    detection.setDescription("失败率持续上升");
                    detection.setSeverity(Severity.HIGH);
                    detection.setTimestamp(System.currentTimeMillis());
                    detection.setDetectionType(DetectionType.TREND_ANALYSIS);
                    detection.setTrendAnalysis(failureRateTrend);

                    detections.add(detection);
                }

                TrendAnalysis responseTimeTrend = trendAnalyzer.analyzeResponseTimeTrend(taskResults);
                if (responseTimeTrend.isIncreasing() && responseTimeTrend.getChangeRate() > 0.3) {
                    ExceptionDetection detection = new ExceptionDetection();
                    detection.setDetectionId(generateDetectionId());
                    detection.setTaskId(taskId);
                    detection.setTaskName(taskResults.get(0).getTaskName());
                    detection.setDescription("响应时间持续增长");
                    detection.setSeverity(Severity.MEDIUM);
                    detection.setTimestamp(System.currentTimeMillis());
                    detection.setDetectionType(DetectionType.TREND_ANALYSIS);
                    detection.setTrendAnalysis(responseTimeTrend);

                    detections.add(detection);
                }
            }
        } catch (Exception e) {
            log.error("趋势分析失败", e);
        }

        return detections;
    }

    /**
     * Counts identical error messages across all FAILED check points and raises a MEDIUM
     * PATTERN_ANALYSIS detection for every message seen at least 3 times. FAILED check points
     * without an error message are not counted. Exceptions are logged and the detections
     * gathered so far are returned.
     */
    private List<ExceptionDetection> analyzePatterns(List<InspectionResult> results) {
        List<ExceptionDetection> detections = new ArrayList<>();

        try {
            Map<String, Integer> errorPatterns = new HashMap<>();

            for (InspectionResult result : results) {
                for (CheckPointResult checkPointResult : checkPointResultsOf(result)) {
                    if (checkPointResult.getStatus() == CheckPointStatus.FAILED) {
                        String errorPattern = checkPointResult.getErrorMessage();
                        if (errorPattern != null) {
                            errorPatterns.merge(errorPattern, 1, Integer::sum);
                        }
                    }
                }
            }

            for (Map.Entry<String, Integer> entry : errorPatterns.entrySet()) {
                String errorPattern = entry.getKey();
                int count = entry.getValue();

                if (count >= 3) {
                    ExceptionDetection detection = new ExceptionDetection();
                    detection.setDetectionId(generateDetectionId());
                    detection.setDescription("频繁出现错误模式: " + errorPattern);
                    detection.setSeverity(Severity.MEDIUM);
                    detection.setTimestamp(System.currentTimeMillis());
                    detection.setDetectionType(DetectionType.PATTERN_ANALYSIS);
                    detection.setPatternCount(count);

                    detections.add(detection);
                }
            }
        } catch (Exception e) {
            log.error("模式分析失败", e);
        }

        return detections;
    }

    /** Returns the result's check point list, or an empty list when it is {@code null}. */
    private List<CheckPointResult> checkPointResultsOf(InspectionResult result) {
        List<CheckPointResult> checkPointResults = result.getCheckPointResults();
        return checkPointResults != null ? checkPointResults : new ArrayList<>();
    }

    /**
     * Maps keywords in the error message to a severity; the first matching group wins.
     * A missing error message yields LOW instead of failing the whole analysis.
     */
    private Severity calculateSeverity(CheckPointResult checkPointResult) {
        String errorMessage = checkPointResult.getErrorMessage();
        if (errorMessage == null) {
            return Severity.LOW;
        }

        if (errorMessage.contains("数据库") || errorMessage.contains("连接")) {
            return Severity.CRITICAL;
        } else if (errorMessage.contains("超时") || errorMessage.contains("性能")) {
            return Severity.HIGH;
        } else if (errorMessage.contains("数据") || errorMessage.contains("业务")) {
            return Severity.MEDIUM;
        } else {
            return Severity.LOW;
        }
    }

    /**
     * Returns an id of the form {@code detection-<millis>-<threadId>-<sequence>}. The
     * sequence suffix keeps ids distinct when several detections are created in one pass.
     */
    private String generateDetectionId() {
        return "detection-" + System.currentTimeMillis()
            + "-" + Thread.currentThread().getId()
            + "-" + detectionSequence.incrementAndGet();
    }
}
3.3 告警规则引擎
/**
 * Holds the alert rules (three defaults plus any added at runtime) and decides which rules a
 * detection triggers. A rule triggers only when every one of its conditions is satisfied.
 *
 * <p>The rule map is a plain {@code HashMap}; this class does no synchronization of its own.
 */
@Component
public class AlertRuleEngine {

    private final Map<String, AlertRule> alertRules;

    public AlertRuleEngine() {
        this.alertRules = new HashMap<>();
        initializeDefaultRules();
    }

    /** Registers the default critical, high-frequency and trend rules. */
    private void initializeDefaultRules() {
        AlertRule criticalRule = new AlertRule();
        criticalRule.setRuleId("critical-exception");
        criticalRule.setName("严重异常告警");
        criticalRule.setDescription("检测到严重异常时立即告警");
        criticalRule.setSeverity(Severity.CRITICAL);
        criticalRule.setAlertType(AlertType.IMMEDIATE);
        criticalRule.setConditions(Arrays.asList(
            "exceptionType == 'CRITICAL'",
            "severity == 'CRITICAL'"
        ));
        alertRules.put(criticalRule.getRuleId(), criticalRule);

        AlertRule frequencyRule = new AlertRule();
        frequencyRule.setRuleId("high-frequency-exception");
        frequencyRule.setName("高频异常告警");
        frequencyRule.setDescription("短时间内出现大量异常时告警");
        frequencyRule.setSeverity(Severity.HIGH);
        frequencyRule.setAlertType(AlertType.BATCH);
        frequencyRule.setConditions(Arrays.asList(
            "exceptionCount > 10",
            "timeWindow == '5min'"
        ));
        alertRules.put(frequencyRule.getRuleId(), frequencyRule);

        AlertRule trendRule = new AlertRule();
        trendRule.setRuleId("trend-exception");
        trendRule.setName("趋势异常告警");
        trendRule.setDescription("异常趋势持续恶化时告警");
        trendRule.setSeverity(Severity.MEDIUM);
        trendRule.setAlertType(AlertType.TREND);
        trendRule.setConditions(Arrays.asList(
            "trendDirection == 'INCREASING'",
            "changeRate > 0.2"
        ));
        alertRules.put(trendRule.getRuleId(), trendRule);
    }

    /**
     * Evaluates every registered rule against the detection.
     *
     * @param detection the detection to test
     * @return the rules whose conditions are all satisfied; empty when none match
     */
    public List<AlertRule> checkRules(ExceptionDetection detection) {
        List<AlertRule> triggeredRules = new ArrayList<>();

        for (AlertRule rule : alertRules.values()) {
            if (evaluateRule(rule, detection)) {
                triggeredRules.add(rule);
            }
        }

        return triggeredRules;
    }

    /**
     * Returns {@code true} when all conditions of the rule hold. An exception during
     * evaluation is logged and treated as "not triggered".
     */
    private boolean evaluateRule(AlertRule rule, ExceptionDetection detection) {
        try {
            for (String condition : rule.getConditions()) {
                if (!evaluateCondition(condition, detection)) {
                    return false;
                }
            }
            return true;
        } catch (Exception e) {
            log.error("规则评估失败: {}", rule.getName(), e);
            return false;
        }
    }

    /**
     * Evaluates one condition string. Only the literal conditions used by the default rules
     * are understood; any other condition is logged and evaluates to {@code false}.
     */
    private boolean evaluateCondition(String condition, ExceptionDetection detection) {
        if (condition.contains("exceptionType == 'CRITICAL'")) {
            return detection.getExceptionType() == ExceptionType.CRITICAL;
        }

        if (condition.contains("severity == 'CRITICAL'")) {
            return detection.getSeverity() == Severity.CRITICAL;
        }

        if (condition.contains("exceptionCount > 10")) {
            return detection.getPatternCount() > 10;
        }

        if (condition.contains("timeWindow == '5min'")) {
            // NOTE(review): nothing read from the detection here describes a time window, so
            // this condition cannot be checked at this point. It is treated as satisfied;
            // otherwise the default high-frequency rule could never trigger. Confirm the
            // intended window semantics.
            return true;
        }

        if (condition.contains("trendDirection == 'INCREASING'")) {
            return detection.getTrendAnalysis() != null &&
                   detection.getTrendAnalysis().isIncreasing();
        }

        if (condition.contains("changeRate > 0.2")) {
            return detection.getTrendAnalysis() != null &&
                   detection.getTrendAnalysis().getChangeRate() > 0.2;
        }

        // Make it visible that a rule is being disabled by a condition this engine cannot evaluate.
        log.warn("不支持的告警条件: {}", condition);
        return false;
    }

    /**
     * Adds the rule, replacing any existing rule with the same id.
     *
     * @param rule the rule to register
     */
    public void addRule(AlertRule rule) {
        alertRules.put(rule.getRuleId(), rule);
    }

    /**
     * Removes the rule with the given id, if present.
     *
     * @param ruleId id of the rule to remove
     */
    public void removeRule(String ruleId) {
        alertRules.remove(ruleId);
    }

    /**
     * @return a new list containing all currently registered rules
     */
    public List<AlertRule> getAllRules() {
        return new ArrayList<>(alertRules.values());
    }
}
四、企业级巡检方案 4.1 巡检配置管理
/**
 * CRUD service for inspection configurations backed by {@code ConfigurationRepository}.
 * Configurations are validated before they are saved or updated.
 */
@Service
public class InspectionConfigService {

    @Autowired
    private ConfigurationRepository configRepository;

    /**
     * Loads a configuration by id.
     *
     * @param configId id of the configuration
     * @return the stored configuration
     * @throws ConfigNotFoundException if no configuration has this id
     */
    public InspectionConfig getConfig(String configId) {
        return configRepository.findById(configId)
            .orElseThrow(() -> new ConfigNotFoundException("巡检配置不存在: " + configId));
    }

    /**
     * Validates and stores a configuration.
     *
     * @param config the configuration to store
     * @throws ConfigSaveException if validation or the save fails; the cause is preserved
     */
    public void saveConfig(InspectionConfig config) {
        try {
            validateConfig(config);

            configRepository.save(config);

            log.info("巡检配置保存成功: {}", config.getConfigId());
        } catch (Exception e) {
            log.error("巡检配置保存失败", e);
            throw new ConfigSaveException("巡检配置保存失败", e);
        }
    }

    /**
     * Replaces the configuration stored under {@code configId}.
     *
     * <p>The id argument is authoritative: it is written into the payload before validation,
     * so a payload that carries no id (or a different one) is accepted and stored under
     * {@code configId}.
     *
     * @param configId id of the configuration to update
     * @param config   the new content
     * @throws ConfigUpdateException if the configuration does not exist, validation fails or
     *                               the save fails; the cause is preserved
     */
    public void updateConfig(String configId, InspectionConfig config) {
        try {
            if (!configRepository.existsById(configId)) {
                throw new ConfigNotFoundException("巡检配置不存在: " + configId);
            }

            if (config != null) {
                config.setConfigId(configId);
            }
            validateConfig(config);

            configRepository.save(config);

            log.info("巡检配置更新成功: {}", configId);
        } catch (Exception e) {
            log.error("巡检配置更新失败", e);
            throw new ConfigUpdateException("巡检配置更新失败", e);
        }
    }

    /**
     * Deletes the configuration stored under {@code configId}.
     *
     * @param configId id of the configuration to delete
     * @throws ConfigDeleteException if the configuration does not exist or the delete fails;
     *                               the cause is preserved
     */
    public void deleteConfig(String configId) {
        try {
            if (!configRepository.existsById(configId)) {
                throw new ConfigNotFoundException("巡检配置不存在: " + configId);
            }

            configRepository.deleteById(configId);

            log.info("巡检配置删除成功: {}", configId);
        } catch (Exception e) {
            log.error("巡检配置删除失败", e);
            throw new ConfigDeleteException("巡检配置删除失败", e);
        }
    }

    /**
     * @return every stored configuration, as returned by the repository
     */
    public List<InspectionConfig> getAllConfigs() {
        return configRepository.findAll();
    }

    /**
     * Checks that the configuration is present, that its id, task name, cron expression and
     * check points are non-empty, and that the cron expression parses.
     *
     * @throws ConfigValidationException describing the first violated rule
     */
    private void validateConfig(InspectionConfig config) {
        if (config == null) {
            throw new ConfigValidationException("巡检配置不能为空");
        }

        if (config.getConfigId() == null || config.getConfigId().isEmpty()) {
            throw new ConfigValidationException("配置ID不能为空");
        }

        if (config.getTaskName() == null || config.getTaskName().isEmpty()) {
            throw new ConfigValidationException("任务名称不能为空");
        }

        if (config.getCronExpression() == null || config.getCronExpression().isEmpty()) {
            throw new ConfigValidationException("Cron表达式不能为空");
        }

        if (config.getCheckPoints() == null || config.getCheckPoints().isEmpty()) {
            throw new ConfigValidationException("检查点不能为空");
        }

        try {
            // Constructed only to verify that the expression parses.
            new CronExpression(config.getCronExpression());
        } catch (Exception e) {
            throw new ConfigValidationException("Cron表达式格式错误", e);
        }
    }
}
4.2 巡检结果存储
/**
 * Persists inspection results with their check point results and offers lookups and
 * aggregate statistics over the stored results.
 */
@Service
public class InspectionResultStorageService {

    @Autowired
    private InspectionResultRepository resultRepository;

    @Autowired
    private CheckPointResultRepository checkPointResultRepository;

    /**
     * Saves the inspection result, then each of its check point results linked to it by the
     * result id. A result without check point results is stored on its own.
     *
     * <p>NOTE(review): no transaction is started in this method; if a check point save fails,
     * whether the already-saved rows are rolled back depends on configuration not visible
     * here — confirm.
     *
     * @param result the inspection result to store
     * @throws ResultStorageException if any save fails; the cause is preserved
     */
    public void storeResult(InspectionResult result) {
        try {
            resultRepository.save(result);

            List<CheckPointResult> checkPointResults = result.getCheckPointResults();
            if (checkPointResults != null) {
                for (CheckPointResult checkPointResult : checkPointResults) {
                    checkPointResult.setInspectionResultId(result.getResultId());
                    checkPointResultRepository.save(checkPointResult);
                }
            }

            log.info("巡检结果存储成功: {}", result.getResultId());
        } catch (Exception e) {
            log.error("巡检结果存储失败", e);
            throw new ResultStorageException("巡检结果存储失败", e);
        }
    }

    /**
     * Loads a result by id.
     *
     * @param resultId id of the inspection result
     * @return the stored result
     * @throws ResultNotFoundException if no result has this id
     */
    public InspectionResult getResult(String resultId) {
        return resultRepository.findById(resultId)
            .orElseThrow(() -> new ResultNotFoundException("巡检结果不存在: " + resultId));
    }

    /**
     * @param limit value passed through to the repository query
     * @return the repository's recent results
     */
    public List<InspectionResult> getRecentResults(int limit) {
        return resultRepository.findRecentResults(limit);
    }

    /**
     * @param taskId id of the task
     * @param limit  value passed through to the repository query
     * @return the repository's results for the task
     */
    public List<InspectionResult> getResultsByTask(String taskId, int limit) {
        return resultRepository.findByTaskIdOrderByStartTimeDesc(taskId, limit);
    }

    /**
     * Aggregates the results of a task whose start time lies between the two dates: total,
     * success and failure counts, success rate (only set when there is at least one result),
     * and average, maximum and minimum duration (all 0 when there are no results).
     *
     * @param taskId    id of the task
     * @param startTime lower bound passed to the repository query
     * @param endTime   upper bound passed to the repository query
     * @return the computed statistics
     * @throws StatisticsException if the query or the aggregation fails; the cause is preserved
     */
    public InspectionStatistics getStatistics(String taskId, Date startTime, Date endTime) {
        InspectionStatistics statistics = new InspectionStatistics();

        try {
            List<InspectionResult> results = resultRepository.findByTaskIdAndStartTimeBetween(
                taskId, startTime, endTime
            );

            int successCount = 0;
            int failureCount = 0;
            for (InspectionResult result : results) {
                if (result.getStatus() == InspectionStatus.SUCCESS) {
                    successCount++;
                } else if (result.getStatus() == InspectionStatus.FAILURE) {
                    failureCount++;
                }
            }

            statistics.setTotalCount(results.size());
            statistics.setSuccessCount(successCount);
            statistics.setFailureCount(failureCount);

            if (statistics.getTotalCount() > 0) {
                statistics.setSuccessRate((double) statistics.getSuccessCount() / statistics.getTotalCount());
            }

            // One pass yields average, max and min together.
            java.util.LongSummaryStatistics durations = results.stream()
                .mapToLong(InspectionResult::getDuration)
                .summaryStatistics();

            // With no samples getMax()/getMin() return Long.MIN_VALUE/Long.MAX_VALUE; report 0.
            boolean hasSamples = durations.getCount() > 0;
            statistics.setAverageDuration(hasSamples ? durations.getAverage() : 0.0);
            statistics.setMaxDuration(hasSamples ? durations.getMax() : 0L);
            statistics.setMinDuration(hasSamples ? durations.getMin() : 0L);
        } catch (Exception e) {
            log.error("获取巡检统计信息失败", e);
            throw new StatisticsException("获取巡检统计信息失败", e);
        }

        return statistics;
    }
}
4.3 巡检报告生成 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 @Service public class InspectionReportService { @Autowired private InspectionResultStorageService resultStorageService; @Autowired private ReportTemplateService templateService; public InspectionReport generateReport (String taskId, Date startTime, Date endTime) { InspectionReport report = new InspectionReport (); try { report.setReportId(generateReportId()); report.setTaskId(taskId); report.setStartTime(startTime); report.setEndTime(endTime); report.setGenerateTime(new Date ()); List<InspectionResult> results = resultStorageService.getResultsByTask(taskId, 100 ); InspectionStatistics statistics = resultStorageService.getStatistics(taskId, startTime, endTime); report.setStatistics(statistics); TrendAnalysis trendAnalysis = generateTrendAnalysis(results); report.setTrendAnalysis(trendAnalysis); ExceptionAnalysis exceptionAnalysis = generateExceptionAnalysis(results); report.setExceptionAnalysis(exceptionAnalysis); List<Recommendation> recommendations = 
generateRecommendations(statistics, trendAnalysis, exceptionAnalysis); report.setRecommendations(recommendations); String summary = generateSummary(report); report.setSummary(summary); log.info("巡检报告生成成功: {}" , report.getReportId()); } catch (Exception e) { log.error("巡检报告生成失败" , e); throw new ReportGenerationException ("巡检报告生成失败" , e); } return report; } private TrendAnalysis generateTrendAnalysis (List<InspectionResult> results) { TrendAnalysis analysis = new TrendAnalysis (); try { results.sort(Comparator.comparing(InspectionResult::getStartTime)); List<Double> successRates = new ArrayList <>(); for (InspectionResult result : results) { double successRate = calculateSuccessRate(result); successRates.add(successRate); } analysis.setSuccessRateTrend(analyzeTrend(successRates)); List<Long> durations = results.stream() .map(InspectionResult::getDuration) .collect(Collectors.toList()); analysis.setDurationTrend(analyzeTrend(durations)); List<Integer> exceptionCounts = results.stream() .map(result -> (int ) result.getCheckPointResults().stream() .filter(cp -> cp.getStatus() == CheckPointStatus.FAILED) .count()) .collect(Collectors.toList()); analysis.setExceptionTrend(analyzeTrend(exceptionCounts)); } catch (Exception e) { log.error("趋势分析生成失败" , e); } return analysis; } private ExceptionAnalysis generateExceptionAnalysis (List<InspectionResult> results) { ExceptionAnalysis analysis = new ExceptionAnalysis (); try { Map<String, Integer> exceptionTypeCount = new HashMap <>(); Map<String, Integer> exceptionMessageCount = new HashMap <>(); for (InspectionResult result : results) { for (CheckPointResult checkPointResult : result.getCheckPointResults()) { if (checkPointResult.getStatus() == CheckPointStatus.FAILED) { String errorMessage = checkPointResult.getErrorMessage(); exceptionMessageCount.merge(errorMessage, 1 , Integer::sum); String exceptionType = classifyException(errorMessage); exceptionTypeCount.merge(exceptionType, 1 , Integer::sum); } } } 
analysis.setExceptionTypeCount(exceptionTypeCount); analysis.setExceptionMessageCount(exceptionMessageCount); String mostCommonException = exceptionMessageCount.entrySet().stream() .max(Map.Entry.comparingByValue()) .map(Map.Entry::getKey) .orElse("无" ); analysis.setMostCommonException(mostCommonException); } catch (Exception e) { log.error("异常分析生成失败" , e); } return analysis; } private List<Recommendation> generateRecommendations (InspectionStatistics statistics, TrendAnalysis trendAnalysis, ExceptionAnalysis exceptionAnalysis) { List<Recommendation> recommendations = new ArrayList <>(); try { if (statistics.getSuccessRate() < 0.8 ) { Recommendation rec = new Recommendation (); rec.setType(RecommendationType.SUCCESS_RATE); rec.setPriority(Priority.HIGH); rec.setTitle("提升巡检成功率" ); rec.setDescription("当前巡检成功率为" + String.format("%.2f" , statistics.getSuccessRate() * 100 ) + "%,建议检查系统稳定性" ); recommendations.add(rec); } if (statistics.getAverageDuration() > 30000 ) { Recommendation rec = new Recommendation (); rec.setType(RecommendationType.PERFORMANCE); rec.setPriority(Priority.MEDIUM); rec.setTitle("优化巡检性能" ); rec.setDescription("平均执行时间为" + statistics.getAverageDuration() + "ms,建议优化检查点性能" ); recommendations.add(rec); } if (exceptionAnalysis.getMostCommonException() != null && !exceptionAnalysis.getMostCommonException().equals("无" )) { Recommendation rec = new Recommendation (); rec.setType(RecommendationType.EXCEPTION_HANDLING); rec.setPriority(Priority.MEDIUM); rec.setTitle("处理常见异常" ); rec.setDescription("最常见的异常是:" + exceptionAnalysis.getMostCommonException() + ",建议优先处理" ); recommendations.add(rec); } } catch (Exception e) { log.error("建议生成失败" , e); } return recommendations; } private String generateSummary (InspectionReport report) { StringBuilder summary = new StringBuilder (); summary.append("巡检报告摘要\n" ); summary.append("任务ID: " ).append(report.getTaskId()).append("\n" ); summary.append("统计期间: " ).append(report.getStartTime()).append(" - " 
).append(report.getEndTime()).append("\n" ); summary.append("总执行次数: " ).append(report.getStatistics().getTotalCount()).append("\n" ); summary.append("成功率: " ).append(String.format("%.2f" , report.getStatistics().getSuccessRate() * 100 )).append("%\n" ); summary.append("平均执行时间: " ).append(report.getStatistics().getAverageDuration()).append("ms\n" ); if (!report.getRecommendations().isEmpty()) { summary.append("主要建议: " ).append(report.getRecommendations().get(0 ).getTitle()).append("\n" ); } return summary.toString(); } private double calculateSuccessRate (InspectionResult result) { int totalCheckPoints = result.getCheckPointResults().size(); int successCheckPoints = (int ) result.getCheckPointResults().stream() .filter(cp -> cp.getStatus() == CheckPointStatus.PASSED) .count(); return totalCheckPoints > 0 ? (double ) successCheckPoints / totalCheckPoints : 0.0 ; } private TrendDirection analyzeTrend (List<? extends Number> values) { if (values.size() < 2 ) { return TrendDirection.STABLE; } double firstHalf = values.subList(0 , values.size() / 2 ).stream() .mapToDouble(Number::doubleValue) .average() .orElse(0.0 ); double secondHalf = values.subList(values.size() / 2 , values.size()).stream() .mapToDouble(Number::doubleValue) .average() .orElse(0.0 ); if (secondHalf > firstHalf * 1.1 ) { return TrendDirection.INCREASING; } else if (secondHalf < firstHalf * 0.9 ) { return TrendDirection.DECREASING; } else { return TrendDirection.STABLE; } } private String classifyException (String errorMessage) { if (errorMessage.contains("数据库" ) || errorMessage.contains("SQL" )) { return "数据库异常" ; } else if (errorMessage.contains("网络" ) || errorMessage.contains("连接" )) { return "网络异常" ; } else if (errorMessage.contains("超时" )) { return "超时异常" ; } else if (errorMessage.contains("权限" ) || errorMessage.contains("认证" )) { return "权限异常" ; } else { return "其他异常" ; } } private String generateReportId () { return "report-" + System.currentTimeMillis() + "-" + Thread.currentThread().getId(); } 
}
五、最佳实践与总结
5.1 业务自动巡检最佳实践
巡检策略设计
根据业务特点设计巡检策略
设置合理的巡检频率
覆盖关键业务节点
检查点设计
设计全面的检查点
包括数据检查、接口检查、性能检查
设置合理的检查阈值
异常处理机制
建立完善的异常分类
实现智能告警
提供自动修复能力
监控告警体系
建立多层次的监控
设置合理的告警阈值
实现智能告警降噪
5.2 架构师级巡检技能
系统性思维
从全局角度设计巡检体系
考虑巡检对系统性能的影响
设计可扩展的巡检架构
业务理解能力
深入理解业务逻辑
识别关键业务节点
设计有效的检查策略
异常分析能力
快速识别异常模式
分析异常根因
制定有效的解决方案
运维自动化
5.3 持续改进建议
巡检策略优化
异常处理改进
监控体系完善
知识积累
总结
Java业务自动巡检是企业级应用运维的核心能力,通过智能的巡检策略、完善的异常检测机制和系统化的处理流程,能够及时发现业务异常,预防潜在问题,保障企业级应用的高可用性。本文从巡检策略设计到异常检测,从基础原理到企业级实践,系统梳理了Java业务自动巡检的完整解决方案。
关键要点:
智能巡检策略:根据业务特点设计全面的巡检策略
异常检测机制:实现智能的异常检测和分类
监控告警体系:建立完善的监控和告警机制
企业级实践:巡检配置管理、结果存储、报告生成
通过深入理解这些技术要点,架构师能够设计出完善的业务自动巡检系统,提升系统的稳定性和可靠性,确保企业级应用的高可用性。