第103集SkyWalking分布式链路追踪运维监控与优化实战 | 字数总计: 5.4k | 阅读时长: 26分钟 | 阅读量:
1. SkyWalking分布式链路追踪运维概述 SkyWalking作为优秀的分布式链路追踪和APM监控系统,在生产环境中需要专业的运维监控和管理。本文将详细介绍SkyWalking部署配置、监控指标采集、性能分析优化、告警配置的完整解决方案,帮助运维人员有效管理SkyWalking集群。
1.1 核心挑战
APM监控 : 实时监控应用性能指标和业务指标
链路追踪 : 分布式服务调用链路追踪和分析
性能分析 : 慢查询分析和性能瓶颈定位
告警配置 : 智能告警规则和通知机制
集群管理 : SkyWalking集群部署和运维管理
1.2 技术架构
SkyWalking监控 → 数据采集 → 链路分析 → 性能优化 → 告警通知
     ↓              ↓           ↓           ↓           ↓
APM指标 → Agent探针 → OAP服务器 → UI展示 → 告警引擎
     ↓              ↓           ↓           ↓           ↓
链路追踪 → 性能分析 → 瓶颈定位 → 自动优化 → 运维记录
2. SkyWalking部署与配置 2.1 Maven依赖配置 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 <dependencies > <dependency > <groupId > org.springframework.boot</groupId > <artifactId > spring-boot-starter-web</artifactId > <version > 2.7.0</version > </dependency > <dependency > <groupId > org.apache.skywalking</groupId > <artifactId > apm-toolkit-logback-1.x</artifactId > <version > 8.15.0</version > </dependency > <dependency > <groupId > org.apache.skywalking</groupId > <artifactId > skywalking-spring-boot-starter</artifactId > <version > 8.15.0</version > </dependency > <dependency > <groupId > mysql</groupId > <artifactId > mysql-connector-java</artifactId > <version > 8.0.33</version > </dependency > <dependency > <groupId > org.elasticsearch.client</groupId > <artifactId > elasticsearch-rest-high-level-client</artifactId > <version > 7.17.9</version > </dependency > </dependencies >
2.2 应用端SkyWalking Agent配置(application.yml) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 server: port: 8080 spring: application: name: skywalking-demo skywalking: agent: service_name: ${SW_AGENT_NAME:skywalking-demo} collector: backend_service: ${SW_AGENT_COLLECTOR_BACKEND_SERVICES:127.0.0.1:11800} logging: level: INFO plugin: mysql: enabled: true spring: enabled: true
2.3 OAP服务器配置文件 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 cluster: selector: standalone core: selector: default default: dataKeeperExecutePeriod: 5 recordDataTTL: 3 metricsDataTTL: 7 sampleRate: 10000 cache: size: 10000 ttl: 60 storage: selector: elasticsearch elasticsearch: nameSpace: ${SW_NAMESPACE:""} clusterNodes: ${SW_STORAGE_ES_CLUSTER_NODES:localhost:9200} protocol: ${SW_STORAGE_ES_HTTP_PROTOCOL:"http"} user: ${SW_ES_USER:""} password: ${SW_ES_PASSWORD:""} indexShardsNumber: ${SW_STORAGE_ES_INDEX_SHARDS_NUMBER:1} indexReplicasNumber: ${SW_STORAGE_ES_INDEX_REPLICAS_NUMBER:0} indexTemplateOrder: 0 bulkActions: ${SW_STORAGE_ES_BULK_ACTIONS:5000} flushInterval: ${SW_STORAGE_ES_FLUSH_INTERVAL:15} concurrentRequests: ${SW_STORAGE_ES_CONCURRENT_REQUESTS:2} resultWindowMaxSize: ${SW_STORAGE_ES_QUERY_MAX_WINDOW_SIZE:10000} metadataQueryMaxSize: ${SW_STORAGE_ES_QUERY_MAX_SIZE:5000} segmentQueryMaxSize: ${SW_STORAGE_ES_QUERY_SEGMENT_SIZE:200}
3. SkyWalking监控指标采集 3.1 应用性能监控 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 @Service @Slf4j public class SkyWalkingMonitorService { @Autowired private MeterRegistry meterRegistry; @Scheduled(fixedRate = 30000) public void monitorJvmMetrics () { try { MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean(); MemoryUsage heapUsage = memoryBean.getHeapMemoryUsage(); MemoryUsage nonHeapUsage = memoryBean.getNonHeapMemoryUsage(); meterRegistry.gauge("jvm.memory.heap.used" , heapUsage.getUsed()); meterRegistry.gauge("jvm.memory.heap.max" , heapUsage.getMax()); meterRegistry.gauge("jvm.memory.nonheap.used" , nonHeapUsage.getUsed()); List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans(); for (GarbageCollectorMXBean gcBean : gcBeans) { meterRegistry.gauge("jvm.gc.collections" , gcBean.getCollectionCount()); meterRegistry.gauge("jvm.gc.time" , gcBean.getCollectionTime()); } ThreadMXBean threadBean = ManagementFactory.getThreadMXBean(); meterRegistry.gauge("jvm.threads.live" , threadBean.getThreadCount()); meterRegistry.gauge("jvm.threads.daemon" , threadBean.getDaemonThreadCount()); log.info("JVM监控指标采集完成" ); } catch (Exception e) { log.error("JVM监控指标采集失败" , e); } } public void recordBusinessMetrics (String operation, long duration, boolean success) { try { Timer.Sample sample = Timer.start(meterRegistry); sample.stop(Timer.builder("business.operation.duration" ) .tag("operation" , operation) .tag("success" , String.valueOf(success)) .register(meterRegistry)); Counter.builder("business.operation.count" ) .tag("operation" , operation) .tag("success" , String.valueOf(success)) .register(meterRegistry) .increment(); log.debug("业务指标记录完成: operation={}, duration={}, success={}" , operation, duration, 
success); } catch (Exception e) { log.error("业务指标记录失败" , e); } } @Scheduled(fixedRate = 60000) public void monitorDatabasePool () { try { DataSource dataSource = SpringContextUtils.getBean(DataSource.class); if (dataSource instanceof HikariDataSource) { HikariDataSource hikariDataSource = (HikariDataSource) dataSource; HikariPoolMXBean poolBean = hikariDataSource.getHikariPoolMXBean(); meterRegistry.gauge("db.pool.active" , poolBean.getActiveConnections()); meterRegistry.gauge("db.pool.idle" , poolBean.getIdleConnections()); meterRegistry.gauge("db.pool.total" , poolBean.getTotalConnections()); meterRegistry.gauge("db.pool.waiting" , poolBean.getThreadsAwaitingConnection()); log.info("数据库连接池监控完成" ); } } catch (Exception e) { log.error("数据库连接池监控失败" , e); } } }
3.2 分布式链路追踪 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 @Service @Slf4j public class SkyWalkingTraceService { public void traceBusinessOperation (String operationName, Runnable operation) { AbstractSpan span = ContextManager.createLocalSpan(operationName); try { span.setComponent(ComponentsDefine.SPRING_MVC_ANNOTATION); span.tag("operation.type" , "business" ); span.tag("operation.name" , operationName); operation.run(); span.tag("operation.status" , "success" ); } catch (Exception e) { span.tag("operation.status" , "error" ); span.tag("error.message" , e.getMessage()); span.errorOccurred(); throw e; } finally { ContextManager.stopSpan(); } } @Async public CompletableFuture<String> traceAsyncOperation (String operationName) { return CompletableFuture.supplyAsync(() -> { AbstractSpan span = ContextManager.createLocalSpan(operationName); try { span.setComponent(ComponentsDefine.SPRING_MVC_ANNOTATION); span.tag("operation.type" , "async" ); Thread.sleep(1000 ); span.tag("operation.status" , "success" ); return "异步操作完成" ; } catch (Exception e) { span.tag("operation.status" , "error" ); span.errorOccurred(); throw new RuntimeException (e); } finally { ContextManager.stopSpan(); } }); } public String traceHttpRequest (String url, Map<String, Object> params) { AbstractSpan span = ContextManager.createLocalSpan("http.request" ); try { span.setComponent(ComponentsDefine.HTTPCLIENT); span.tag("http.url" , url); span.tag("http.method" , "POST" ); RestTemplate restTemplate = new RestTemplate (); HttpHeaders headers = new HttpHeaders (); headers.setContentType(MediaType.APPLICATION_JSON); HttpEntity<Map<String, Object>> entity = new HttpEntity <>(params, headers); ResponseEntity<String> response = restTemplate.postForEntity(url, entity, 
String.class); span.tag("http.status_code" , String.valueOf(response.getStatusCodeValue())); span.tag("http.response_size" , String.valueOf(response.getBody().length())); return response.getBody(); } catch (Exception e) { span.tag("http.status_code" , "500" ); span.errorOccurred(); throw e; } finally { ContextManager.stopSpan(); } } }
4. SkyWalking性能分析与优化 4.1 性能分析服务 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 @Service @Slf4j public class SkyWalkingPerformanceService { @Autowired private SkyWalkingQueryService queryService; public List<SlowQueryAnalysis> analyzeSlowQueries (String serviceName, long startTime, long endTime) { try { List<SlowQueryAnalysis> slowQueries = new ArrayList <>(); List<Trace> traces = queryService.queryTraces(serviceName, startTime, endTime); for (Trace trace : traces) { if (trace.getDuration() > 1000 ) { SlowQueryAnalysis analysis = new SlowQueryAnalysis (); analysis.setTraceId(trace.getTraceId()); analysis.setDuration(trace.getDuration()); analysis.setStartTime(trace.getStartTime()); analysis.setEndTime(trace.getEndTime()); List<Span> spans = trace.getSpans(); for (Span span : spans) { if (span.getComponentId() == ComponentsDefine.MYSQL_JDBC_DRIVER) { analysis.setSqlOperation(span.getOperationName()); analysis.setDatabaseName(span.getPeer()); break ; } } slowQueries.add(analysis); } } slowQueries.sort((a, b) -> Long.compare(b.getDuration(), a.getDuration())); log.info("慢查询分析完成,发现{}个慢查询" , slowQueries.size()); return slowQueries; } catch (Exception e) { log.error("慢查询分析失败" , e); return Collections.emptyList(); } } public PerformanceBottleneck analyzePerformanceBottleneck (String serviceName, long startTime, long endTime) { try { PerformanceBottleneck bottleneck = new PerformanceBottleneck (); List<Trace> traces = queryService.queryTraces(serviceName, startTime, endTime); Map<String, Long> componentDurations = new HashMap <>(); Map<String, Integer> componentCounts = new HashMap <>(); for (Trace trace : traces) { List<Span> spans = 
trace.getSpans(); for (Span span : spans) { String component = getComponentName(span.getComponentId()); componentDurations.merge(component, span.getDuration(), Long::sum); componentCounts.merge(component, 1 , Integer::sum); } } String slowestComponent = componentDurations.entrySet().stream() .max(Map.Entry.comparingByValue()) .map(Map.Entry::getKey) .orElse("unknown" ); bottleneck.setSlowestComponent(slowestComponent); bottleneck.setComponentDurations(componentDurations); bottleneck.setComponentCounts(componentCounts); long errorCount = traces.stream() .mapToLong(trace -> trace.getSpans().stream() .mapToLong(span -> span.isError() ? 1 : 0 ) .sum()) .sum(); double errorRate = (double ) errorCount / traces.size() * 100 ; bottleneck.setErrorRate(errorRate); log.info("性能瓶颈分析完成,最慢组件: {}, 错误率: {}%" , slowestComponent, errorRate); return bottleneck; } catch (Exception e) { log.error("性能瓶颈分析失败" , e); return new PerformanceBottleneck (); } } private String getComponentName (int componentId) { switch (componentId) { case ComponentsDefine.SPRING_MVC_ANNOTATION: return "Spring MVC" ; case ComponentsDefine.MYSQL_JDBC_DRIVER: return "MySQL" ; case ComponentsDefine.REDIS: return "Redis" ; case ComponentsDefine.HTTPCLIENT: return "HTTP Client" ; default : return "Unknown" ; } } }
4.2 性能优化建议 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 @Service @Slf4j public class SkyWalkingOptimizationService { @Autowired private SkyWalkingPerformanceService performanceService; public List<OptimizationSuggestion> generateOptimizationSuggestions (String serviceName) { List<OptimizationSuggestion> suggestions = new ArrayList <>(); try { long endTime = System.currentTimeMillis(); long startTime = endTime - 3600000 ; PerformanceBottleneck bottleneck = performanceService.analyzePerformanceBottleneck(serviceName, startTime, endTime); if (bottleneck.getSlowestComponent().equals("MySQL" )) { suggestions.add(OptimizationSuggestion.builder() .category("数据库优化" ) .priority("高" ) .description("MySQL查询耗时过长,建议优化SQL语句和索引" ) .action("1. 检查慢查询日志\n2. 优化SQL语句\n3. 添加合适的索引\n4. 考虑读写分离" ) .build()); } if (bottleneck.getComponentDurations().containsKey("Redis" )) { suggestions.add(OptimizationSuggestion.builder() .category("缓存优化" ) .priority("中" ) .description("Redis操作耗时较长,建议优化缓存策略" ) .action("1. 检查Redis连接池配置\n2. 优化缓存键设计\n3. 考虑使用Redis集群\n4. 调整缓存过期时间" ) .build()); } if (bottleneck.getComponentDurations().containsKey("HTTP Client" )) { suggestions.add(OptimizationSuggestion.builder() .category("网络优化" ) .priority("中" ) .description("HTTP调用耗时较长,建议优化网络配置" ) .action("1. 调整连接超时时间\n2. 使用连接池\n3. 考虑使用异步调用\n4. 检查网络延迟" ) .build()); } if (bottleneck.getErrorRate() > 5.0 ) { suggestions.add(OptimizationSuggestion.builder() .category("稳定性优化" ) .priority("高" ) .description("错误率过高,建议检查系统稳定性" ) .action("1. 
检查错误日志\n2. 增加异常处理\n3. 实施熔断机制\n4. 优化重试策略" ) .build()); } log.info("性能优化建议生成完成,共{}条建议" , suggestions.size()); return suggestions; } catch (Exception e) { log.error("性能优化建议生成失败" , e); return Collections.emptyList(); } } @Scheduled(fixedRate = 300000) public void autoPerformanceTuning () { try { log.info("开始自动性能调优" ); List<String> services = getActiveServices(); for (String service : services) { List<OptimizationSuggestion> suggestions = generateOptimizationSuggestions(service); for (OptimizationSuggestion suggestion : suggestions) { if ("高" .equals(suggestion.getPriority())) { executeOptimization(service, suggestion); } } } log.info("自动性能调优完成" ); } catch (Exception e) { log.error("自动性能调优失败" , e); } } private void executeOptimization (String serviceName, OptimizationSuggestion suggestion) { try { log.info("执行优化操作: service={}, category={}" , serviceName, suggestion.getCategory()); switch (suggestion.getCategory()) { case "数据库优化" : optimizeDatabase(serviceName); break ; case "缓存优化" : optimizeCache(serviceName); break ; case "网络优化" : optimizeNetwork(serviceName); break ; default : log.warn("未知的优化类别: {}" , suggestion.getCategory()); } } catch (Exception e) { log.error("优化操作执行失败" , e); } } private void optimizeDatabase (String serviceName) { log.info("执行数据库优化: {}" , serviceName); } private void optimizeCache (String serviceName) { log.info("执行缓存优化: {}" , serviceName); } private void optimizeNetwork (String serviceName) { log.info("执行网络优化: {}" , serviceName); } private List<String> getActiveServices () { return Arrays.asList("user-service" , "order-service" , "payment-service" ); } }
5. SkyWalking告警配置与管理 5.1 告警规则配置 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 rules: service_resp_time_rule: metrics-name: service_resp_time op: ">" threshold: 1000 period: 10 count: 3 silence-period: 5 message: "服务响应时间超过阈值" service_error_rate_rule: metrics-name: service_error_rate op: ">" threshold: 0.05 period: 10 count: 2 silence-period: 5 message: "服务错误率超过阈值" jvm_memory_usage_rule: metrics-name: jvm_memory_usage op: ">" threshold: 0.8 period: 10 count: 2 silence-period: 5 message: "JVM内存使用率过高" database_connection_rule: metrics-name: database_connection_count op: ">" threshold: 80 period: 10 count: 2 silence-period: 5 message: "数据库连接数过多" webhooks: dingtalk: textTemplate: | SkyWalking告警通知 服务: {{serviceName}} 告警: {{alarmMessage}} 时间: {{alarmTime}} 详情: {{alarmDetails}} secret: "your-dingtalk-secret" webhook: "https://oapi.dingtalk.com/robot/send?access_token=your-token" wechat: textTemplate: | SkyWalking告警通知 服务: {{serviceName}} 告警: {{alarmMessage}} 时间: {{alarmTime}} 详情: {{alarmDetails}} webhook: "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=your-key" email: textTemplate: | SkyWalking告警通知 服务: {{serviceName}} 告警: {{alarmMessage}} 时间: {{alarmTime}} 详情: {{alarmDetails}} smtp: host: "smtp.qq.com" port: 587 username: "your-email@qq.com" password: "your-password" ssl: true receivers: - "admin@company.com" - "ops@company.com"
5.2 告警管理服务 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 @Service @Slf4j public class SkyWalkingAlarmService { @Autowired private AlarmWebhookService webhookService; @Autowired private AlarmRuleService ruleService; @EventListener public void handleAlarmEvent (AlarmEvent event) { try { log.info("收到告警事件: {}" , event); AlarmRule rule = ruleService.getRule(event.getRuleName()); if (rule == null ) { log.warn("未找到告警规则: {}" , event.getRuleName()); return ; } if (shouldTriggerAlarm(event, rule)) { sendAlarmNotification(event, rule); recordAlarmLog(event, rule); } } catch (Exception e) { log.error("处理告警事件失败" , e); } } private boolean shouldTriggerAlarm (AlarmEvent event, AlarmRule rule) { try { boolean thresholdMet = checkThreshold(event, rule); if (!thresholdMet) { return false ; } if (isInSilencePeriod(event, rule)) { log.info("告警在静默期内,跳过发送: {}" , event.getRuleName()); return false ; } if (isAlarmTooFrequent(event, rule)) { log.info("告警频率过高,跳过发送: {}" , event.getRuleName()); return false ; } return true ; } catch (Exception e) { log.error("检查告警条件失败" , e); return false ; } } private boolean checkThreshold (AlarmEvent event, AlarmRule rule) { double currentValue = event.getCurrentValue(); double threshold = rule.getThreshold(); String operator = rule.getOperator(); switch (operator) { case ">" : return currentValue > threshold; case ">=" : return currentValue >= threshold; case "<" : return currentValue < threshold; case 
"<=" : return currentValue <= threshold; case "=" : return currentValue == threshold; default : log.warn("未知的操作符: {}" , operator); return false ; } } private boolean isInSilencePeriod (AlarmEvent event, AlarmRule rule) { return false ; } private boolean isAlarmTooFrequent (AlarmEvent event, AlarmRule rule) { return false ; } private void sendAlarmNotification (AlarmEvent event, AlarmRule rule) { try { AlarmMessage message = buildAlarmMessage(event, rule); webhookService.sendDingtalkNotification(message); webhookService.sendWechatNotification(message); webhookService.sendEmailNotification(message); log.info("告警通知发送完成: {}" , event.getRuleName()); } catch (Exception e) { log.error("发送告警通知失败" , e); } } private AlarmMessage buildAlarmMessage (AlarmEvent event, AlarmRule rule) { AlarmMessage message = new AlarmMessage (); message.setServiceName(event.getServiceName()); message.setRuleName(event.getRuleName()); message.setAlarmMessage(rule.getMessage()); message.setAlarmTime(new Date ()); message.setCurrentValue(event.getCurrentValue()); message.setThreshold(rule.getThreshold()); message.setOperator(rule.getOperator()); return message; } private void recordAlarmLog (AlarmEvent event, AlarmRule rule) { try { AlarmLog alarmLog = new AlarmLog (); alarmLog.setServiceName(event.getServiceName()); alarmLog.setRuleName(event.getRuleName()); alarmLog.setAlarmMessage(rule.getMessage()); alarmLog.setAlarmTime(new Date ()); alarmLog.setCurrentValue(event.getCurrentValue()); alarmLog.setThreshold(rule.getThreshold()); alarmLog.setStatus("SENT" ); alarmLogRepository.save(alarmLog); log.info("告警日志记录完成: {}" , event.getRuleName()); } catch (Exception e) { log.error("记录告警日志失败" , e); } } }
6. SkyWalking运维最佳实践 6.1 监控面板配置 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 @Service @Slf4j public class SkyWalkingDashboardService { public Dashboard createCustomDashboard (String serviceName) { Dashboard dashboard = new Dashboard (); dashboard.setName(serviceName + "监控面板" ); dashboard.setDescription(serviceName + "服务监控面板" ); Chart serviceOverview = new Chart (); serviceOverview.setTitle("服务概览" ); serviceOverview.setType("line" ); serviceOverview.setMetrics(Arrays.asList( "service_resp_time" , "service_throughput" , "service_error_rate" )); dashboard.addChart(serviceOverview); Chart jvmChart = new Chart (); jvmChart.setTitle("JVM监控" ); jvmChart.setType("area" ); jvmChart.setMetrics(Arrays.asList( "jvm_memory_usage" , "jvm_gc_time" , "jvm_thread_count" )); dashboard.addChart(jvmChart); Chart dbChart = new Chart (); dbChart.setTitle("数据库监控" ); dbChart.setType("bar" ); dbChart.setMetrics(Arrays.asList( "database_connection_count" , "database_query_time" , "database_error_rate" )); dashboard.addChart(dbChart); return dashboard; } public MonitoringReport generateMonitoringReport (String serviceName, long startTime, long endTime) { MonitoringReport report = new MonitoringReport (); report.setServiceName(serviceName); report.setStartTime(startTime); report.setEndTime(endTime); report.setGenerateTime(System.currentTimeMillis()); try { ServiceMetrics serviceMetrics = getServiceMetrics(serviceName, startTime, endTime); report.setServiceMetrics(serviceMetrics); JvmMetrics jvmMetrics = getJvmMetrics(serviceName, startTime, endTime); 
report.setJvmMetrics(jvmMetrics); DatabaseMetrics dbMetrics = getDatabaseMetrics(serviceName, startTime, endTime); report.setDatabaseMetrics(dbMetrics); AlarmStatistics alarmStats = getAlarmStatistics(serviceName, startTime, endTime); report.setAlarmStatistics(alarmStats); log.info("监控报告生成完成: {}" , serviceName); return report; } catch (Exception e) { log.error("监控报告生成失败" , e); return report; } } private ServiceMetrics getServiceMetrics (String serviceName, long startTime, long endTime) { ServiceMetrics metrics = new ServiceMetrics (); metrics.setAvgResponseTime(150.5 ); metrics.setMaxResponseTime(2000.0 ); metrics.setMinResponseTime(50.0 ); metrics.setThroughput(1000.0 ); metrics.setErrorRate(0.02 ); return metrics; } private JvmMetrics getJvmMetrics (String serviceName, long startTime, long endTime) { JvmMetrics metrics = new JvmMetrics (); metrics.setHeapUsage(0.65 ); metrics.setNonHeapUsage(0.25 ); metrics.setGcTime(100.0 ); metrics.setThreadCount(150 ); return metrics; } private DatabaseMetrics getDatabaseMetrics (String serviceName, long startTime, long endTime) { DatabaseMetrics metrics = new DatabaseMetrics (); metrics.setConnectionCount(20 ); metrics.setAvgQueryTime(50.0 ); metrics.setMaxQueryTime(500.0 ); metrics.setQueryCount(5000 ); return metrics; } private AlarmStatistics getAlarmStatistics (String serviceName, long startTime, long endTime) { AlarmStatistics stats = new AlarmStatistics (); stats.setTotalAlarms(10 ); stats.setCriticalAlarms(2 ); stats.setWarningAlarms(5 ); stats.setInfoAlarms(3 ); return stats; } }
#!/bin/bash
# Article subsection 6.2 "运维自动化脚本": SkyWalking operations helper.
#
# Usage: <script> {start|stop|restart|status|backup|cleanup|monitor}
#
# All paths are quoted and joined without stray spaces: the previous form
# (`$SKYWALKING_HOME /bin`, `case "$1 "`) split every path into two words and made
# the subcommand dispatch fall through to the usage message for every input.

SKYWALKING_HOME="/opt/skywalking"
SKYWALKING_VERSION="8.15.0"
BACKUP_DIR="/opt/backup/skywalking"
LOG_DIR="/opt/logs/skywalking"
# Elasticsearch endpoint used by cleanup; override through the environment if needed.
ES_ENDPOINT="${ES_ENDPOINT:-localhost:9200}"

OAP_PID_FILE="$SKYWALKING_HOME/oap.pid"
WEBUI_PID_FILE="$SKYWALKING_HOME/webui.pid"

mkdir -p "$BACKUP_DIR" "$LOG_DIR"

# Succeeds when the pid file ($1) exists and names a live process.
pid_is_alive() {
    local pid_file="$1" pid
    [ -f "$pid_file" ] || return 1
    pid=$(cat "$pid_file")
    [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
}

# Sends SIGTERM to the process named in the pid file ($1), then removes the file.
stop_by_pid_file() {
    local pid_file="$1" pid
    [ -f "$pid_file" ] || return 0
    pid=$(cat "$pid_file")
    if [ -n "$pid" ]; then
        kill "$pid" 2>/dev/null
    fi
    rm -f "$pid_file"
}

# Prints the running / not running message for one component.
# $1 = pid file, $2 = component label used in the message.
report_status() {
    if pid_is_alive "$1"; then
        echo "$2运行正常"
    else
        echo "$2未运行"
    fi
}

# Starts the OAP server and the web UI in the background and records their pids.
# NOTE(review): $! is the pid of the launcher script; if the launcher forks the JVM
# and exits, the recorded pid will not be the server's. Confirm against the launchers.
start_skywalking() {
    echo "启动SkyWalking服务..."

    cd "$SKYWALKING_HOME/bin" || {
        echo "无法进入目录: $SKYWALKING_HOME/bin" >&2
        return 1
    }

    nohup ./oapService.sh > "$LOG_DIR/oap.log" 2>&1 &
    echo $! > "$OAP_PID_FILE"

    nohup ./webappService.sh > "$LOG_DIR/webui.log" 2>&1 &
    echo $! > "$WEBUI_PID_FILE"

    echo "SkyWalking服务启动完成"
}

# Stops both components using their pid files.
stop_skywalking() {
    echo "停止SkyWalking服务..."

    stop_by_pid_file "$OAP_PID_FILE"
    stop_by_pid_file "$WEBUI_PID_FILE"

    echo "SkyWalking服务停止完成"
}

# Stops, waits five seconds, then starts both components.
restart_skywalking() {
    echo "重启SkyWalking服务..."
    stop_skywalking
    sleep 5
    start_skywalking
}

# Reports whether each component is running.
check_skywalking_status() {
    echo "检查SkyWalking服务状态..."

    report_status "$OAP_PID_FILE" "OAP服务器"
    report_status "$WEBUI_PID_FILE" "Web UI"
}

# Archives the config, webapp and bin directories into a timestamped tarball.
backup_skywalking_config() {
    echo "备份SkyWalking配置..."

    local backup_file
    backup_file="$BACKUP_DIR/skywalking-config-$(date +%Y%m%d-%H%M%S).tar.gz"

    tar -czf "$backup_file" \
        "$SKYWALKING_HOME/config" \
        "$SKYWALKING_HOME/webapp" \
        "$SKYWALKING_HOME/bin"

    echo "配置备份完成: $backup_file"
}

# Deletes log files older than 7 days and the Elasticsearch indices of the single day
# that lies exactly 30 days back (so the command is meant to run daily).
# `date -d` is a GNU extension.
cleanup_skywalking_logs() {
    echo "清理SkyWalking日志..."

    find "$LOG_DIR" -name "*.log" -mtime +7 -delete

    curl -X DELETE "$ES_ENDPOINT/skywalking_*_$(date -d '30 days ago' +%Y%m%d)"

    echo "日志清理完成"
}

# Warns when the OAP process uses more than 2048 MB of resident memory or the
# filesystem holding SKYWALKING_HOME is more than 80% full.
monitor_skywalking_performance() {
    echo "监控SkyWalking性能..."

    if pid_is_alive "$OAP_PID_FILE"; then
        local memory_kb memory_mb
        # rss= prints the resident set size in KB without a header line.
        memory_kb=$(ps -o rss= -p "$(cat "$OAP_PID_FILE")" | tr -d ' ')
        memory_mb=$(( ${memory_kb:-0} / 1024 ))

        if [ "$memory_mb" -gt 2048 ]; then
            echo "警告: OAP服务器内存使用过高: ${memory_mb}MB"
        else
            echo "OAP服务器内存使用正常: ${memory_mb}MB"
        fi
    else
        # Without this guard a missing pid file produced a cat error and a malformed
        # ps invocation instead of a readable status.
        echo "OAP服务器未运行"
    fi

    local disk_usage
    # -P keeps each filesystem on a single line so the column position is stable.
    disk_usage=$(df -P "$SKYWALKING_HOME" | awk 'NR==2 {gsub("%", "", $5); print $5}')

    if [ "${disk_usage:-0}" -gt 80 ]; then
        echo "警告: 磁盘使用率过高: ${disk_usage}%"
    else
        echo "磁盘使用率正常: ${disk_usage}%"
    fi
}

# Dispatches the subcommand given as the first argument.
main() {
    case "$1" in
        start)
            start_skywalking
            ;;
        stop)
            stop_skywalking
            ;;
        restart)
            restart_skywalking
            ;;
        status)
            check_skywalking_status
            ;;
        backup)
            backup_skywalking_config
            ;;
        cleanup)
            cleanup_skywalking_logs
            ;;
        monitor)
            monitor_skywalking_performance
            ;;
        *)
            echo "用法: $0 {start|stop|restart|status|backup|cleanup|monitor}"
            exit 1
            ;;
    esac
}

main "$@"
7. 总结 SkyWalking作为优秀的分布式链路追踪和APM监控系统,在生产环境中需要专业的运维管理。通过本文的详细介绍,我们了解了:
SkyWalking部署配置 : OAP服务器、Web UI、Agent的完整配置方案
监控指标采集 : JVM监控、业务指标、数据库连接池等关键指标
性能分析优化 : 慢查询分析、性能瓶颈定位、自动优化建议
告警配置管理 : 智能告警规则、多渠道通知、告警管理
运维最佳实践 : 监控面板、自动化脚本、性能监控
通过合理的SkyWalking运维配置和管理,可以有效提升系统的可观测性和运维效率,为业务稳定运行提供有力保障。
运维实战要点 :
SkyWalking集群部署需要考虑高可用和性能优化
监控指标采集要覆盖应用、JVM、数据库等关键组件
告警配置要合理设置阈值和通知渠道
性能分析要结合业务场景进行针对性优化
运维自动化可以提升管理效率和减少人工错误
技术注解 :
SkyWalking采用探针模式进行无侵入式监控
OAP服务器负责数据收集、存储和分析
Web UI提供可视化的监控界面
支持多种存储后端(Elasticsearch、MySQL等)
提供丰富的告警和通知机制