前言

RDS-Redis自动巡检作为企业级云数据库运维的核心能力之一,直接影响着数据存储的稳定性和缓存性能。通过智能的云数据库监控策略和完善的缓存优化机制,能够及时发现数据库异常、预防数据丢失,保障企业级应用的高可用性。本文从云数据库监控设计到缓存优化,从基础原理到企业级实践,系统梳理RDS-Redis自动巡检的完整解决方案。

一、RDS-Redis自动巡检架构设计

1.1 巡检系统整体架构

1.2 巡检核心组件

巡检引擎核心组件的实现代码如下:

/**
* RDS-Redis自动巡检核心组件
*/
@Component
public class RDSRedisInspectionEngine {

@Autowired
private RDSMonitorService rdsMonitorService;

@Autowired
private RedisMonitorService redisMonitorService;

@Autowired
private CloudDatabaseManager cloudDatabaseManager;

@Autowired
private CacheManager cacheManager;

@Autowired
private AlertService alertService;

@Autowired
private InspectionResultStorage resultStorage;

/**
* 启动RDS-Redis巡检引擎
*/
public void startInspectionEngine() {
try {
// 1. 初始化云数据库连接
initializeCloudDatabaseConnections();

// 2. 启动RDS监控
rdsMonitorService.startMonitoring();

// 3. 启动Redis监控
redisMonitorService.startMonitoring();

// 4. 启动云数据库管理
cloudDatabaseManager.startManagement();

// 5. 启动缓存管理
cacheManager.startManagement();

log.info("RDS-Redis巡检引擎启动成功");

} catch (Exception e) {
log.error("RDS-Redis巡检引擎启动失败", e);
throw new InspectionEngineException("巡检引擎启动失败", e);
}
}

/**
* 停止RDS-Redis巡检引擎
*/
public void stopInspectionEngine() {
try {
// 1. 停止缓存管理
cacheManager.stopManagement();

// 2. 停止云数据库管理
cloudDatabaseManager.stopManagement();

// 3. 停止Redis监控
redisMonitorService.stopMonitoring();

// 4. 停止RDS监控
rdsMonitorService.stopMonitoring();

log.info("RDS-Redis巡检引擎停止成功");

} catch (Exception e) {
log.error("RDS-Redis巡检引擎停止失败", e);
}
}

/**
* 初始化云数据库连接
*/
private void initializeCloudDatabaseConnections() {
try {
// 初始化RDS连接
rdsMonitorService.initializeConnections();

// 初始化Redis连接
redisMonitorService.initializeConnections();

log.info("云数据库连接初始化完成");

} catch (Exception e) {
log.error("云数据库连接初始化失败", e);
throw new ConnectionException("云数据库连接初始化失败", e);
}
}
}

二、RDS数据库监控

2.1 RDS监控服务

RDS监控服务的实现代码如下:

/**
 * RDS database monitoring service.
 *
 * <p>Keeps a registry of RDS instances and, once started, inspects every
 * registered instance on a fixed 60-second schedule: metrics are collected,
 * health is analyzed, unhealthy instances are alerted on (and repaired when the
 * health status allows it), optimizations are attempted, and the result is
 * stored.
 */
@Service
public class RDSMonitorService {

    @Autowired
    private RDSConnectionManager connectionManager;

    @Autowired
    private RDSMetricsCollector metricsCollector;

    @Autowired
    private RDSHealthAnalyzer healthAnalyzer;

    @Autowired
    private RDSOptimizer optimizer;

    /** Delivers the alerts built by {@link #sendInstanceAlert}. */
    @Autowired
    private AlertService alertService;

    /** Persists the results built by {@link #recordMonitoringResult}. */
    @Autowired
    private InspectionResultStorage resultStorage;

    private final ScheduledExecutorService monitorScheduler;
    private final Map<String, RDSInstance> monitoredInstances;

    /** Creates the service with a 10-thread scheduler and an empty instance registry. */
    public RDSMonitorService() {
        this.monitorScheduler = Executors.newScheduledThreadPool(10);
        this.monitoredInstances = new ConcurrentHashMap<>();
    }

    /**
     * Starts RDS monitoring: schedules the periodic inspection with no initial
     * delay and a period of 60 seconds.
     */
    public void startMonitoring() {
        monitorScheduler.scheduleAtFixedRate(
                this::monitorRDSInstances,
                0,
                60, // one minute
                TimeUnit.SECONDS
        );

        log.info("RDS数据库监控启动成功");
    }

    /**
     * Stops RDS monitoring.
     *
     * <p>Requests an orderly shutdown of the scheduler and waits up to 30 seconds
     * for running inspections to finish before forcing termination. If the
     * waiting thread is interrupted, termination is forced immediately and the
     * thread's interrupt flag is restored. This method never throws.
     */
    public void stopMonitoring() {
        try {
            monitorScheduler.shutdown();
            if (!monitorScheduler.awaitTermination(30, TimeUnit.SECONDS)) {
                monitorScheduler.shutdownNow();
            }

            log.info("RDS数据库监控停止成功");

        } catch (InterruptedException e) {
            // Do not leave tasks running, and let callers observe the interrupt.
            monitorScheduler.shutdownNow();
            Thread.currentThread().interrupt();
            log.error("RDS数据库监控停止失败", e);
        } catch (Exception e) {
            log.error("RDS数据库监控停止失败", e);
        }
    }

    /**
     * Fetches all RDS instances from the connection manager, initializes a
     * connection for each, and registers each one for monitoring.
     *
     * @throws ConnectionException if fetching or initializing fails
     */
    public void initializeConnections() {
        try {
            List<RDSInstance> instances = connectionManager.getAllInstances();

            for (RDSInstance instance : instances) {
                // NOTE(review): initializeInstanceConnection is not defined in the
                // visible part of this class — confirm where it lives.
                initializeInstanceConnection(instance);
                monitoredInstances.put(instance.getInstanceId(), instance);
            }

            log.info("RDS实例连接初始化完成,实例数量: {}", instances.size());

        } catch (Exception e) {
            log.error("RDS连接初始化失败", e);
            throw new ConnectionException("RDS连接初始化失败", e);
        }
    }

    /**
     * Scheduled task body: inspects every registered instance. Exceptions are
     * logged and not rethrown.
     */
    private void monitorRDSInstances() {
        try {
            for (RDSInstance instance : monitoredInstances.values()) {
                monitorSingleInstance(instance);
            }

        } catch (Exception e) {
            log.error("RDS实例监控失败", e);
        }
    }

    /**
     * Runs one inspection pass for a single instance.
     *
     * @param instance the instance to inspect
     */
    private void monitorSingleInstance(RDSInstance instance) {
        try {
            // 1. Collect instance metrics
            RDSMetrics metrics = metricsCollector.collectMetrics(instance);

            // 2. Analyze instance health
            RDSHealthStatus healthStatus = healthAnalyzer.analyzeHealth(instance, metrics);

            // 3. React to an unhealthy instance
            if (!healthStatus.isHealthy()) {
                handleUnhealthyInstance(instance, healthStatus);
            }

            // 4. Attempt performance optimization
            if (healthStatus.needsOptimization()) {
                attemptOptimization(instance, healthStatus);
            }

            // 5. Record the monitoring result
            recordMonitoringResult(instance, metrics, healthStatus);

        } catch (Exception e) {
            log.error("RDS实例监控失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Handles an unhealthy instance: sends an alert, records the issue, and
     * attempts an automatic repair when the health status marks it as
     * auto-repairable.
     */
    private void handleUnhealthyInstance(RDSInstance instance, RDSHealthStatus healthStatus) {
        try {
            // 1. Send the alert
            sendInstanceAlert(instance, healthStatus);

            // 2. Record the issue
            // NOTE(review): recordInstanceIssue is not defined in the visible part
            // of this class — confirm where it lives.
            recordInstanceIssue(instance, healthStatus);

            // 3. Attempt automatic repair
            if (healthStatus.isAutoRepairable()) {
                attemptAutoRepair(instance, healthStatus);
            }

        } catch (Exception e) {
            log.error("不健康实例处理失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Builds an {@code RDS_UNHEALTHY} alert for the instance, carrying the
     * severity from the health status, and sends it through the alert service.
     */
    private void sendInstanceAlert(RDSInstance instance, RDSHealthStatus healthStatus) {
        RDSAlert alert = new RDSAlert();
        alert.setAlertType(AlertType.RDS_UNHEALTHY);
        alert.setSeverity(healthStatus.getSeverity());
        alert.setMessage("RDS实例状态异常: " + instance.getInstanceId());
        alert.setInstance(instance);
        alert.setHealthStatus(healthStatus);
        alert.setTimestamp(System.currentTimeMillis());

        alertService.sendAlert(alert);
    }

    /**
     * Chooses a repair strategy from the issue type of the health status.
     * Unsupported issue types are logged as warnings.
     */
    private void attemptAutoRepair(RDSInstance instance, RDSHealthStatus healthStatus) {
        try {
            log.info("尝试自动修复RDS实例: {}", instance.getInstanceId());

            switch (healthStatus.getIssueType()) {
                case CONNECTION_ISSUE:
                    repairConnectionIssue(instance);
                    break;
                case PERFORMANCE_ISSUE:
                    repairPerformanceIssue(instance);
                    break;
                case STORAGE_ISSUE:
                    repairStorageIssue(instance);
                    break;
                case BACKUP_ISSUE:
                    repairBackupIssue(instance);
                    break;
                default:
                    log.warn("无法自动修复的问题类型: {}", healthStatus.getIssueType());
            }

        } catch (Exception e) {
            log.error("RDS实例自动修复失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Chooses an optimization from the performance issue type of the health
     * status. Unsupported types are logged as warnings.
     */
    private void attemptOptimization(RDSInstance instance, RDSHealthStatus healthStatus) {
        try {
            log.info("尝试RDS实例性能优化: {}", instance.getInstanceId());

            switch (healthStatus.getPerformanceIssueType()) {
                case HIGH_CPU_USAGE:
                    optimizer.optimizeCPUUsage(instance);
                    break;
                case HIGH_MEMORY_USAGE:
                    optimizer.optimizeMemoryUsage(instance);
                    break;
                case HIGH_DISK_USAGE:
                    optimizer.optimizeDiskUsage(instance);
                    break;
                case SLOW_QUERY:
                    optimizer.optimizeSlowQuery(instance);
                    break;
                default:
                    log.warn("无法自动优化的性能问题类型: {}", healthStatus.getPerformanceIssueType());
            }

        } catch (Exception e) {
            log.error("RDS实例性能优化失败: {}", instance.getInstanceId(), e);
        }
    }

    /** Repairs a connection issue by restarting the instance connection. */
    private void repairConnectionIssue(RDSInstance instance) {
        try {
            connectionManager.restartInstanceConnection(instance);
            log.info("RDS实例连接修复成功: {}", instance.getInstanceId());

        } catch (Exception e) {
            log.error("RDS实例连接修复失败: {}", instance.getInstanceId(), e);
        }
    }

    /** Repairs a performance issue by asking the optimizer to adjust instance parameters. */
    private void repairPerformanceIssue(RDSInstance instance) {
        try {
            optimizer.adjustInstanceParameters(instance);
            log.info("RDS实例性能修复成功: {}", instance.getInstanceId());

        } catch (Exception e) {
            log.error("RDS实例性能修复失败: {}", instance.getInstanceId(), e);
        }
    }

    /** Repairs a storage issue by asking the optimizer to clean up temporary files. */
    private void repairStorageIssue(RDSInstance instance) {
        try {
            optimizer.cleanupTempFiles(instance);
            log.info("RDS实例存储修复成功: {}", instance.getInstanceId());

        } catch (Exception e) {
            log.error("RDS实例存储修复失败: {}", instance.getInstanceId(), e);
        }
    }

    /** Repairs a backup issue by asking the optimizer to trigger a backup. */
    private void repairBackupIssue(RDSInstance instance) {
        try {
            optimizer.triggerBackup(instance);
            log.info("RDS实例备份修复成功: {}", instance.getInstanceId());

        } catch (Exception e) {
            log.error("RDS实例备份修复失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Stores the metrics and health status of one inspection pass, stamped with
     * the current time.
     */
    private void recordMonitoringResult(RDSInstance instance, RDSMetrics metrics, RDSHealthStatus healthStatus) {
        RDSMonitoringResult result = new RDSMonitoringResult();
        result.setInstanceId(instance.getInstanceId());
        result.setTimestamp(System.currentTimeMillis());
        result.setMetrics(metrics);
        result.setHealthStatus(healthStatus);

        resultStorage.storeRDSResult(result);
    }
}

2.2 RDS指标收集器

RDS指标收集器的实现代码如下:

/**
 * Collects RDS instance metrics from the cloud API.
 *
 * <p>Metrics are gathered in five groups (basic, performance, storage,
 * connection, backup). Each group logs and swallows its own failures, so a
 * failure in one group leaves the fields of that group partially unset while
 * the other groups are still collected.
 */
@Component
public class RDSMetricsCollector {

    @Autowired
    private RDSConnectionManager connectionManager;

    @Autowired
    private CloudAPIClient cloudAPIClient;

    /**
     * Collects all metric groups for the given instance.
     *
     * @param instance the instance to query
     * @return a new metrics object; never {@code null}, possibly only partially
     *         populated when collection failed
     */
    public RDSMetrics collectMetrics(RDSInstance instance) {
        final RDSMetrics collected = new RDSMetrics();

        try {
            collectBasicMetrics(instance, collected);       // 1. basic
            collectPerformanceMetrics(instance, collected); // 2. performance
            collectStorageMetrics(instance, collected);     // 3. storage
            collectConnectionMetrics(instance, collected);  // 4. connections
            collectBackupMetrics(instance, collected);      // 5. backups
        } catch (Exception e) {
            log.error("RDS指标收集失败: {}", instance.getInstanceId(), e);
        }

        return collected;
    }

    /** Collects instance status, instance type, engine version and creation time. */
    private void collectBasicMetrics(RDSInstance instance, RDSMetrics metrics) {
        try {
            final String id = instance.getInstanceId();

            metrics.setInstanceStatus(cloudAPIClient.getInstanceStatus(id));
            metrics.setInstanceType(cloudAPIClient.getInstanceType(id));
            metrics.setEngineVersion(cloudAPIClient.getEngineVersion(id));
            metrics.setCreateTime(cloudAPIClient.getCreateTime(id));
        } catch (Exception e) {
            log.error("基础指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects CPU usage, memory usage, QPS, average latency, slow query count,
     * connection count and the maximum number of connections.
     */
    private void collectPerformanceMetrics(RDSInstance instance, RDSMetrics metrics) {
        try {
            final String id = instance.getInstanceId();

            metrics.setCpuUsage(cloudAPIClient.getCPUUsage(id));
            metrics.setMemoryUsage(cloudAPIClient.getMemoryUsage(id));
            metrics.setQps(cloudAPIClient.getQPS(id));
            metrics.setAverageLatency(cloudAPIClient.getAverageLatency(id));
            metrics.setSlowQueryCount(cloudAPIClient.getSlowQueryCount(id));
            metrics.setConnectionCount(cloudAPIClient.getConnectionCount(id));
            metrics.setMaxConnections(cloudAPIClient.getMaxConnections(id));
        } catch (Exception e) {
            log.error("性能指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects disk usage, available space, total space, and total/read/write
     * IOPS.
     */
    private void collectStorageMetrics(RDSInstance instance, RDSMetrics metrics) {
        try {
            final String id = instance.getInstanceId();

            metrics.setDiskUsage(cloudAPIClient.getDiskUsage(id));
            metrics.setAvailableSpace(cloudAPIClient.getAvailableSpace(id));
            metrics.setTotalSpace(cloudAPIClient.getTotalSpace(id));
            metrics.setIops(cloudAPIClient.getIOPS(id));
            metrics.setReadIops(cloudAPIClient.getReadIOPS(id));
            metrics.setWriteIops(cloudAPIClient.getWriteIOPS(id));
        } catch (Exception e) {
            log.error("存储指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects active connections, idle connections, connection errors and
     * connection timeouts.
     */
    private void collectConnectionMetrics(RDSInstance instance, RDSMetrics metrics) {
        try {
            final String id = instance.getInstanceId();

            metrics.setActiveConnections(cloudAPIClient.getActiveConnections(id));
            metrics.setIdleConnections(cloudAPIClient.getIdleConnections(id));
            metrics.setConnectionErrors(cloudAPIClient.getConnectionErrors(id));
            metrics.setConnectionTimeouts(cloudAPIClient.getConnectionTimeouts(id));
        } catch (Exception e) {
            log.error("连接指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects last backup time, backup size, backup status and backup
     * retention days.
     */
    private void collectBackupMetrics(RDSInstance instance, RDSMetrics metrics) {
        try {
            final String id = instance.getInstanceId();

            metrics.setLastBackupTime(cloudAPIClient.getLastBackupTime(id));
            metrics.setBackupSize(cloudAPIClient.getBackupSize(id));
            metrics.setBackupStatus(cloudAPIClient.getBackupStatus(id));
            metrics.setBackupRetentionDays(cloudAPIClient.getBackupRetentionDays(id));
        } catch (Exception e) {
            log.error("备份指标收集失败: {}", instance.getInstanceId(), e);
        }
    }
}

2.3 RDS健康分析器

RDS健康分析器的实现代码如下:

/**
* RDS健康分析器
*/
@Component
public class RDSHealthAnalyzer {

/**
* 分析RDS健康状态
*/
public RDSHealthStatus analyzeHealth(RDSInstance instance, RDSMetrics metrics) {
RDSHealthStatus status = new RDSHealthStatus();

try {
// 1. 分析实例状态
InstanceStatusAnalysis instanceAnalysis = analyzeInstanceStatus(metrics);
status.setInstanceAnalysis(instanceAnalysis);

// 2. 分析性能状态
PerformanceStatusAnalysis performanceAnalysis = analyzePerformanceStatus(metrics);
status.setPerformanceAnalysis(performanceAnalysis);

// 3. 分析存储状态
StorageStatusAnalysis storageAnalysis = analyzeStorageStatus(metrics);
status.setStorageAnalysis(storageAnalysis);

// 4. 分析连接状态
ConnectionStatusAnalysis connectionAnalysis = analyzeConnectionStatus(metrics);
status.setConnectionAnalysis(connectionAnalysis);

// 5. 分析备份状态
BackupStatusAnalysis backupAnalysis = analyzeBackupStatus(metrics);
status.setBackupAnalysis(backupAnalysis);

// 6. 综合评估健康状态
evaluateHealthStatus(status);

} catch (Exception e) {
log.error("RDS健康分析失败: {}", instance.getInstanceId(), e);
status.setHealthy(false);
status.setSeverity(Severity.HIGH);
}

return status;
}

/**
* 分析实例状态
*/
private InstanceStatusAnalysis analyzeInstanceStatus(RDSMetrics metrics) {
InstanceStatusAnalysis analysis = new InstanceStatusAnalysis();

// 检查实例状态
String instanceStatus = metrics.getInstanceStatus();
if ("Running".equals(instanceStatus)) {
analysis.setInstanceStatus(InstanceStatus.RUNNING);
analysis.setInstanceStatusLevel(StatusLevel.NORMAL);
} else if ("Stopped".equals(instanceStatus)) {
analysis.setInstanceStatus(InstanceStatus.STOPPED);
analysis.setInstanceStatusLevel(StatusLevel.CRITICAL);
} else if ("Starting".equals(instanceStatus)) {
analysis.setInstanceStatus(InstanceStatus.STARTING);
analysis.setInstanceStatusLevel(StatusLevel.WARNING);
} else {
analysis.setInstanceStatus(InstanceStatus.UNKNOWN);
analysis.setInstanceStatusLevel(StatusLevel.CRITICAL);
}

return analysis;
}

/**
* 分析性能状态
*/
private PerformanceStatusAnalysis analyzePerformanceStatus(RDSMetrics metrics) {
PerformanceStatusAnalysis analysis = new PerformanceStatusAnalysis();

// 分析CPU使用率
double cpuUsage = metrics.getCpuUsage();
analysis.setCpuUsage(cpuUsage);
analysis.setCpuUsageLevel(evaluateCPUUsageLevel(cpuUsage));

// 分析内存使用率
double memoryUsage = metrics.getMemoryUsage();
analysis.setMemoryUsage(memoryUsage);
analysis.setMemoryUsageLevel(evaluateMemoryUsageLevel(memoryUsage));

// 分析QPS
double qps = metrics.getQps();
analysis.setQps(qps);
analysis.setQpsLevel(evaluateQPSLevel(qps));

// 分析平均延迟
double avgLatency = metrics.getAverageLatency();
analysis.setAverageLatency(avgLatency);
analysis.setLatencyLevel(evaluateLatencyLevel(avgLatency));

// 分析慢查询数
int slowQueryCount = metrics.getSlowQueryCount();
analysis.setSlowQueryCount(slowQueryCount);
analysis.setSlowQueryLevel(evaluateSlowQueryLevel(slowQueryCount));

return analysis;
}

/**
* 分析存储状态
*/
private StorageStatusAnalysis analyzeStorageStatus(RDSMetrics metrics) {
StorageStatusAnalysis analysis = new StorageStatusAnalysis();

// 分析磁盘使用率
double diskUsage = metrics.getDiskUsage();
analysis.setDiskUsage(diskUsage);
analysis.setDiskUsageLevel(evaluateDiskUsageLevel(diskUsage));

// 分析可用空间
long availableSpace = metrics.getAvailableSpace();
analysis.setAvailableSpace(availableSpace);
analysis.setAvailableSpaceLevel(evaluateAvailableSpaceLevel(availableSpace));

// 分析IOPS
int iops = metrics.getIops();
analysis.setIops(iops);
analysis.setIopsLevel(evaluateIOPSLevel(iops));

return analysis;
}

/**
* 分析连接状态
*/
private ConnectionStatusAnalysis analyzeConnectionStatus(RDSMetrics metrics) {
ConnectionStatusAnalysis analysis = new ConnectionStatusAnalysis();

// 分析连接数
int connectionCount = metrics.getConnectionCount();
int maxConnections = metrics.getMaxConnections();
double connectionRatio = (double) connectionCount / maxConnections;

analysis.setConnectionCount(connectionCount);
analysis.setMaxConnections(maxConnections);
analysis.setConnectionRatio(connectionRatio);
analysis.setConnectionLevel(evaluateConnectionLevel(connectionRatio));

// 分析连接错误
int connectionErrors = metrics.getConnectionErrors();
analysis.setConnectionErrors(connectionErrors);
analysis.setConnectionErrorLevel(evaluateConnectionErrorLevel(connectionErrors));

// 分析连接超时
int connectionTimeouts = metrics.getConnectionTimeouts();
analysis.setConnectionTimeouts(connectionTimeouts);
analysis.setConnectionTimeoutLevel(evaluateConnectionTimeoutLevel(connectionTimeouts));

return analysis;
}

/**
* 分析备份状态
*/
private BackupStatusAnalysis analyzeBackupStatus(RDSMetrics metrics) {
BackupStatusAnalysis analysis = new BackupStatusAnalysis();

// 分析备份状态
String backupStatus = metrics.getBackupStatus();
analysis.setBackupStatus(backupStatus);
analysis.setBackupStatusLevel(evaluateBackupStatusLevel(backupStatus));

// 分析最后备份时间
Date lastBackupTime = metrics.getLastBackupTime();
analysis.setLastBackupTime(lastBackupTime);
analysis.setBackupTimeLevel(evaluateBackupTimeLevel(lastBackupTime));

// 分析备份大小
long backupSize = metrics.getBackupSize();
analysis.setBackupSize(backupSize);
analysis.setBackupSizeLevel(evaluateBackupSizeLevel(backupSize));

return analysis;
}

/**
* 评估健康状态
*/
private void evaluateHealthStatus(RDSHealthStatus status) {
boolean isHealthy = true;
Severity maxSeverity = Severity.LOW;

// 评估实例状态
if (status.getInstanceAnalysis().getInstanceStatusLevel() == StatusLevel.CRITICAL) {
isHealthy = false;
maxSeverity = Severity.CRITICAL;
}

// 评估性能状态
if (status.getPerformanceAnalysis().hasCriticalIssues()) {
isHealthy = false;
maxSeverity = Severity.max(maxSeverity, Severity.HIGH);
}

// 评估存储状态
if (status.getStorageAnalysis().hasCriticalIssues()) {
isHealthy = false;
maxSeverity = Severity.max(maxSeverity, Severity.HIGH);
}

// 评估连接状态
if (status.getConnectionAnalysis().hasCriticalIssues()) {
isHealthy = false;
maxSeverity = Severity.max(maxSeverity, Severity.MEDIUM);
}

// 评估备份状态
if (status.getBackupAnalysis().hasCriticalIssues()) {
isHealthy = false;
maxSeverity = Severity.max(maxSeverity, Severity.MEDIUM);
}

status.setHealthy(isHealthy);
status.setSeverity(maxSeverity);
}

/**
* 评估CPU使用率级别
*/
private StatusLevel evaluateCPUUsageLevel(double cpuUsage) {
if (cpuUsage > 0.9) { // 90%
return StatusLevel.CRITICAL;
} else if (cpuUsage > 0.8) { // 80%
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估内存使用率级别
*/
private StatusLevel evaluateMemoryUsageLevel(double memoryUsage) {
if (memoryUsage > 0.9) { // 90%
return StatusLevel.CRITICAL;
} else if (memoryUsage > 0.8) { // 80%
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估QPS级别
*/
private StatusLevel evaluateQPSLevel(double qps) {
if (qps < 100) { // 100 QPS
return StatusLevel.WARNING;
} else if (qps < 50) { // 50 QPS
return StatusLevel.CRITICAL;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估延迟级别
*/
private StatusLevel evaluateLatencyLevel(double latency) {
if (latency > 1000) { // 1000ms
return StatusLevel.CRITICAL;
} else if (latency > 500) { // 500ms
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估慢查询级别
*/
private StatusLevel evaluateSlowQueryLevel(int slowQueryCount) {
if (slowQueryCount > 100) { // 100个
return StatusLevel.CRITICAL;
} else if (slowQueryCount > 50) { // 50个
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估磁盘使用率级别
*/
private StatusLevel evaluateDiskUsageLevel(double diskUsage) {
if (diskUsage > 0.9) { // 90%
return StatusLevel.CRITICAL;
} else if (diskUsage > 0.8) { // 80%
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估可用空间级别
*/
private StatusLevel evaluateAvailableSpaceLevel(long availableSpace) {
if (availableSpace < 1024 * 1024 * 1024) { // 1GB
return StatusLevel.CRITICAL;
} else if (availableSpace < 5 * 1024 * 1024 * 1024) { // 5GB
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估IOPS级别
*/
private StatusLevel evaluateIOPSLevel(int iops) {
if (iops > 10000) { // 10000 IOPS
return StatusLevel.WARNING;
} else if (iops > 15000) { // 15000 IOPS
return StatusLevel.CRITICAL;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估连接级别
*/
private StatusLevel evaluateConnectionLevel(double connectionRatio) {
if (connectionRatio > 0.9) { // 90%
return StatusLevel.CRITICAL;
} else if (connectionRatio > 0.8) { // 80%
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估连接错误级别
*/
private StatusLevel evaluateConnectionErrorLevel(int connectionErrors) {
if (connectionErrors > 100) { // 100个
return StatusLevel.CRITICAL;
} else if (connectionErrors > 50) { // 50个
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估连接超时级别
*/
private StatusLevel evaluateConnectionTimeoutLevel(int connectionTimeouts) {
if (connectionTimeouts > 50) { // 50个
return StatusLevel.CRITICAL;
} else if (connectionTimeouts > 20) { // 20个
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估备份状态级别
*/
private StatusLevel evaluateBackupStatusLevel(String backupStatus) {
if ("Failed".equals(backupStatus)) {
return StatusLevel.CRITICAL;
} else if ("InProgress".equals(backupStatus)) {
return StatusLevel.WARNING;
} else if ("Completed".equals(backupStatus)) {
return StatusLevel.NORMAL;
} else {
return StatusLevel.WARNING;
}
}

/**
* 评估备份时间级别
*/
private StatusLevel evaluateBackupTimeLevel(Date lastBackupTime) {
if (lastBackupTime == null) {
return StatusLevel.CRITICAL;
}

long timeDiff = System.currentTimeMillis() - lastBackupTime.getTime();
long daysDiff = timeDiff / (24 * 60 * 60 * 1000);

if (daysDiff > 7) { // 7天
return StatusLevel.CRITICAL;
} else if (daysDiff > 3) { // 3天
return StatusLevel.WARNING;
} else {
return StatusLevel.NORMAL;
}
}

/**
* 评估备份大小级别
*/
private StatusLevel evaluateBackupSizeLevel(long backupSize) {
if (backupSize > 100 * 1024 * 1024 * 1024) { // 100GB
return StatusLevel.WARNING;
} else if (backupSize > 500 * 1024 * 1024 * 1024) { // 500GB
return StatusLevel.CRITICAL;
} else {
return StatusLevel.NORMAL;
}
}
}

三、Redis缓存监控

3.1 Redis监控服务

Redis监控服务的实现代码如下:

/**
* Redis缓存监控服务
*/
@Service
public class RedisMonitorService {

@Autowired
private RedisConnectionManager redisConnectionManager;

@Autowired
private RedisMetricsCollector redisMetricsCollector;

@Autowired
private RedisHealthAnalyzer redisHealthAnalyzer;

@Autowired
private RedisOptimizer redisOptimizer;

private final ScheduledExecutorService monitorScheduler;
private final Map<String, RedisInstance> monitoredInstances;

public RedisMonitorService() {
this.monitorScheduler = Executors.newScheduledThreadPool(10);
this.monitoredInstances = new ConcurrentHashMap<>();
}

/**
* 启动Redis监控
*/
public void startMonitoring() {
// 启动定期监控任务
monitorScheduler.scheduleAtFixedRate(
this::monitorRedisInstances,
0,
30, // 30秒
TimeUnit.SECONDS
);

log.info("Redis缓存监控启动成功");
}

/**
* 停止Redis监控
*/
public void stopMonitoring() {
try {
monitorScheduler.shutdown();
if (!monitorScheduler.awaitTermination(30, TimeUnit.SECONDS)) {
monitorScheduler.shutdownNow();
}

log.info("Redis缓存监控停止成功");

} catch (Exception e) {
log.error("Redis缓存监控停止失败", e);
}
}

/**
* 初始化Redis连接
*/
public void initializeConnections() {
try {
// 获取所有Redis实例
List<RedisInstance> instances = redisConnectionManager.getAllInstances();

// 初始化每个实例的连接
for (RedisInstance instance : instances) {
initializeInstanceConnection(instance);
monitoredInstances.put(instance.getInstanceId(), instance);
}

log.info("Redis实例连接初始化完成,实例数量: {}", instances.size());

} catch (Exception e) {
log.error("Redis连接初始化失败", e);
throw new ConnectionException("Redis连接初始化失败", e);
}
}

/**
* 监控Redis实例
*/
private void monitorRedisInstances() {
try {
for (RedisInstance instance : monitoredInstances.values()) {
monitorSingleInstance(instance);
}

} catch (Exception e) {
log.error("Redis实例监控失败", e);
}
}

/**
* 监控单个Redis实例
*/
private void monitorSingleInstance(RedisInstance instance) {
try {
// 1. 收集实例指标
RedisMetrics metrics = redisMetricsCollector.collectMetrics(instance);

// 2. 分析实例健康状态
RedisHealthStatus healthStatus = redisHealthAnalyzer.analyzeHealth(instance, metrics);

// 3. 检查实例状态
if (!healthStatus.isHealthy()) {
handleUnhealthyInstance(instance, healthStatus);
}

// 4. 尝试性能优化
if (healthStatus.needsOptimization()) {
attemptOptimization(instance, healthStatus);
}

// 5. 记录监控结果
recordMonitoringResult(instance, metrics, healthStatus);

} catch (Exception e) {
log.error("Redis实例监控失败: {}", instance.getInstanceId(), e);
}
}

/**
* 处理不健康实例
*/
private void handleUnhealthyInstance(RedisInstance instance, RedisHealthStatus healthStatus) {
try {
// 1. 发送实例告警
sendInstanceAlert(instance, healthStatus);

// 2. 记录实例问题
recordInstanceIssue(instance, healthStatus);

// 3. 尝试自动修复
if (healthStatus.isAutoRepairable()) {
attemptAutoRepair(instance, healthStatus);
}

} catch (Exception e) {
log.error("不健康Redis实例处理失败: {}", instance.getInstanceId(), e);
}
}

/**
* 发送实例告警
*/
private void sendInstanceAlert(RedisInstance instance, RedisHealthStatus healthStatus) {
RedisAlert alert = new RedisAlert();
alert.setAlertType(AlertType.REDIS_UNHEALTHY);
alert.setSeverity(healthStatus.getSeverity());
alert.setMessage("Redis实例状态异常: " + instance.getInstanceId());
alert.setInstance(instance);
alert.setHealthStatus(healthStatus);
alert.setTimestamp(System.currentTimeMillis());

// 发送告警
alertService.sendAlert(alert);
}

/**
* 尝试自动修复
*/
private void attemptAutoRepair(RedisInstance instance, RedisHealthStatus healthStatus) {
try {
log.info("尝试自动修复Redis实例: {}", instance.getInstanceId());

// 根据问题类型选择修复策略
switch (healthStatus.getIssueType()) {
case CONNECTION_ISSUE:
repairConnectionIssue(instance);
break;
case MEMORY_ISSUE:
repairMemoryIssue(instance);
break;
case PERFORMANCE_ISSUE:
repairPerformanceIssue(instance);
break;
case KEY_EXPIRATION_ISSUE:
repairKeyExpirationIssue(instance);
break;
default:
log.warn("无法自动修复的问题类型: {}", healthStatus.getIssueType());
}

} catch (Exception e) {
log.error("Redis实例自动修复失败: {}", instance.getInstanceId(), e);
}
}

/**
* 尝试性能优化
*/
private void attemptOptimization(RedisInstance instance, RedisHealthStatus healthStatus) {
try {
log.info("尝试Redis实例性能优化: {}", instance.getInstanceId());

// 根据性能问题选择优化策略
switch (healthStatus.getPerformanceIssueType()) {
case LOW_HIT_RATIO:
redisOptimizer.optimizeHitRatio(instance);
break;
case HIGH_MEMORY_USAGE:
redisOptimizer.optimizeMemoryUsage(instance);
break;
case SLOW_COMMANDS:
redisOptimizer.optimizeSlowCommands(instance);
break;
case HIGH_EVICTION_RATE:
redisOptimizer.optimizeEvictionRate(instance);
break;
default:
log.warn("无法自动优化的性能问题类型: {}", healthStatus.getPerformanceIssueType());
}

} catch (Exception e) {
log.error("Redis实例性能优化失败: {}", instance.getInstanceId(), e);
}
}

/**
* Attempts to repair a connection problem on the given Redis instance.
*
* Delegates to redisConnectionManager.restartInstanceConnection and logs the outcome.
* An exception raised inside the try block is caught and logged as an error, so a
* failed repair is not reported to the caller.
*
* @param instance the Redis instance whose connection should be restarted
*/
private void repairConnectionIssue(RedisInstance instance) {
try {
// Restart the connection to the instance; success is logged only if this call returns normally.
redisConnectionManager.restartInstanceConnection(instance);
log.info("Redis实例连接修复成功: {}", instance.getInstanceId());

} catch (Exception e) {
log.error("Redis实例连接修复失败: {}", instance.getInstanceId(), e);
}
}

/**
* Attempts to repair a memory problem on the given Redis instance.
*
* Delegates to redisOptimizer.cleanupExpiredKeys and logs the outcome. An exception
* raised inside the try block is caught and logged as an error, so a failed repair is
* not reported to the caller.
*
* @param instance the Redis instance to clean up
*/
private void repairMemoryIssue(RedisInstance instance) {
try {
// Ask the optimizer to clean up expired keys; success is logged only if this call returns normally.
redisOptimizer.cleanupExpiredKeys(instance);
log.info("Redis实例内存修复成功: {}", instance.getInstanceId());

} catch (Exception e) {
log.error("Redis实例内存修复失败: {}", instance.getInstanceId(), e);
}
}

/**
* Attempts to repair a performance problem on the given Redis instance.
*
* Delegates to redisOptimizer.adjustInstanceParameters and logs the outcome. An
* exception raised inside the try block is caught and logged as an error, so a failed
* repair is not reported to the caller.
*
* @param instance the Redis instance whose parameters should be adjusted
*/
private void repairPerformanceIssue(RedisInstance instance) {
try {
// Ask the optimizer to adjust instance parameters; success is logged only if this call returns normally.
redisOptimizer.adjustInstanceParameters(instance);
log.info("Redis实例性能修复成功: {}", instance.getInstanceId());

} catch (Exception e) {
log.error("Redis实例性能修复失败: {}", instance.getInstanceId(), e);
}
}

/**
* Attempts to repair a key-expiration problem on the given Redis instance.
*
* Delegates to redisOptimizer.adjustExpirationPolicy and logs the outcome. An exception
* raised inside the try block is caught and logged as an error, so a failed repair is
* not reported to the caller.
*
* @param instance the Redis instance whose expiration policy should be adjusted
*/
private void repairKeyExpirationIssue(RedisInstance instance) {
try {
// Ask the optimizer to adjust the expiration policy; success is logged only if this call returns normally.
redisOptimizer.adjustExpirationPolicy(instance);
log.info("Redis实例键过期修复成功: {}", instance.getInstanceId());

} catch (Exception e) {
log.error("Redis实例键过期修复失败: {}", instance.getInstanceId(), e);
}
}

/**
 * Packages one monitoring pass for a Redis instance into a {@code RedisMonitoringResult}
 * and stores it through {@code resultStorage}.
 *
 * <p>The stored record contains the instance ID, the current wall-clock time in
 * milliseconds, the collected metrics and the evaluated health status.
 *
 * @param instance     the Redis instance that was monitored
 * @param metrics      the metrics collected for that instance
 * @param healthStatus the health evaluation derived from those metrics
 */
private void recordMonitoringResult(RedisInstance instance, RedisMetrics metrics, RedisHealthStatus healthStatus) {
    RedisMonitoringResult snapshot = new RedisMonitoringResult();

    // Identify which instance was observed and when.
    snapshot.setInstanceId(instance.getInstanceId());
    snapshot.setTimestamp(System.currentTimeMillis());

    // Attach the observations themselves.
    snapshot.setMetrics(metrics);
    snapshot.setHealthStatus(healthStatus);

    resultStorage.storeRedisResult(snapshot);
}
}

3.2 Redis指标收集器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
/**
 * Gathers a point-in-time {@code RedisMetrics} snapshot for a single Redis instance by
 * querying {@code CloudAPIClient}.
 *
 * <p>Metrics are collected in five groups: basic, performance, memory, key space and
 * replication. Each group has its own try/catch, so a failure inside one group is logged
 * and leaves the rest of that group unset, while the remaining groups are still collected.
 *
 * <p>Derived ratios (memory usage, peak memory usage, hit ratio) are computed defensively:
 * when the denominator is not positive a documented fallback value is stored instead of
 * {@code NaN} or {@code Infinity}, which would otherwise poison averages computed from the
 * stored metrics.
 */
@Component
public class RedisMetricsCollector {

    /** Memory-usage ratio stored when the reported maximum memory is not positive. */
    private static final double MEMORY_USAGE_WITHOUT_LIMIT = 0.0;

    /**
     * Hit ratio stored when there have been no key lookups at all. With zero lookups there
     * are also zero misses, so the instance is reported as fully hitting rather than as a
     * cache-miss problem.
     */
    private static final double HIT_RATIO_WITHOUT_LOOKUPS = 1.0;

    @Autowired
    private RedisConnectionManager connectionManager;

    @Autowired
    private CloudAPIClient cloudAPIClient;

    /**
     * Collects all metric groups for the given instance.
     *
     * @param instance the Redis instance to query; expected to be non-null
     * @return a new {@code RedisMetrics}; groups whose collection failed are left partially
     *         or fully unset
     */
    public RedisMetrics collectMetrics(RedisInstance instance) {
        RedisMetrics metrics = new RedisMetrics();

        try {
            collectBasicMetrics(instance, metrics);
            collectPerformanceMetrics(instance, metrics);
            collectMemoryMetrics(instance, metrics);
            collectKeyValueMetrics(instance, metrics);
            collectReplicationMetrics(instance, metrics);
        } catch (Exception e) {
            // Each group already handles its own failures; this is a last line of defence.
            log.error("Redis指标收集失败: {}", instance.getInstanceId(), e);
        }

        return metrics;
    }

    /**
     * Collects instance status, instance type, Redis version and uptime.
     *
     * @param instance the Redis instance to query
     * @param metrics  the snapshot to populate
     */
    private void collectBasicMetrics(RedisInstance instance, RedisMetrics metrics) {
        try {
            String instanceStatus = cloudAPIClient.getRedisInstanceStatus(instance.getInstanceId());
            metrics.setInstanceStatus(instanceStatus);

            String instanceType = cloudAPIClient.getRedisInstanceType(instance.getInstanceId());
            metrics.setInstanceType(instanceType);

            String redisVersion = cloudAPIClient.getRedisVersion(instance.getInstanceId());
            metrics.setRedisVersion(redisVersion);

            long uptime = cloudAPIClient.getRedisUptime(instance.getInstanceId());
            metrics.setUptime(uptime);

        } catch (Exception e) {
            log.error("Redis基础指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects throughput, response times, client counts and the slow-log count.
     *
     * @param instance the Redis instance to query
     * @param metrics  the snapshot to populate
     */
    private void collectPerformanceMetrics(RedisInstance instance, RedisMetrics metrics) {
        try {
            double opsPerSecond = cloudAPIClient.getRedisOpsPerSecond(instance.getInstanceId());
            metrics.setOpsPerSecond(opsPerSecond);

            double avgResponseTime = cloudAPIClient.getRedisAverageResponseTime(instance.getInstanceId());
            metrics.setAverageResponseTime(avgResponseTime);

            double maxResponseTime = cloudAPIClient.getRedisMaxResponseTime(instance.getInstanceId());
            metrics.setMaxResponseTime(maxResponseTime);

            int connectedClients = cloudAPIClient.getRedisConnectedClients(instance.getInstanceId());
            metrics.setConnectedClients(connectedClients);

            int blockedClients = cloudAPIClient.getRedisBlockedClients(instance.getInstanceId());
            metrics.setBlockedClients(blockedClients);

            int slowLogCount = cloudAPIClient.getRedisSlowLogCount(instance.getInstanceId());
            metrics.setSlowLogCount(slowLogCount);

        } catch (Exception e) {
            log.error("Redis性能指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects used, maximum and peak memory, the fragmentation ratio, and the derived
     * current and peak usage ratios.
     *
     * <p>Both usage ratios are {@code used / max}. When the reported maximum is not
     * positive, {@link #MEMORY_USAGE_WITHOUT_LIMIT} is stored instead of dividing.
     * NOTE(review): a maximum of 0 is presumably how an instance without a memory limit is
     * reported; confirm against the cloud API.
     *
     * @param instance the Redis instance to query
     * @param metrics  the snapshot to populate
     */
    private void collectMemoryMetrics(RedisInstance instance, RedisMetrics metrics) {
        try {
            long usedMemory = cloudAPIClient.getRedisUsedMemory(instance.getInstanceId());
            metrics.setUsedMemory(usedMemory);

            long maxMemory = cloudAPIClient.getRedisMaxMemory(instance.getInstanceId());
            metrics.setMaxMemory(maxMemory);

            // Guarded division: avoids Infinity/NaN when no positive maximum is reported.
            metrics.setMemoryUsage(ratioOrFallback(usedMemory, maxMemory, MEMORY_USAGE_WITHOUT_LIMIT));

            double memoryFragmentationRatio = cloudAPIClient.getRedisMemoryFragmentationRatio(instance.getInstanceId());
            metrics.setMemoryFragmentationRatio(memoryFragmentationRatio);

            long usedMemoryPeak = cloudAPIClient.getRedisUsedMemoryPeak(instance.getInstanceId());
            metrics.setUsedMemoryPeak(usedMemoryPeak);

            metrics.setMemoryUsagePeak(ratioOrFallback(usedMemoryPeak, maxMemory, MEMORY_USAGE_WITHOUT_LIMIT));

        } catch (Exception e) {
            log.error("Redis内存指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects key counts, key-space hits and misses, the derived hit ratio, and the
     * evicted-key count.
     *
     * <p>The hit ratio is {@code hits / (hits + misses)}. When there have been no lookups
     * the ratio is undefined; {@link #HIT_RATIO_WITHOUT_LOOKUPS} is stored instead of
     * {@code NaN}.
     *
     * @param instance the Redis instance to query
     * @param metrics  the snapshot to populate
     */
    private void collectKeyValueMetrics(RedisInstance instance, RedisMetrics metrics) {
        try {
            int totalKeys = cloudAPIClient.getRedisTotalKeys(instance.getInstanceId());
            metrics.setTotalKeys(totalKeys);

            int expiredKeys = cloudAPIClient.getRedisExpiredKeys(instance.getInstanceId());
            metrics.setExpiredKeys(expiredKeys);

            long keyspaceHits = cloudAPIClient.getRedisKeyspaceHits(instance.getInstanceId());
            metrics.setKeyspaceHits(keyspaceHits);

            long keyspaceMisses = cloudAPIClient.getRedisKeyspaceMisses(instance.getInstanceId());
            metrics.setKeyspaceMisses(keyspaceMisses);

            // Sum in double so two very large counters cannot overflow a long.
            double lookups = (double) keyspaceHits + (double) keyspaceMisses;
            metrics.setHitRatio(ratioOrFallback(keyspaceHits, lookups, HIT_RATIO_WITHOUT_LOOKUPS));

            int evictedKeys = cloudAPIClient.getRedisEvictedKeys(instance.getInstanceId());
            metrics.setEvictedKeys(evictedKeys);

        } catch (Exception e) {
            log.error("Redis键值指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Collects the replication role, number of connected replicas, replication lag and
     * replication status.
     *
     * @param instance the Redis instance to query
     * @param metrics  the snapshot to populate
     */
    private void collectReplicationMetrics(RedisInstance instance, RedisMetrics metrics) {
        try {
            String replicationRole = cloudAPIClient.getRedisReplicationRole(instance.getInstanceId());
            metrics.setReplicationRole(replicationRole);

            int connectedSlaves = cloudAPIClient.getRedisConnectedSlaves(instance.getInstanceId());
            metrics.setConnectedSlaves(connectedSlaves);

            long replicationLag = cloudAPIClient.getRedisReplicationLag(instance.getInstanceId());
            metrics.setReplicationLag(replicationLag);

            String replicationStatus = cloudAPIClient.getRedisReplicationStatus(instance.getInstanceId());
            metrics.setReplicationStatus(replicationStatus);

        } catch (Exception e) {
            log.error("Redis复制指标收集失败: {}", instance.getInstanceId(), e);
        }
    }

    /**
     * Divides {@code numerator} by {@code denominator}, returning {@code fallback} when the
     * denominator is not positive so callers never store {@code NaN} or {@code Infinity}.
     *
     * @param numerator   the value being measured
     * @param denominator the total it is measured against
     * @param fallback    the value to use when {@code denominator <= 0}
     * @return {@code numerator / denominator}, or {@code fallback}
     */
    private static double ratioOrFallback(double numerator, double denominator, double fallback) {
        return denominator > 0 ? numerator / denominator : fallback;
    }
}

四、企业级RDS-Redis巡检方案

4.1 巡检配置管理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/**
 * CRUD service for {@code RDSRedisInspectionConfig} objects, backed by
 * {@code InspectionConfigRepository}.
 *
 * <p>Configurations are validated before they are persisted. The save, update and delete
 * operations wrap every failure raised inside them (including validation and not-found
 * errors) in their own operation-specific exception, with the original exception as the
 * cause.
 */
@Service
public class RDSRedisInspectionConfigService {

    @Autowired
    private InspectionConfigRepository configRepository;

    /**
     * Looks up a configuration by ID.
     *
     * @param configId the ID of the configuration to load
     * @return the stored configuration
     * @throws ConfigNotFoundException if no configuration with that ID exists
     */
    public RDSRedisInspectionConfig getConfig(String configId) {
        return configRepository.findById(configId)
                .orElseThrow(() -> new ConfigNotFoundException("RDS-Redis巡检配置不存在: " + configId));
    }

    /**
     * Validates and persists a configuration.
     *
     * @param config the configuration to save; its own config ID is used as the key
     * @throws ConfigSaveException if validation or persistence fails; the cause carries the
     *                             original error
     */
    public void saveConfig(RDSRedisInspectionConfig config) {
        try {
            validateConfig(config);
            configRepository.save(config);

            log.info("RDS-Redis巡检配置保存成功: {}", config.getConfigId());

        } catch (Exception e) {
            log.error("RDS-Redis巡检配置保存失败", e);
            throw new ConfigSaveException("RDS-Redis巡检配置保存失败", e);
        }
    }

    /**
     * Replaces the stored configuration identified by {@code configId}.
     *
     * <p>The {@code configId} argument is authoritative: it is written onto {@code config}
     * before validation, so a payload that omits its own ID (or carries a different one) is
     * validated and saved under {@code configId} rather than being rejected for a missing ID.
     *
     * @param configId the ID of the existing configuration to overwrite
     * @param config   the new configuration content
     * @throws ConfigUpdateException if the configuration does not exist, is invalid, or
     *                               cannot be persisted; the cause carries the original error
     */
    public void updateConfig(String configId, RDSRedisInspectionConfig config) {
        try {
            if (!configRepository.existsById(configId)) {
                throw new ConfigNotFoundException("RDS-Redis巡检配置不存在: " + configId);
            }

            // Stamp the target ID first so validation sees the ID the config will be saved under.
            requirePresent(config);
            config.setConfigId(configId);
            validateConfig(config);

            configRepository.save(config);

            log.info("RDS-Redis巡检配置更新成功: {}", configId);

        } catch (Exception e) {
            log.error("RDS-Redis巡检配置更新失败", e);
            throw new ConfigUpdateException("RDS-Redis巡检配置更新失败", e);
        }
    }

    /**
     * Deletes the configuration identified by {@code configId}.
     *
     * @param configId the ID of the configuration to delete
     * @throws ConfigDeleteException if the configuration does not exist or cannot be
     *                               deleted; the cause carries the original error
     */
    public void deleteConfig(String configId) {
        try {
            if (!configRepository.existsById(configId)) {
                throw new ConfigNotFoundException("RDS-Redis巡检配置不存在: " + configId);
            }

            configRepository.deleteById(configId);

            log.info("RDS-Redis巡检配置删除成功: {}", configId);

        } catch (Exception e) {
            log.error("RDS-Redis巡检配置删除失败", e);
            throw new ConfigDeleteException("RDS-Redis巡检配置删除失败", e);
        }
    }

    /**
     * Returns every stored configuration.
     *
     * @return whatever {@code configRepository.findAll()} returns
     */
    public List<RDSRedisInspectionConfig> getAllConfigs() {
        return configRepository.findAll();
    }

    /**
     * Checks that a configuration is complete enough to be persisted.
     *
     * <p>Required: a non-empty config ID, at least one RDS instance, at least one Redis
     * instance, a positive inspection interval, and non-empty alert thresholds. Checks run
     * in that order and the first violation is reported.
     *
     * @param config the configuration to check
     * @throws ConfigValidationException if {@code config} is null or any requirement is not met
     */
    private void validateConfig(RDSRedisInspectionConfig config) {
        requirePresent(config);

        if (config.getConfigId() == null || config.getConfigId().isEmpty()) {
            throw new ConfigValidationException("配置ID不能为空");
        }

        if (config.getRdsInstances() == null || config.getRdsInstances().isEmpty()) {
            throw new ConfigValidationException("RDS实例不能为空");
        }

        if (config.getRedisInstances() == null || config.getRedisInstances().isEmpty()) {
            throw new ConfigValidationException("Redis实例不能为空");
        }

        if (config.getInspectionInterval() <= 0) {
            throw new ConfigValidationException("巡检间隔必须大于0");
        }

        if (config.getAlertThresholds() == null || config.getAlertThresholds().isEmpty()) {
            throw new ConfigValidationException("告警阈值不能为空");
        }
    }

    /**
     * Rejects a missing configuration with a validation error instead of letting a
     * {@code NullPointerException} surface from a later getter call.
     *
     * @param config the configuration to check
     * @throws ConfigValidationException if {@code config} is null
     */
    private static void requirePresent(RDSRedisInspectionConfig config) {
        if (config == null) {
            throw new ConfigValidationException("巡检配置不能为空");
        }
    }
}

4.2 巡检报告生成

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
/**
 * Builds {@code RDSRedisInspectionReport} objects from stored RDS and Redis monitoring
 * results for one environment and time window.
 *
 * <p>A report consists of per-system statistics, trend analysis, a breakdown of issue
 * types, a list of recommendations and a plain-text summary. The individual analysis steps
 * are best-effort: a failure inside one of them is logged and yields a partially filled
 * section instead of aborting the whole report.
 *
 * <p>Usage and ratio metrics are treated as fractions in the range 0..1 throughout this
 * class: they are compared against 0.8 thresholds and multiplied by 100 for display.
 */
@Service
public class RDSRedisInspectionReportService {

    /** Health rate below which a stability recommendation is issued. */
    private static final double MIN_HEALTH_RATE = 0.8;

    /** Average RDS CPU usage above which a performance recommendation is issued. */
    private static final double MAX_AVERAGE_CPU_USAGE = 0.8;

    /** Average Redis hit ratio below which a cache recommendation is issued. */
    private static final double MIN_AVERAGE_HIT_RATIO = 0.8;

    /** The second half must exceed the first by this factor to count as increasing. */
    private static final double TREND_UP_FACTOR = 1.1;

    /** The second half must fall below the first by this factor to count as decreasing. */
    private static final double TREND_DOWN_FACTOR = 0.9;

    /** Placeholder stored as "most common exception" when nothing unhealthy was seen. */
    private static final String NO_EXCEPTION = "无";

    /** Bucket used for unhealthy results that carry no issue type. */
    private static final String UNKNOWN_ISSUE_TYPE = "UNKNOWN";

    @Autowired
    private InspectionResultStorageService resultStorageService;

    @Autowired
    private ReportTemplateService templateService;

    /**
     * Generates an inspection report for one environment over a time window.
     *
     * @param environment the environment whose results are reported
     * @param startTime   start of the reporting window
     * @param endTime     end of the reporting window
     * @return the populated report
     * @throws ReportGenerationException if loading results or assembling the report fails;
     *                                   the cause carries the original error
     */
    public RDSRedisInspectionReport generateReport(String environment, Date startTime, Date endTime) {
        RDSRedisInspectionReport report = new RDSRedisInspectionReport();

        try {
            // Report header.
            report.setReportId(generateReportId());
            report.setEnvironment(environment);
            report.setStartTime(startTime);
            report.setEndTime(endTime);
            report.setGenerateTime(new Date());

            // Raw inspection results for the window.
            List<RDSMonitoringResult> rdsResults =
                    resultStorageService.getRDSResultsByEnvironment(environment, startTime, endTime);
            List<RedisMonitoringResult> redisResults =
                    resultStorageService.getRedisResultsByEnvironment(environment, startTime, endTime);

            // Derived sections. Recommendations and the summary read the sections set before them.
            report.setRdsStatistics(generateRDSStatistics(rdsResults));
            report.setRedisStatistics(generateRedisStatistics(redisResults));
            report.setPerformanceAnalysis(generatePerformanceAnalysis(rdsResults, redisResults));
            report.setExceptionAnalysis(generateExceptionAnalysis(rdsResults, redisResults));
            report.setRecommendations(generateRecommendations(report));
            report.setSummary(generateSummary(report));

            log.info("RDS-Redis巡检报告生成成功: {}", report.getReportId());

        } catch (Exception e) {
            log.error("RDS-Redis巡检报告生成失败", e);
            throw new ReportGenerationException("RDS-Redis巡检报告生成失败", e);
        }

        return report;
    }

    /**
     * Aggregates RDS results into counts, a health rate and metric averages.
     *
     * <p>The health rate is only set when there is at least one result. Averages over an
     * empty list are 0.0. A failure while aggregating is logged and the statistics filled
     * so far are returned.
     *
     * @param results the RDS monitoring results in the reporting window
     * @return the aggregated statistics
     */
    private RDSStatistics generateRDSStatistics(List<RDSMonitoringResult> results) {
        RDSStatistics stats = new RDSStatistics();

        try {
            int total = results.size();
            stats.setTotalInspections(total);

            int healthy = (int) results.stream()
                    .filter(result -> result.getHealthStatus().isHealthy())
                    .count();
            stats.setHealthyInspections(healthy);
            // Every result is either healthy or not, so the remainder is the unhealthy count.
            stats.setUnhealthyInspections(total - healthy);

            if (total > 0) {
                stats.setHealthRate((double) healthy / total);
            }

            stats.setAverageCpuUsage(average(results, result -> result.getMetrics().getCpuUsage()));
            stats.setAverageMemoryUsage(average(results, result -> result.getMetrics().getMemoryUsage()));
            stats.setAverageQps(average(results, result -> result.getMetrics().getQps()));
            stats.setAverageLatency(average(results, result -> result.getMetrics().getAverageLatency()));
            stats.setAverageDiskUsage(average(results, result -> result.getMetrics().getDiskUsage()));

        } catch (Exception e) {
            log.error("RDS统计信息生成失败", e);
        }

        return stats;
    }

    /**
     * Aggregates Redis results into counts, a health rate and metric averages.
     *
     * <p>The health rate is only set when there is at least one result. Averages over an
     * empty list are 0.0. A failure while aggregating is logged and the statistics filled
     * so far are returned.
     *
     * @param results the Redis monitoring results in the reporting window
     * @return the aggregated statistics
     */
    private RedisStatistics generateRedisStatistics(List<RedisMonitoringResult> results) {
        RedisStatistics stats = new RedisStatistics();

        try {
            int total = results.size();
            stats.setTotalInspections(total);

            int healthy = (int) results.stream()
                    .filter(result -> result.getHealthStatus().isHealthy())
                    .count();
            stats.setHealthyInspections(healthy);
            // Every result is either healthy or not, so the remainder is the unhealthy count.
            stats.setUnhealthyInspections(total - healthy);

            if (total > 0) {
                stats.setHealthRate((double) healthy / total);
            }

            stats.setAverageMemoryUsage(average(results, result -> result.getMetrics().getMemoryUsage()));
            stats.setAverageHitRatio(average(results, result -> result.getMetrics().getHitRatio()));
            stats.setAverageResponseTime(average(results, result -> result.getMetrics().getAverageResponseTime()));
            stats.setAverageOpsPerSecond(average(results, result -> result.getMetrics().getOpsPerSecond()));
            stats.setAverageTotalKeys(average(results, result -> result.getMetrics().getTotalKeys()));

        } catch (Exception e) {
            log.error("Redis统计信息生成失败", e);
        }

        return stats;
    }

    /**
     * Derives trend directions for key RDS and Redis metrics and an overall score.
     *
     * <p>Trends are computed over the results in the order they are given. A failure is
     * logged and the analysis filled so far is returned.
     *
     * @param rdsResults   the RDS monitoring results in the reporting window
     * @param redisResults the Redis monitoring results in the reporting window
     * @return the performance analysis
     */
    private PerformanceAnalysis generatePerformanceAnalysis(List<RDSMonitoringResult> rdsResults,
                                                            List<RedisMonitoringResult> redisResults) {
        PerformanceAnalysis analysis = new PerformanceAnalysis();

        try {
            analysis.setRdsCpuUsageTrend(
                    analyzeTrend(series(rdsResults, result -> result.getMetrics().getCpuUsage())));
            analysis.setRdsLatencyTrend(
                    analyzeTrend(series(rdsResults, result -> result.getMetrics().getAverageLatency())));

            analysis.setRedisHitRatioTrend(
                    analyzeTrend(series(redisResults, result -> result.getMetrics().getHitRatio())));
            analysis.setRedisResponseTimeTrend(
                    analyzeTrend(series(redisResults, result -> result.getMetrics().getAverageResponseTime())));

            analysis.setOverallPerformanceScore(calculateOverallPerformanceScore(rdsResults, redisResults));

        } catch (Exception e) {
            log.error("性能分析生成失败", e);
        }

        return analysis;
    }

    /**
     * Counts unhealthy results per issue type for RDS and Redis and records the most
     * frequent type of each.
     *
     * <p>An unhealthy result without an issue type is counted under
     * {@link #UNKNOWN_ISSUE_TYPE} instead of aborting the analysis. When no unhealthy
     * result exists, the "most common" field is set to {@link #NO_EXCEPTION}.
     *
     * @param rdsResults   the RDS monitoring results in the reporting window
     * @param redisResults the Redis monitoring results in the reporting window
     * @return the exception analysis
     */
    private ExceptionAnalysis generateExceptionAnalysis(List<RDSMonitoringResult> rdsResults,
                                                        List<RedisMonitoringResult> redisResults) {
        ExceptionAnalysis analysis = new ExceptionAnalysis();

        try {
            Map<String, Integer> rdsExceptionCount = new HashMap<>();
            for (RDSMonitoringResult result : rdsResults) {
                if (!result.getHealthStatus().isHealthy()) {
                    String issueType = result.getHealthStatus().getIssueType() == null
                            ? UNKNOWN_ISSUE_TYPE
                            : result.getHealthStatus().getIssueType().name();
                    rdsExceptionCount.merge(issueType, 1, Integer::sum);
                }
            }
            analysis.setRdsExceptionCount(rdsExceptionCount);

            Map<String, Integer> redisExceptionCount = new HashMap<>();
            for (RedisMonitoringResult result : redisResults) {
                if (!result.getHealthStatus().isHealthy()) {
                    String issueType = result.getHealthStatus().getIssueType() == null
                            ? UNKNOWN_ISSUE_TYPE
                            : result.getHealthStatus().getIssueType().name();
                    redisExceptionCount.merge(issueType, 1, Integer::sum);
                }
            }
            analysis.setRedisExceptionCount(redisExceptionCount);

            analysis.setMostCommonRdsException(mostFrequent(rdsExceptionCount));
            analysis.setMostCommonRedisException(mostFrequent(redisExceptionCount));

        } catch (Exception e) {
            log.error("异常分析生成失败", e);
        }

        return analysis;
    }

    /**
     * Turns the report's statistics and exception analysis into recommendations.
     *
     * <p>Health-rate and hit-ratio recommendations are only issued when at least one
     * inspection was recorded: with no data those values are just unset defaults, not
     * evidence of a problem. A failure is logged and the recommendations collected so far
     * are returned.
     *
     * @param report a report whose statistics and exception analysis are already set
     * @return the recommendations, in the order RDS, Redis, exception handling; never null
     */
    private List<Recommendation> generateRecommendations(RDSRedisInspectionReport report) {
        List<Recommendation> recommendations = new ArrayList<>();

        try {
            RDSStatistics rdsStats = report.getRdsStatistics();
            boolean hasRdsData = rdsStats.getTotalInspections() > 0;

            if (hasRdsData && rdsStats.getHealthRate() < MIN_HEALTH_RATE) {
                recommendations.add(newRecommendation(
                        RecommendationType.RDS_STABILITY,
                        Priority.HIGH,
                        "提升RDS稳定性",
                        "当前RDS健康率为" + percent(rdsStats.getHealthRate()) + "%,建议检查RDS配置和性能"));
            }

            if (rdsStats.getAverageCpuUsage() > MAX_AVERAGE_CPU_USAGE) {
                recommendations.add(newRecommendation(
                        RecommendationType.RDS_PERFORMANCE,
                        Priority.HIGH,
                        "优化RDS性能",
                        "平均CPU使用率为" + percent(rdsStats.getAverageCpuUsage()) + "%,建议优化查询和调整配置"));
            }

            RedisStatistics redisStats = report.getRedisStatistics();
            boolean hasRedisData = redisStats.getTotalInspections() > 0;

            if (hasRedisData && redisStats.getHealthRate() < MIN_HEALTH_RATE) {
                recommendations.add(newRecommendation(
                        RecommendationType.REDIS_STABILITY,
                        Priority.HIGH,
                        "提升Redis稳定性",
                        "当前Redis健康率为" + percent(redisStats.getHealthRate()) + "%,建议检查Redis配置和内存使用"));
            }

            if (hasRedisData && redisStats.getAverageHitRatio() < MIN_AVERAGE_HIT_RATIO) {
                recommendations.add(newRecommendation(
                        RecommendationType.REDIS_PERFORMANCE,
                        Priority.MEDIUM,
                        "优化Redis命中率",
                        "平均命中率为" + percent(redisStats.getAverageHitRatio()) + "%,建议优化缓存策略"));
            }

            ExceptionAnalysis exceptionAnalysis = report.getExceptionAnalysis();

            if (isRealException(exceptionAnalysis.getMostCommonRdsException())) {
                recommendations.add(newRecommendation(
                        RecommendationType.RDS_EXCEPTION_HANDLING,
                        Priority.MEDIUM,
                        "处理RDS常见异常",
                        "最常见的RDS异常是:" + exceptionAnalysis.getMostCommonRdsException() + ",建议优先处理"));
            }

            if (isRealException(exceptionAnalysis.getMostCommonRedisException())) {
                recommendations.add(newRecommendation(
                        RecommendationType.REDIS_EXCEPTION_HANDLING,
                        Priority.MEDIUM,
                        "处理Redis常见异常",
                        "最常见的Redis异常是:" + exceptionAnalysis.getMostCommonRedisException() + ",建议优先处理"));
            }

        } catch (Exception e) {
            log.error("建议生成失败", e);
        }

        return recommendations;
    }

    /**
     * Renders a short plain-text summary of the report.
     *
     * <p>Lists the environment, the reporting window, health rates, average RDS CPU usage
     * and average Redis hit ratio, followed by the title of the first recommendation if
     * there is one.
     *
     * @param report a report whose statistics and recommendations are already set
     * @return the multi-line summary text
     */
    private String generateSummary(RDSRedisInspectionReport report) {
        StringBuilder summary = new StringBuilder();

        summary.append("RDS-Redis巡检报告摘要\n");
        summary.append("环境: ").append(report.getEnvironment()).append("\n");
        summary.append("统计期间: ").append(report.getStartTime())
                .append(" - ").append(report.getEndTime()).append("\n");
        summary.append("RDS健康率: ")
                .append(percent(report.getRdsStatistics().getHealthRate())).append("%\n");
        summary.append("Redis健康率: ")
                .append(percent(report.getRedisStatistics().getHealthRate())).append("%\n");
        summary.append("RDS平均CPU使用率: ")
                .append(percent(report.getRdsStatistics().getAverageCpuUsage())).append("%\n");
        summary.append("Redis平均命中率: ")
                .append(percent(report.getRedisStatistics().getAverageHitRatio())).append("%\n");

        if (!report.getRecommendations().isEmpty()) {
            summary.append("主要建议: ").append(report.getRecommendations().get(0).getTitle()).append("\n");
        }

        return summary.toString();
    }

    /**
     * Classifies a series as increasing, decreasing or stable by comparing the mean of its
     * second half with the mean of its first half.
     *
     * <p>A change of more than 10% in either direction counts as a trend. Series with fewer
     * than two values are reported as stable.
     *
     * @param values the series, in chronological order
     * @return the detected trend direction
     */
    private TrendDirection analyzeTrend(List<Double> values) {
        if (values.size() < 2) {
            return TrendDirection.STABLE;
        }

        int middle = values.size() / 2;
        double firstHalf = mean(values.subList(0, middle));
        double secondHalf = mean(values.subList(middle, values.size()));

        if (secondHalf > firstHalf * TREND_UP_FACTOR) {
            return TrendDirection.INCREASING;
        }
        if (secondHalf < firstHalf * TREND_DOWN_FACTOR) {
            return TrendDirection.DECREASING;
        }
        return TrendDirection.STABLE;
    }

    /**
     * Computes an overall performance score from RDS and Redis results.
     *
     * <p>Each system that has results contributes one sub-score. The RDS sub-score weights
     * low CPU usage and low latency equally; the Redis sub-score weights hit ratio and low
     * response time equally. The overall score is the mean of the sub-scores that exist, so
     * a system with no results does not drag the score down. With no results at all the
     * score is 0.0.
     *
     * <p>NOTE(review): latency is capped at 1000 and response time at 100 before being
     * normalised, presumably milliseconds; confirm the units.
     *
     * @param rdsResults   the RDS monitoring results in the reporting window
     * @param redisResults the Redis monitoring results in the reporting window
     * @return the mean of the available sub-scores, or 0.0 when there is no data
     */
    private double calculateOverallPerformanceScore(List<RDSMonitoringResult> rdsResults,
                                                    List<RedisMonitoringResult> redisResults) {
        double scoreSum = 0.0;
        int scoredSystems = 0;

        if (!rdsResults.isEmpty()) {
            double avgCpuUsage = average(rdsResults, result -> result.getMetrics().getCpuUsage());
            double avgLatency = average(rdsResults, result -> result.getMetrics().getAverageLatency());

            scoreSum += (1.0 - avgCpuUsage) * 0.5 + (1.0 - Math.min(avgLatency / 1000.0, 1.0)) * 0.5;
            scoredSystems++;
        }

        if (!redisResults.isEmpty()) {
            double avgHitRatio = average(redisResults, result -> result.getMetrics().getHitRatio());
            double avgResponseTime = average(redisResults, result -> result.getMetrics().getAverageResponseTime());

            scoreSum += avgHitRatio * 0.5 + (1.0 - Math.min(avgResponseTime / 100.0, 1.0)) * 0.5;
            scoredSystems++;
        }

        return scoredSystems == 0 ? 0.0 : scoreSum / scoredSystems;
    }

    /**
     * Builds a report ID from the current time in milliseconds and the current thread ID.
     *
     * @return an ID of the form {@code rds-redis-report-<millis>-<threadId>}
     */
    private String generateReportId() {
        return "rds-redis-report-" + System.currentTimeMillis() + "-" + Thread.currentThread().getId();
    }

    /** Returns the mean of one numeric metric over all items, or 0.0 for an empty list. */
    private static <T> double average(List<T> items, java.util.function.ToDoubleFunction<? super T> metric) {
        return items.stream().mapToDouble(metric).average().orElse(0.0);
    }

    /** Extracts one numeric metric from every item, preserving the list order. */
    private static <T> List<Double> series(List<T> items, java.util.function.ToDoubleFunction<? super T> metric) {
        return items.stream().mapToDouble(metric).boxed().collect(Collectors.toList());
    }

    /** Returns the mean of the given values, or 0.0 for an empty list. */
    private static double mean(List<Double> values) {
        return values.stream().mapToDouble(Double::doubleValue).average().orElse(0.0);
    }

    /** Returns the key with the highest count, or {@link #NO_EXCEPTION} for an empty map. */
    private static String mostFrequent(Map<String, Integer> counts) {
        return counts.entrySet().stream()
                .max(Map.Entry.comparingByValue())
                .map(Map.Entry::getKey)
                .orElse(NO_EXCEPTION);
    }

    /** Tells whether a "most common exception" value names an actual issue type. */
    private static boolean isRealException(String mostCommon) {
        return mostCommon != null && !mostCommon.equals(NO_EXCEPTION);
    }

    /** Formats a 0..1 fraction as a percentage number with two decimals, without the sign. */
    private static String percent(double fraction) {
        return String.format("%.2f", fraction * 100);
    }

    /** Creates a recommendation with the given type, priority, title and description. */
    private static Recommendation newRecommendation(RecommendationType type, Priority priority,
                                                    String title, String description) {
        Recommendation rec = new Recommendation();
        rec.setType(type);
        rec.setPriority(priority);
        rec.setTitle(title);
        rec.setDescription(description);
        return rec;
    }
}

五、最佳实践与总结

5.1 RDS-Redis自动巡检最佳实践

  1. 云数据库监控策略

    • 建立全面的RDS监控体系
    • 监控实例健康状态
    • 监控性能指标
  2. 缓存监控策略

    • 建立完善的Redis监控
    • 监控缓存命中率
    • 监控内存使用情况
  3. 性能优化策略

    • 优化数据库查询
    • 优化缓存策略
    • 监控资源使用率
  4. 异常处理机制

    • 建立完善的异常分类
    • 实现智能告警
    • 提供自动修复能力

5.2 架构师级云数据库运维技能

  1. 云数据库管理能力

    • 深入理解RDS架构
    • 掌握实例管理
    • 管理备份和恢复
  2. 缓存管理能力

    • 深入理解Redis架构
    • 掌握缓存策略
    • 管理内存和性能
  3. 性能调优能力

    • 优化数据库性能
    • 优化缓存性能
    • 调整配置参数
  4. 监控运维能力

    • 建立监控体系
    • 实现自动化运维
    • 持续优化改进

5.3 持续改进建议

  1. 监控体系完善

    • 完善监控指标
    • 优化告警策略
    • 提升监控精度
  2. 自动化程度提升

    • 实现更多自动修复
    • 优化巡检策略
    • 提升运维效率
  3. 知识积累

    • 建立运维知识库
    • 总结最佳实践
    • 形成标准化流程

总结

RDS-Redis自动巡检是企业级云数据库运维的核心能力,通过智能的云数据库监控策略、完善的缓存优化机制和系统化的性能管理,能够及时发现数据库异常,预防数据丢失,保障企业级应用的高可用性。本文从云数据库监控设计到缓存优化,从基础原理到企业级实践,系统梳理了RDS-Redis自动巡检的完整解决方案。

关键要点:

  1. 云数据库监控策略:全面的RDS监控和实例健康检查
  2. 缓存监控机制:完善的Redis监控和性能优化
  3. 性能优化方案:智能的性能监控和优化
  4. 企业级实践:配置管理、报告生成、持续改进

通过深入理解这些技术要点,架构师能够设计出完善的RDS-Redis自动巡检系统,提升云数据库的稳定性和可靠性,确保企业级应用的高可用性。