前言

线程死锁是生产环境中最严重的故障之一:一旦发生死锁,相关线程将永久阻塞,依赖这些线程的功能无法响应,严重影响业务正常运行。面对死锁问题,需要快速诊断、应急处理和根本解决。本文从死锁诊断到应急处理、从故障恢复到预防措施,系统梳理企业级死锁故障的完整解决方案。

一、死锁问题诊断架构

1.1 死锁诊断与处理架构

1.2 死锁处理流程

二、死锁诊断工具

2.1 死锁诊断器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
@Component
@Slf4j
public class DeadlockDiagnosticTool {

@Autowired
private ThreadMXBean threadMXBean;

@Autowired
private DeadlockAnalyzer deadlockAnalyzer;

@Autowired
private DeadlockReporter deadlockReporter;

/**
 * Runs a complete deadlock diagnosis.
 *
 * <p>Detection always runs. Only when a deadlock is found are the analysis, impact
 * assessment, recommendations and diagnostic report produced, in that order.
 *
 * @return the diagnostic result; if any step throws, the exception message is stored via
 *         {@code setError} and the partially populated result is returned
 */
public DeadlockDiagnosticResult diagnoseDeadlock() {
    DeadlockDiagnosticResult result = new DeadlockDiagnosticResult();
    result.setDiagnosisTime(LocalDateTime.now());

    try {
        // 1. Detect
        DeadlockDetectionResult detectionResult = detectDeadlock();
        result.setDetectionResult(detectionResult);

        if (detectionResult.hasDeadlock()) {
            // 2. Analyze
            DeadlockAnalysis analysis = analyzeDeadlock(detectionResult);
            result.setAnalysis(analysis);

            // 3. Assess impact
            ImpactAssessment impact = assessImpact(detectionResult);
            result.setImpact(impact);

            // 4. Recommendations. These must be stored BEFORE the report is generated,
            // because generateDiagnosticReport copies result.getRecommendations().
            List<String> recommendations = generateRecommendations(analysis);
            result.setRecommendations(recommendations);

            // 5. Report (now sees analysis, impact and recommendations)
            DeadlockDiagnosticReport report = generateDiagnosticReport(result);
            result.setReport(report);
        }

        return result;

    } catch (Exception e) {
        log.error("死锁诊断失败", e);
        result.setError(e.getMessage());
        return result;
    }
}

/**
 * Queries the JVM for deadlocked threads and collects their details.
 *
 * <p>Thread details are requested together with locked monitors and ownable synchronizers
 * (where the JVM supports them) and with stack traces; the single-argument
 * {@code getThreadInfo(long[])} overload returns none of these. Entries for threads that
 * terminated between detection and lookup come back as {@code null} and are dropped.
 *
 * @return the detection result; exceptions are logged and recorded via {@code setError}
 *         rather than propagated
 */
private DeadlockDetectionResult detectDeadlock() {
    DeadlockDetectionResult result = new DeadlockDetectionResult();

    try {
        boolean monitorsSupported = threadMXBean.isObjectMonitorUsageSupported();
        boolean synchronizersSupported = threadMXBean.isSynchronizerUsageSupported();

        // findDeadlockedThreads() throws UnsupportedOperationException when ownable
        // synchronizer monitoring is unavailable; fall back to monitor-only detection.
        long[] deadlockedThreads = synchronizersSupported
                ? threadMXBean.findDeadlockedThreads()
                : threadMXBean.findMonitorDeadlockedThreads();

        if (deadlockedThreads != null && deadlockedThreads.length > 0) {
            result.setHasDeadlock(true);
            result.setDeadlockedThreadCount(deadlockedThreads.length);

            ThreadInfo[] rawInfos = threadMXBean.getThreadInfo(
                    deadlockedThreads, monitorsSupported, synchronizersSupported);

            // Drop null entries (thread no longer alive) so later stages never see them.
            List<ThreadInfo> liveInfos = new ArrayList<>(rawInfos.length);
            for (ThreadInfo info : rawInfos) {
                if (info != null) {
                    liveInfos.add(info);
                }
            }
            ThreadInfo[] threadInfos = liveInfos.toArray(new ThreadInfo[0]);
            result.setThreadInfos(liveInfos);

            List<LockInfo> lockInfos = extractLockInfos(threadInfos);
            result.setLockInfos(lockInfos);

            DeadlockGraph deadlockGraph = buildDeadlockGraph(threadInfos, lockInfos);
            result.setDeadlockGraph(deadlockGraph);

            log.error("检测到死锁: 死锁线程数量={}", deadlockedThreads.length);
        } else {
            result.setHasDeadlock(false);
            log.info("未检测到死锁");
        }

    } catch (Exception e) {
        log.error("死锁检测异常", e);
        result.setError(e.getMessage());
    }

    return result;
}

/**
 * Builds the analysis of a detected deadlock: cause, pattern, severity and scope,
 * stamped with the current time.
 *
 * @param detectionResult the detection result to analyze
 * @return the populated analysis
 * @throws RuntimeException wrapping any exception raised by one of the analysis steps
 */
private DeadlockAnalysis analyzeDeadlock(DeadlockDetectionResult detectionResult) {
    try {
        DeadlockCause deadlockCause = analyzeDeadlockCause(detectionResult);
        DeadlockPattern deadlockPattern = analyzeDeadlockPattern(detectionResult);
        DeadlockSeverity deadlockSeverity = analyzeDeadlockSeverity(detectionResult);
        DeadlockScope deadlockScope = analyzeDeadlockScope(detectionResult);

        DeadlockAnalysis outcome = new DeadlockAnalysis();
        outcome.setCause(deadlockCause);
        outcome.setPattern(deadlockPattern);
        outcome.setSeverity(deadlockSeverity);
        outcome.setScope(deadlockScope);
        outcome.setAnalysisTime(LocalDateTime.now());
        return outcome;
    } catch (Exception failure) {
        log.error("死锁分析失败", failure);
        throw new RuntimeException("死锁分析失败", failure);
    }
}

/**
 * Classifies the cause of the deadlock by trying each heuristic in a fixed order
 * (lock order, resource competition, nested locks, lock granularity); the first
 * heuristic that matches wins.
 *
 * @param detectionResult the detection result whose thread infos are inspected
 * @return the matched cause, or {@code DeadlockCause.UNKNOWN} if no heuristic matches
 */
private DeadlockCause analyzeDeadlockCause(DeadlockDetectionResult detectionResult) {
    final List<ThreadInfo> threads = detectionResult.getThreadInfos();

    final DeadlockCause cause;
    if (isLockOrderDeadlock(threads)) {
        cause = DeadlockCause.LOCK_ORDER;
    } else if (isResourceCompetitionDeadlock(threads)) {
        cause = DeadlockCause.RESOURCE_COMPETITION;
    } else if (isNestedLockDeadlock(threads)) {
        cause = DeadlockCause.NESTED_LOCK;
    } else if (isLockGranularityDeadlock(threads)) {
        cause = DeadlockCause.LOCK_GRANULARITY;
    } else {
        cause = DeadlockCause.UNKNOWN;
    }
    return cause;
}

/**
 * Checks whether the given threads form a circular wait.
 *
 * <p>The wait-for graph is built from {@link ThreadInfo#getLockOwnerId()}, which the JVM
 * fills in for both intrinsic monitors ({@code synchronized}) and ownable synchronizers.
 * This avoids depending on {@code getLockedSynchronizers()}, which excludes monitors, and
 * on matching locks by identity hash code, which is not guaranteed to be unique.
 *
 * @param threadInfos the deadlocked threads; {@code null} entries are ignored
 * @return {@code true} if the wait-for graph contains a cycle
 */
private boolean isLockOrderDeadlock(List<ThreadInfo> threadInfos) {
    Map<Long, Set<Long>> waitGraph = new HashMap<>();

    for (ThreadInfo threadInfo : threadInfos) {
        if (threadInfo == null) {
            continue;
        }
        // -1 means the thread is not blocked on a lock owned by another thread.
        long ownerId = threadInfo.getLockOwnerId();
        if (ownerId != -1) {
            waitGraph.computeIfAbsent(threadInfo.getThreadId(), id -> new HashSet<>())
                    .add(ownerId);
        }
    }

    return hasCycleInWaitGraph(waitGraph);
}

/**
 * Heuristic for resource competition: counts, per lock class name, how many locked
 * synchronizers the given threads hold in total.
 *
 * @param threadInfos the deadlocked threads
 * @return {@code true} if any lock class name was counted more than once
 */
private boolean isResourceCompetitionDeadlock(List<ThreadInfo> threadInfos) {
    Map<String, Integer> heldPerLockClass = new HashMap<>();

    for (ThreadInfo info : threadInfos) {
        LockInfo[] held = info.getLockedSynchronizers();
        if (held == null) {
            continue;
        }
        for (LockInfo lock : held) {
            heldPerLockClass.merge(lock.getClassName(), 1, Integer::sum);
        }
    }

    for (int occurrences : heldPerLockClass.values()) {
        if (occurrences > 1) {
            return true;
        }
    }
    return false;
}

/**
 * Heuristic for nested locking: for each thread, counts the stack frames whose method
 * name contains {@code "lock"} or {@code "synchronized"}.
 *
 * @param threadInfos the deadlocked threads
 * @return {@code true} if any thread has more than two such frames
 */
private boolean isNestedLockDeadlock(List<ThreadInfo> threadInfos) {
    for (ThreadInfo info : threadInfos) {
        StackTraceElement[] frames = info.getStackTrace();
        if (frames == null) {
            continue;
        }
        long lockFrames = Arrays.stream(frames)
                .map(StackTraceElement::getMethodName)
                .filter(method -> method.contains("lock") || method.contains("synchronized"))
                .count();
        if (lockFrames > 2) {
            return true; // too many lock-related frames on one stack
        }
    }
    return false;
}

/**
 * Heuristic for lock granularity problems.
 *
 * @param threadInfos the deadlocked threads
 * @return {@code true} if any thread reports more than three locked synchronizers
 */
private boolean isLockGranularityDeadlock(List<ThreadInfo> threadInfos) {
    return threadInfos.stream()
            .map(ThreadInfo::getLockedSynchronizers)
            .anyMatch(held -> held != null && held.length > 3);
}

/**
 * Classifies the deadlock pattern from the number of deadlocked threads and the number
 * of collected lock infos.
 *
 * <p>The cascade check (more than five threads) runs before the generic multi-thread
 * check. In the opposite order the cascade branch could only be reached with more than
 * five threads and at most two locks, which made it effectively dead code.
 *
 * @param detectionResult the detection result providing thread and lock counts
 * @return the matched pattern, or {@code DeadlockPattern.UNKNOWN}
 */
private DeadlockPattern analyzeDeadlockPattern(DeadlockDetectionResult detectionResult) {
    int threadCount = detectionResult.getDeadlockedThreadCount();
    int lockCount = detectionResult.getLockInfos().size();

    if (threadCount == 2 && lockCount == 2) {
        return DeadlockPattern.TWO_THREAD_TWO_LOCK;
    } else if (threadCount > 5) {
        return DeadlockPattern.CASCADE_DEADLOCK;
    } else if (threadCount > 2 && lockCount > 2) {
        return DeadlockPattern.MULTI_THREAD_MULTI_LOCK;
    } else {
        return DeadlockPattern.UNKNOWN;
    }
}

/**
 * Grades the deadlock by thread and lock counts. A level applies as soon as either
 * count reaches its threshold: 10 threads / 20 locks for CRITICAL, 5 / 10 for HIGH,
 * 3 / 5 for MEDIUM; anything below is LOW.
 *
 * @param detectionResult the detection result providing thread and lock counts
 * @return the severity level
 */
private DeadlockSeverity analyzeDeadlockSeverity(DeadlockDetectionResult detectionResult) {
    final int threads = detectionResult.getDeadlockedThreadCount();
    final int locks = detectionResult.getLockInfos().size();

    if (threads >= 10 || locks >= 20) {
        return DeadlockSeverity.CRITICAL;
    }
    if (threads >= 5 || locks >= 10) {
        return DeadlockSeverity.HIGH;
    }
    if (threads >= 3 || locks >= 5) {
        return DeadlockSeverity.MEDIUM;
    }
    return DeadlockSeverity.LOW;
}

/**
 * Derives the affected scope from thread names.
 *
 * <p>Each thread is put in exactly one category by the first matching substring, tested
 * in the order {@code "http"}, {@code "task"}, {@code "mq"}. The scope is then chosen
 * with the priority HTTP, then MQ, then TASK.
 *
 * @param detectionResult the detection result whose thread names are inspected
 * @return the scope, or {@code DeadlockScope.UNKNOWN} if no thread matched a category
 */
private DeadlockScope analyzeDeadlockScope(DeadlockDetectionResult detectionResult) {
    boolean sawHttp = false;
    boolean sawTask = false;
    boolean sawMq = false;

    for (ThreadInfo info : detectionResult.getThreadInfos()) {
        String name = info.getThreadName();
        if (name.contains("http")) {
            sawHttp = true;
        } else if (name.contains("task")) {
            sawTask = true;
        } else if (name.contains("mq")) {
            sawMq = true;
        }
    }

    if (sawHttp) {
        return DeadlockScope.WEB_SERVICE;
    }
    if (sawMq) {
        return DeadlockScope.MESSAGE_QUEUE;
    }
    if (sawTask) {
        return DeadlockScope.BACKGROUND_TASK;
    }
    return DeadlockScope.UNKNOWN;
}

/**
 * Assembles the overall impact assessment from the business, system and user
 * assessments, evaluated in that order.
 *
 * @param detectionResult the detection result to assess
 * @return the combined impact assessment
 */
private ImpactAssessment assessImpact(DeadlockDetectionResult detectionResult) {
    ImpactAssessment assessment = new ImpactAssessment();
    assessment.setBusinessImpact(assessBusinessImpact(detectionResult));
    assessment.setSystemImpact(assessSystemImpact(detectionResult));
    assessment.setUserImpact(assessUserImpact(detectionResult));
    return assessment;
}

/**
 * Rates the business impact by the number of deadlocked threads
 * (10 or more: CRITICAL, 5 or more: HIGH, 3 or more: MEDIUM, otherwise LOW).
 *
 * @param detectionResult the detection result providing the thread count
 * @return the business impact with level and description set
 */
private BusinessImpact assessBusinessImpact(DeadlockDetectionResult detectionResult) {
    BusinessImpact businessImpact = new BusinessImpact();
    final int deadlocked = detectionResult.getDeadlockedThreadCount();

    final ImpactLevel level;
    final String description;
    if (deadlocked >= 10) {
        level = ImpactLevel.CRITICAL;
        description = "业务完全无法响应";
    } else if (deadlocked >= 5) {
        level = ImpactLevel.HIGH;
        description = "业务响应严重延迟";
    } else if (deadlocked >= 3) {
        level = ImpactLevel.MEDIUM;
        description = "业务响应延迟";
    } else {
        level = ImpactLevel.LOW;
        description = "业务响应轻微延迟";
    }

    businessImpact.setLevel(level);
    businessImpact.setDescription(description);
    return businessImpact;
}

/**
 * Rates the system impact by the number of deadlocked threads
 * (10 or more: CRITICAL, 5 or more: HIGH, 3 or more: MEDIUM, otherwise LOW).
 *
 * @param detectionResult the detection result providing the thread count
 * @return the system impact with level and description set
 */
private SystemImpact assessSystemImpact(DeadlockDetectionResult detectionResult) {
    SystemImpact systemImpact = new SystemImpact();
    final int deadlocked = detectionResult.getDeadlockedThreadCount();

    final ImpactLevel level;
    final String description;
    if (deadlocked >= 10) {
        level = ImpactLevel.CRITICAL;
        description = "系统完全无法响应";
    } else if (deadlocked >= 5) {
        level = ImpactLevel.HIGH;
        description = "系统响应严重延迟";
    } else if (deadlocked >= 3) {
        level = ImpactLevel.MEDIUM;
        description = "系统响应延迟";
    } else {
        level = ImpactLevel.LOW;
        description = "系统响应轻微延迟";
    }

    systemImpact.setLevel(level);
    systemImpact.setDescription(description);
    return systemImpact;
}

/**
 * Rates the user-experience impact by the number of deadlocked threads
 * (10 or more: CRITICAL, 5 or more: HIGH, 3 or more: MEDIUM, otherwise LOW).
 *
 * @param detectionResult the detection result providing the thread count
 * @return the user impact with level and description set
 */
private UserImpact assessUserImpact(DeadlockDetectionResult detectionResult) {
    UserImpact userImpact = new UserImpact();
    final int deadlocked = detectionResult.getDeadlockedThreadCount();

    final ImpactLevel level;
    final String description;
    if (deadlocked >= 10) {
        level = ImpactLevel.CRITICAL;
        description = "用户无法使用系统";
    } else if (deadlocked >= 5) {
        level = ImpactLevel.HIGH;
        description = "用户使用体验严重下降";
    } else if (deadlocked >= 3) {
        level = ImpactLevel.MEDIUM;
        description = "用户使用体验下降";
    } else {
        level = ImpactLevel.LOW;
        description = "用户使用体验轻微下降";
    }

    userImpact.setLevel(level);
    userImpact.setDescription(description);
    return userImpact;
}

/**
 * Creates the diagnostic report by copying detection result, analysis, impact and
 * recommendations from the given result and attaching a generated text summary.
 *
 * @param result the diagnostic result to report on
 * @return the populated report, stamped with the current time
 */
private DeadlockDiagnosticReport generateDiagnosticReport(DeadlockDiagnosticResult result) {
    DeadlockDiagnosticReport diagnosticReport = new DeadlockDiagnosticReport();

    diagnosticReport.setReportTime(LocalDateTime.now());
    diagnosticReport.setDetectionResult(result.getDetectionResult());
    diagnosticReport.setAnalysis(result.getAnalysis());
    diagnosticReport.setImpact(result.getImpact());
    diagnosticReport.setRecommendations(result.getRecommendations());
    diagnosticReport.setSummary(generateReportSummary(result));

    return diagnosticReport;
}

/**
 * Renders a multi-line text summary of the diagnosis: a title line followed by one
 * "label: value" line each for diagnosis time, thread count, cause, pattern, severity,
 * scope and the three impact descriptions. Every line ends with a newline.
 *
 * @param result the diagnostic result; its detection result, analysis and impact are read
 * @return the summary text
 */
private String generateReportSummary(DeadlockDiagnosticResult result) {
    final DeadlockDetectionResult detected = result.getDetectionResult();
    final DeadlockAnalysis analyzed = result.getAnalysis();
    final ImpactAssessment assessed = result.getImpact();

    return String.join("\n",
            "死锁诊断报告摘要:",
            "检测时间: " + result.getDiagnosisTime(),
            "死锁线程数: " + detected.getDeadlockedThreadCount(),
            "死锁原因: " + analyzed.getCause().getDescription(),
            "死锁模式: " + analyzed.getPattern().getName(),
            "严重程度: " + analyzed.getSeverity().getDescription(),
            "影响范围: " + analyzed.getScope().getDescription(),
            "业务影响: " + assessed.getBusinessImpact().getDescription(),
            "系统影响: " + assessed.getSystemImpact().getDescription(),
            "用户影响: " + assessed.getUserImpact().getDescription())
            + "\n";
}

/**
 * Produces remediation advice in three groups, appended in this order: advice for the
 * deadlock cause, advice for CRITICAL or HIGH severity, and advice for the affected scope.
 *
 * @param analysis the deadlock analysis providing cause, severity and scope
 * @return the list of recommendation texts
 */
private List<String> generateRecommendations(DeadlockAnalysis analysis) {
    final DeadlockCause cause = analysis.getCause();
    final DeadlockSeverity severity = analysis.getSeverity();
    final DeadlockScope scope = analysis.getScope();

    List<String> advice = new ArrayList<>();

    // Cause-specific advice
    switch (cause) {
        case LOCK_ORDER:
            advice.addAll(Arrays.asList(
                    "统一锁的获取顺序,避免循环等待",
                    "使用锁排序机制,确保锁的获取顺序一致"));
            break;
        case RESOURCE_COMPETITION:
            advice.addAll(Arrays.asList(
                    "优化资源竞争,使用无锁数据结构",
                    "减少锁的持有时间,提高并发性能"));
            break;
        case NESTED_LOCK:
            advice.addAll(Arrays.asList(
                    "减少嵌套锁的使用,避免锁的层次过深",
                    "重构代码,减少锁的嵌套层次"));
            break;
        case LOCK_GRANULARITY:
            advice.addAll(Arrays.asList(
                    "优化锁的粒度,减少锁的持有数量",
                    "使用分段锁,提高并发性能"));
            break;
        default:
            advice.add("分析具体死锁场景,制定针对性解决方案");
            break;
    }

    // Severity-specific advice (only for the two highest levels)
    if (severity == DeadlockSeverity.CRITICAL) {
        advice.addAll(Arrays.asList(
                "立即中断死锁线程,恢复系统响应",
                "考虑服务重启,确保系统稳定"));
    } else if (severity == DeadlockSeverity.HIGH) {
        advice.addAll(Arrays.asList(
                "优先处理死锁线程,避免影响扩大",
                "监控系统状态,准备应急处理"));
    }

    // Scope-specific advice; scopes without a case add nothing
    switch (scope) {
        case WEB_SERVICE:
            advice.addAll(Arrays.asList(
                    "检查HTTP请求处理逻辑,优化锁的使用",
                    "考虑使用异步处理,减少锁竞争"));
            break;
        case MESSAGE_QUEUE:
            advice.addAll(Arrays.asList(
                    "检查消息处理逻辑,优化锁的使用",
                    "考虑使用无锁消息队列"));
            break;
        case BACKGROUND_TASK:
            advice.addAll(Arrays.asList(
                    "检查后台任务逻辑,优化锁的使用",
                    "考虑使用任务队列,减少锁竞争"));
            break;
    }

    return advice;
}

/**
 * Tests the wait-for graph for a cycle by starting a depth-first search from every
 * key that an earlier search has not already visited.
 *
 * @param waitGraph maps a thread id to the ids of the threads it waits for
 * @return {@code true} if a cycle is found
 */
private boolean hasCycleInWaitGraph(Map<Long, Set<Long>> waitGraph) {
    Set<Long> seen = new HashSet<>();
    Set<Long> onCurrentPath = new HashSet<>();

    for (Long start : waitGraph.keySet()) {
        boolean unexplored = !seen.contains(start);
        if (unexplored && hasCycleDFS(start, waitGraph, seen, onCurrentPath)) {
            return true;
        }
    }
    return false;
}

/**
 * Depth-first search step for cycle detection. Marks {@code threadId} as visited and as
 * part of the current path, then follows its outgoing edges. Reaching a node that is
 * still on the current path means a cycle exists.
 *
 * @param threadId       the node to expand
 * @param waitGraph      maps a thread id to the ids of the threads it waits for
 * @param visited        all nodes expanded so far (updated in place)
 * @param recursionStack nodes on the current search path (updated in place)
 * @return {@code true} if a cycle is reachable from {@code threadId}
 */
private boolean hasCycleDFS(Long threadId, Map<Long, Set<Long>> waitGraph,
        Set<Long> visited, Set<Long> recursionStack) {
    visited.add(threadId);
    recursionStack.add(threadId);

    Set<Long> successors = waitGraph.get(threadId);
    if (successors != null) {
        for (Long successor : successors) {
            boolean cycleFound = visited.contains(successor)
                    ? recursionStack.contains(successor) // back edge onto the current path
                    : hasCycleDFS(successor, waitGraph, visited, recursionStack);
            if (cycleFound) {
                return true;
            }
        }
    }

    // Leaving this node: it is no longer on the current path.
    recursionStack.remove(threadId);
    return false;
}

/**
 * Collects the distinct locks involved with the given threads: held monitors, held
 * ownable synchronizers, and the lock each thread is waiting for.
 *
 * <p>A lock that one thread holds and another waits for is returned once, so the size
 * of the list reflects the number of locks rather than the number of thread/lock
 * relations. Locks are told apart by class name plus identity hash code.
 *
 * @param threadInfos the deadlocked threads; {@code null} entries are ignored
 * @return the distinct locks in first-seen order
 */
private List<LockInfo> extractLockInfos(ThreadInfo[] threadInfos) {
    List<LockInfo> lockInfos = new ArrayList<>();
    Set<String> seenLockKeys = new HashSet<>();

    for (ThreadInfo threadInfo : threadInfos) {
        if (threadInfo == null) {
            continue;
        }

        // Held intrinsic monitors (MonitorInfo is a LockInfo subtype).
        LockInfo[] lockedMonitors = threadInfo.getLockedMonitors();
        if (lockedMonitors != null) {
            for (LockInfo lockInfo : lockedMonitors) {
                addLockIfAbsent(lockInfo, seenLockKeys, lockInfos);
            }
        }

        // Held ownable synchronizers (e.g. ReentrantLock).
        LockInfo[] lockedSynchronizers = threadInfo.getLockedSynchronizers();
        if (lockedSynchronizers != null) {
            for (LockInfo lockInfo : lockedSynchronizers) {
                addLockIfAbsent(lockInfo, seenLockKeys, lockInfos);
            }
        }

        // The lock this thread is blocked on, if any.
        addLockIfAbsent(threadInfo.getLockInfo(), seenLockKeys, lockInfos);
    }

    return lockInfos;
}

/** Appends {@code lockInfo} to {@code target} unless it is null or was already added. */
private void addLockIfAbsent(LockInfo lockInfo, Set<String> seenLockKeys, List<LockInfo> target) {
    if (lockInfo == null) {
        return;
    }
    String key = lockInfo.getClassName() + "@" + lockInfo.getIdentityHashCode();
    if (seenLockKeys.add(key)) {
        target.add(lockInfo);
    }
}

/**
 * Builds the thread/lock graph of a deadlock.
 *
 * <p>Nodes: one per thread, and one per distinct lock (keyed by identity hash code, which
 * is also used as the lock node id). Edges: {@code HOLDS} from a thread to every monitor
 * and ownable synchronizer it holds, and {@code WAITS_FOR} from a thread to the lock it
 * is blocked on. A held or awaited lock missing from {@code lockInfos} still gets a node,
 * so no edge points at a node that was never added.
 *
 * @param threadInfos the deadlocked threads; {@code null} entries are ignored
 * @param lockInfos   the locks already collected for these threads
 * @return the populated graph
 */
private DeadlockGraph buildDeadlockGraph(ThreadInfo[] threadInfos, List<LockInfo> lockInfos) {
    DeadlockGraph graph = new DeadlockGraph();
    Set<Integer> lockNodeIds = new HashSet<>();

    // Thread nodes
    for (ThreadInfo threadInfo : threadInfos) {
        if (threadInfo == null) {
            continue;
        }
        DeadlockNode node = new DeadlockNode();
        node.setType(NodeType.THREAD);
        node.setId(threadInfo.getThreadId());
        node.setName(threadInfo.getThreadName());
        node.setState(threadInfo.getThreadState());
        graph.addNode(node);
    }

    // Lock nodes (deduplicated)
    for (LockInfo lockInfo : lockInfos) {
        addLockNode(graph, lockInfo, lockNodeIds);
    }

    // Edges
    for (ThreadInfo threadInfo : threadInfos) {
        if (threadInfo == null) {
            continue;
        }

        // Locks held by the thread: monitors first, then ownable synchronizers.
        LockInfo[][] heldGroups = {
            threadInfo.getLockedMonitors(),
            threadInfo.getLockedSynchronizers()
        };
        for (LockInfo[] heldLocks : heldGroups) {
            if (heldLocks == null) {
                continue;
            }
            for (LockInfo lockInfo : heldLocks) {
                addLockNode(graph, lockInfo, lockNodeIds);
                DeadlockEdge edge = new DeadlockEdge();
                edge.setFromNodeId(threadInfo.getThreadId());
                edge.setToNodeId(lockInfo.getIdentityHashCode());
                edge.setType(EdgeType.HOLDS);
                graph.addEdge(edge);
            }
        }

        // Lock the thread is waiting for
        LockInfo waitingLock = threadInfo.getLockInfo();
        if (waitingLock != null) {
            addLockNode(graph, waitingLock, lockNodeIds);
            DeadlockEdge edge = new DeadlockEdge();
            edge.setFromNodeId(threadInfo.getThreadId());
            edge.setToNodeId(waitingLock.getIdentityHashCode());
            edge.setType(EdgeType.WAITS_FOR);
            graph.addEdge(edge);
        }
    }

    return graph;
}

/** Adds a LOCK node for {@code lockInfo} unless it is null or its id already has a node. */
private void addLockNode(DeadlockGraph graph, LockInfo lockInfo, Set<Integer> lockNodeIds) {
    if (lockInfo == null || !lockNodeIds.add(lockInfo.getIdentityHashCode())) {
        return;
    }
    DeadlockNode node = new DeadlockNode();
    node.setType(NodeType.LOCK);
    node.setId(lockInfo.getIdentityHashCode());
    node.setName(lockInfo.getClassName());
    graph.addNode(node);
}
}

2.2 应急处理器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
@Component
@Slf4j
public class DeadlockEmergencyHandler {

@Autowired
private DeadlockDiagnosticTool diagnosticTool;

@Autowired
private ThreadMXBean threadMXBean;

@Autowired
private AlertService alertService;

@Autowired
private RecoveryService recoveryService;

/**
 * Entry point for emergency handling. Runs the diagnosis and, if a deadlock is present,
 * evaluates the emergency level, executes the matching action, starts recovery and
 * sends an alert.
 *
 * @return the handling result; if any step throws, the exception message is stored via
 *         {@code setError} and the partially populated result is returned
 */
public EmergencyHandleResult handleDeadlockEmergency() {
    EmergencyHandleResult outcome = new EmergencyHandleResult();
    outcome.setHandleTime(LocalDateTime.now());

    try {
        DeadlockDiagnosticResult diagnosis = diagnosticTool.diagnoseDeadlock();
        outcome.setDiagnosticResult(diagnosis);

        // Nothing to do when no deadlock was found.
        if (!diagnosis.getDetectionResult().hasDeadlock()) {
            log.info("未检测到死锁,无需应急处理");
            return outcome;
        }

        EmergencyLevel emergencyLevel = evaluateEmergencyLevel(diagnosis);
        outcome.setEmergencyLevel(emergencyLevel);

        EmergencyAction emergencyAction = executeEmergencyAction(diagnosis, emergencyLevel);
        outcome.setAction(emergencyAction);

        RecoveryResult recovery = recoveryService.startRecovery(diagnosis);
        outcome.setRecoveryResult(recovery);

        sendEmergencyAlert(outcome);

        log.error("死锁应急处理完成: 级别={}, 动作={}", emergencyLevel, emergencyAction);
        return outcome;

    } catch (Exception failure) {
        log.error("死锁应急处理失败", failure);
        outcome.setError(failure.getMessage());
        return outcome;
    }
}

/**
 * Maps the diagnosis to an emergency level. A level applies when either the deadlock
 * severity or the business impact level reaches it; CRITICAL is tested first, then HIGH,
 * then MEDIUM, and LOW is the fallback.
 *
 * @param diagnosticResult the diagnosis providing analysis and impact
 * @return the emergency level
 */
private EmergencyLevel evaluateEmergencyLevel(DeadlockDiagnosticResult diagnosticResult) {
    final DeadlockSeverity severity = diagnosticResult.getAnalysis().getSeverity();
    final ImpactAssessment impact = diagnosticResult.getImpact();

    // The business level is read lazily, only when the severity test alone did not match.
    if (severity == DeadlockSeverity.CRITICAL
            || impact.getBusinessImpact().getLevel() == ImpactLevel.CRITICAL) {
        return EmergencyLevel.CRITICAL;
    }
    if (severity == DeadlockSeverity.HIGH
            || impact.getBusinessImpact().getLevel() == ImpactLevel.HIGH) {
        return EmergencyLevel.HIGH;
    }
    if (severity == DeadlockSeverity.MEDIUM
            || impact.getBusinessImpact().getLevel() == ImpactLevel.MEDIUM) {
        return EmergencyLevel.MEDIUM;
    }
    return EmergencyLevel.LOW;
}

/**
 * Executes the emergency action that matches the given level.
 *
 * <p>The action time and level are applied to the action object that is actually
 * returned. Setting them on a placeholder beforehand loses them, because each handler
 * returns a fresh {@code EmergencyAction} that replaces the placeholder.
 *
 * @param diagnosticResult the diagnosis to act on
 * @param level            the emergency level selecting the handler
 * @return the executed action with time and level set; if dispatch throws, an action
 *         marked unsuccessful that carries the exception message
 */
private EmergencyAction executeEmergencyAction(DeadlockDiagnosticResult diagnosticResult, EmergencyLevel level) {
    LocalDateTime startedAt = LocalDateTime.now();
    EmergencyAction action;

    try {
        switch (level) {
            case CRITICAL:
                // Critical: interrupt all deadlocked threads
                action = interruptDeadlockedThreads(diagnosticResult);
                break;
            case HIGH:
                // High: interrupt part of the deadlocked threads
                action = interruptPartialDeadlockedThreads(diagnosticResult);
                break;
            case MEDIUM:
                // Medium: log and keep monitoring
                action = logAndMonitor(diagnosticResult);
                break;
            case LOW:
                // Low: log only
                action = logOnly(diagnosticResult);
                break;
            default:
                // Unhandled level: return a bare action carrying only time and level.
                action = new EmergencyAction();
                break;
        }
    } catch (Exception e) {
        log.error("执行应急处理失败", e);
        action = new EmergencyAction();
        action.setSuccess(false);
        action.setError(e.getMessage());
    }

    action.setActionTime(startedAt);
    action.setLevel(level);
    return action;
}

/**
 * Interrupts every deadlocked thread listed in the diagnosis.
 *
 * <p>NOTE(review): per the JDK documentation, {@code Thread.interrupt()} does not release
 * a thread blocked on an intrinsic monitor or in {@code Lock.lock()}; it only helps for
 * interruptible waits such as {@code lockInterruptibly()}. Confirm this action is
 * sufficient for the locks used by the application.
 *
 * @param diagnosticResult the diagnosis listing the deadlocked threads
 * @return the action describing which threads were interrupted
 */
private EmergencyAction interruptDeadlockedThreads(DeadlockDiagnosticResult diagnosticResult) {
    return interruptThreads(diagnosticResult, EmergencyActionType.INTERRUPT_THREADS,
            false, "中断死锁线程失败");
}

/**
 * Interrupts the first half of the deadlocked threads listed in the diagnosis, and at
 * least one thread when the list is not empty.
 *
 * @param diagnosticResult the diagnosis listing the deadlocked threads
 * @return the action describing which threads were interrupted
 */
private EmergencyAction interruptPartialDeadlockedThreads(DeadlockDiagnosticResult diagnosticResult) {
    return interruptThreads(diagnosticResult, EmergencyActionType.INTERRUPT_PARTIAL_THREADS,
            true, "中断部分死锁线程失败");
}

/**
 * Shared implementation of the two interrupt actions.
 *
 * @param diagnosticResult the diagnosis listing the deadlocked threads
 * @param actionType       the action type to record on the result
 * @param halfOnly         {@code true} to interrupt only the first half of the list
 * @param failureMessage   log message used if the operation fails as a whole
 * @return the populated action
 */
private EmergencyAction interruptThreads(DeadlockDiagnosticResult diagnosticResult,
        EmergencyActionType actionType, boolean halfOnly, String failureMessage) {
    EmergencyAction action = new EmergencyAction();
    action.setActionType(actionType);

    try {
        List<ThreadInfo> threadInfos = diagnosticResult.getDetectionResult().getThreadInfos();

        // Halving a single-element list would yield zero and make the action a no-op.
        int limit = threadInfos.size();
        if (halfOnly && limit > 1) {
            limit = limit / 2;
        }

        int interruptedCount = 0;
        List<String> interruptedThreads = new ArrayList<>();

        for (int i = 0; i < limit; i++) {
            ThreadInfo threadInfo = threadInfos.get(i);
            if (threadInfo == null) {
                continue; // thread was already gone when its info was requested
            }
            try {
                Thread thread = findThreadById(threadInfo.getThreadId());
                if (thread != null) {
                    thread.interrupt();
                    interruptedCount++;
                    interruptedThreads.add(threadInfo.getThreadName());
                    log.warn("已中断死锁线程: {}", threadInfo.getThreadName());
                }
            } catch (Exception e) {
                log.error("中断线程失败: {}", threadInfo.getThreadName(), e);
            }
        }

        action.setSuccess(true);
        action.setInterruptedThreadCount(interruptedCount);
        action.setInterruptedThreads(interruptedThreads);
        action.setDescription("已中断 " + interruptedCount + " 个死锁线程");

        return action;

    } catch (Exception e) {
        log.error(failureMessage, e);
        action.setSuccess(false);
        action.setError(e.getMessage());
        return action;
    }
}

/**
 * Medium-level action: logs the deadlock details and starts periodic monitoring.
 *
 * @param diagnosticResult the diagnosis to log and monitor
 * @return the action, marked successful unless logging or monitor start-up throws
 */
private EmergencyAction logAndMonitor(DeadlockDiagnosticResult diagnosticResult) {
    EmergencyAction outcome = new EmergencyAction();
    outcome.setActionType(EmergencyActionType.LOG_AND_MONITOR);

    try {
        logDeadlockDetails(diagnosticResult);
        startDeadlockMonitoring(diagnosticResult);

        outcome.setSuccess(true);
        outcome.setDescription("已记录日志并启动监控");
    } catch (Exception failure) {
        log.error("记录日志并监控失败", failure);
        outcome.setSuccess(false);
        outcome.setError(failure.getMessage());
    }
    return outcome;
}

/**
 * Low-level action: logs the deadlock details and does nothing else.
 *
 * @param diagnosticResult the diagnosis to log
 * @return the action, marked successful unless logging throws
 */
private EmergencyAction logOnly(DeadlockDiagnosticResult diagnosticResult) {
    EmergencyAction outcome = new EmergencyAction();
    outcome.setActionType(EmergencyActionType.LOG_ONLY);

    try {
        logDeadlockDetails(diagnosticResult);

        outcome.setSuccess(true);
        outcome.setDescription("已记录死锁日志");
    } catch (Exception failure) {
        log.error("记录日志失败", failure);
        outcome.setSuccess(false);
        outcome.setError(failure.getMessage());
    }
    return outcome;
}

/**
 * Writes the deadlock details to the error log: the thread count, then for each thread
 * its name, id, state, blocked time, the lock it waits for (if any) and its stack trace,
 * one frame per log line.
 *
 * @param diagnosticResult the diagnosis whose detection result is logged
 */
private void logDeadlockDetails(DeadlockDiagnosticResult diagnosticResult) {
    final DeadlockDetectionResult detected = diagnosticResult.getDetectionResult();

    log.error("死锁详细信息:");
    log.error("死锁线程数量: {}", detected.getDeadlockedThreadCount());

    for (ThreadInfo info : detected.getThreadInfos()) {
        log.error("线程: {} (ID: {})", info.getThreadName(), info.getThreadId());
        log.error("状态: {}", info.getThreadState());
        log.error("阻塞时间: {}ms", info.getBlockedTime());

        LockInfo awaited = info.getLockInfo();
        if (awaited != null) {
            log.error("等待锁: {} (ID: {})", awaited.getClassName(), awaited.getIdentityHashCode());
        }

        log.error("堆栈跟踪:");
        for (StackTraceElement frame : info.getStackTrace()) {
            log.error(" {}", frame.toString());
        }
    }
}

/**
 * Starts a background task that re-runs the deadlock diagnosis every 5 seconds and
 * stops itself once the deadlock is gone or a 10-minute monitoring window has elapsed.
 *
 * <p>The worker is a daemon thread and the monitoring window is bounded. Without both,
 * a deadlock that never resolves would keep one executor thread alive per call and
 * would prevent a clean JVM exit.
 *
 * @param diagnosticResult the diagnosis that triggered monitoring (currently not read)
 */
private void startDeadlockMonitoring(DeadlockDiagnosticResult diagnosticResult) {
    final long checkIntervalSeconds = 5;
    final long maxMonitoringMinutes = 10;

    ScheduledExecutorService monitorExecutor = Executors.newScheduledThreadPool(1, runnable -> {
        Thread worker = new Thread(runnable, "deadlock-monitor");
        worker.setDaemon(true);
        return worker;
    });

    final long deadlineNanos = System.nanoTime() + TimeUnit.MINUTES.toNanos(maxMonitoringMinutes);

    monitorExecutor.scheduleAtFixedRate(() -> {
        try {
            DeadlockDiagnosticResult newResult = diagnosticTool.diagnoseDeadlock();
            if (!newResult.getDetectionResult().hasDeadlock()) {
                log.info("死锁已解除,停止监控");
                monitorExecutor.shutdown();
            } else if (System.nanoTime() - deadlineNanos >= 0) {
                // Overflow-safe deadline comparison for nanoTime values.
                log.warn("死锁监控超时,停止监控");
                monitorExecutor.shutdown();
            }
        } catch (Exception e) {
            // Swallowing here keeps the schedule alive; an escaping exception would
            // silently cancel all further runs.
            log.error("死锁监控异常", e);
        }
    }, 0, checkIntervalSeconds, TimeUnit.SECONDS);
}

/**
 * Builds an emergency alert from the handling result (level, action, recovery result)
 * and hands it to the alert service. Failures are logged and not propagated.
 *
 * @param result the handling result to report
 */
private void sendEmergencyAlert(EmergencyHandleResult result) {
    try {
        EmergencyAlert emergencyAlert = new EmergencyAlert();
        emergencyAlert.setAlertTime(LocalDateTime.now());
        emergencyAlert.setEmergencyLevel(result.getEmergencyLevel());
        emergencyAlert.setAction(result.getAction());
        emergencyAlert.setRecoveryResult(result.getRecoveryResult());

        alertService.sendEmergencyAlert(emergencyAlert);
    } catch (Exception failure) {
        log.error("发送紧急告警失败", failure);
    }
}

/**
 * Looks up a live thread by id, searching from the root thread group.
 *
 * <p>{@code ThreadGroup.activeCount()} is only an estimate and {@code enumerate} silently
 * drops threads that do not fit into the array. The array is therefore over-allocated
 * and grown until {@code enumerate} fills less than its full length, which guarantees
 * nothing was truncated.
 *
 * @param threadId the id of the thread to find
 * @return the thread, or {@code null} if no live thread with that id was enumerated
 */
private Thread findThreadById(long threadId) {
    ThreadGroup rootGroup = Thread.currentThread().getThreadGroup();
    while (rootGroup.getParent() != null) {
        rootGroup = rootGroup.getParent();
    }

    Thread[] threads = new Thread[Math.max(16, rootGroup.activeCount() * 2)];
    int count = rootGroup.enumerate(threads);
    while (count >= threads.length) {
        // Array was filled completely, so some threads may have been left out; retry bigger.
        threads = new Thread[threads.length * 2];
        count = rootGroup.enumerate(threads);
    }

    for (int i = 0; i < count; i++) {
        if (threads[i].getId() == threadId) {
            return threads[i];
        }
    }

    return null;
}
}

三、故障恢复服务

3.1 故障恢复器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
@Service
@Slf4j
public class DeadlockRecoveryService {

@Autowired
private ApplicationContext applicationContext;

@Autowired
private DataSource dataSource;

@Autowired
private RedisTemplate<String, Object> redisTemplate;

@Autowired
private RecoveryMonitor recoveryMonitor;

/**
 * Runs the recovery workflow for a diagnosed deadlock: choose a strategy, execute it,
 * verify the outcome, record it and send a notification. Start and end times are
 * stamped on the result.
 *
 * @param diagnosticResult the diagnosis driving the recovery
 * @return the recovery result; if any step throws, the result is marked unsuccessful
 *         and carries the exception message
 */
public RecoveryResult startRecovery(DeadlockDiagnosticResult diagnosticResult) {
    RecoveryResult outcome = new RecoveryResult();
    outcome.setRecoveryStartTime(LocalDateTime.now());

    try {
        RecoveryStrategy chosenStrategy = evaluateRecoveryStrategy(diagnosticResult);
        outcome.setStrategy(chosenStrategy);

        RecoveryAction performed = executeRecoveryAction(chosenStrategy, diagnosticResult);
        outcome.setAction(performed);

        boolean recovered = verifyRecovery(performed);
        outcome.setSuccess(recovered);

        recordRecoveryInfo(outcome);
        sendRecoveryNotification(outcome);

        outcome.setRecoveryEndTime(LocalDateTime.now());
        log.info("故障恢复完成: 策略={}, 成功={}", chosenStrategy, recovered);
        return outcome;

    } catch (Exception failure) {
        log.error("故障恢复失败", failure);
        outcome.setSuccess(false);
        outcome.setError(failure.getMessage());
        outcome.setRecoveryEndTime(LocalDateTime.now());
        return outcome;
    }
}

/**
 * Chooses the recovery strategy. A tier applies when either the deadlock severity or the
 * business impact level reaches it: CRITICAL gives a full restart, HIGH a partial
 * restart, MEDIUM service recovery, and anything else monitoring only.
 *
 * @param diagnosticResult the diagnosis providing analysis and impact
 * @return the selected strategy
 */
private RecoveryStrategy evaluateRecoveryStrategy(DeadlockDiagnosticResult diagnosticResult) {
    final DeadlockSeverity severity = diagnosticResult.getAnalysis().getSeverity();
    final ImpactAssessment impact = diagnosticResult.getImpact();

    // The business level is read lazily, only when the severity test alone did not match.
    if (severity == DeadlockSeverity.CRITICAL
            || impact.getBusinessImpact().getLevel() == ImpactLevel.CRITICAL) {
        return RecoveryStrategy.FULL_RESTART;
    }
    if (severity == DeadlockSeverity.HIGH
            || impact.getBusinessImpact().getLevel() == ImpactLevel.HIGH) {
        return RecoveryStrategy.PARTIAL_RESTART;
    }
    if (severity == DeadlockSeverity.MEDIUM
            || impact.getBusinessImpact().getLevel() == ImpactLevel.MEDIUM) {
        return RecoveryStrategy.SERVICE_RECOVERY;
    }
    return RecoveryStrategy.MONITOR_ONLY;
}

/**
* 执行恢复操作
*/
private RecoveryAction executeRecoveryAction(RecoveryStrategy strategy, DeadlockDiagnosticResult diagnosticResult) {
RecoveryAction action = new RecoveryAction();
action.setActionTime(LocalDateTime.now());
action.setStrategy(strategy);

try {
switch (strategy) {
case FULL_RESTART:
action = executeFullRestart(diagnosticResult);
break;
case PARTIAL_RESTART:
action = executePartialRestart(diagnosticResult);
break;
case SERVICE_RECOVERY:
action = executeServiceRecovery(diagnosticResult);
break;
case MONITOR_ONLY:
action = executeMonitorOnly(diagnosticResult);
break;
}

return action;

} catch (Exception e) {
log.error("执行恢复操作失败", e);
action.setSuccess(false);
action.setError(e.getMessage());
return action;
}
}

/**
* 执行完全重启
*/
private RecoveryAction executeFullRestart(DeadlockDiagnosticResult diagnosticResult) {
RecoveryAction action = new RecoveryAction();
action.setActionType(RecoveryActionType.FULL_RESTART);

try {
// 1. 停止服务
stopServices();

// 2. 清理资源
cleanupResources();

// 3. 重启服务
restartServices();

// 4. 恢复数据
recoverData();

action.setSuccess(true);
action.setDescription("完全重启完成");

return action;

} catch (Exception e) {
log.error("完全重启失败", e);
action.setSuccess(false);
action.setError(e.getMessage());
return action;
}
}

/**
* 执行部分重启
*/
private RecoveryAction executePartialRestart(DeadlockDiagnosticResult diagnosticResult) {
RecoveryAction action = new RecoveryAction();
action.setActionType(RecoveryActionType.PARTIAL_RESTART);

try {
// 1. 停止相关服务
stopRelatedServices(diagnosticResult);

// 2. 清理相关资源
cleanupRelatedResources(diagnosticResult);

// 3. 重启相关服务
restartRelatedServices(diagnosticResult);

action.setSuccess(true);
action.setDescription("部分重启完成");

return action;

} catch (Exception e) {
log.error("部分重启失败", e);
action.setSuccess(false);
action.setError(e.getMessage());
return action;
}
}

/**
* 执行服务恢复
*/
private RecoveryAction executeServiceRecovery(DeadlockDiagnosticResult diagnosticResult) {
RecoveryAction action = new RecoveryAction();
action.setActionType(RecoveryActionType.SERVICE_RECOVERY);

try {
// 1. 恢复服务状态
recoverServiceStatus(diagnosticResult);

// 2. 清理死锁状态
cleanupDeadlockState(diagnosticResult);

// 3. 重新初始化服务
reinitializeServices(diagnosticResult);

action.setSuccess(true);
action.setDescription("服务恢复完成");

return action;

} catch (Exception e) {
log.error("服务恢复失败", e);
action.setSuccess(false);
action.setError(e.getMessage());
return action;
}
}

/**
* 执行仅监控
*/
private RecoveryAction executeMonitorOnly(DeadlockDiagnosticResult diagnosticResult) {
RecoveryAction action = new RecoveryAction();
action.setActionType(RecoveryActionType.MONITOR_ONLY);

try {
// 1. 启动监控
startMonitoring(diagnosticResult);

// 2. 记录状态
recordStatus(diagnosticResult);

action.setSuccess(true);
action.setDescription("监控启动完成");

return action;

} catch (Exception e) {
log.error("监控启动失败", e);
action.setSuccess(false);
action.setError(e.getMessage());
return action;
}
}

/**
* 停止服务
*/
private void stopServices() {
try {
// 停止Spring Boot应用
ConfigurableApplicationContext context = (ConfigurableApplicationContext) applicationContext;
context.close();

log.info("服务已停止");

} catch (Exception e) {
log.error("停止服务失败", e);
throw new RuntimeException("停止服务失败", e);
}
}

/**
* 清理资源
*/
private void cleanupResources() {
try {
// 1. 关闭数据库连接
if (dataSource instanceof HikariDataSource) {
((HikariDataSource) dataSource).close();
}

// 2. 关闭Redis连接
if (redisTemplate != null) {
redisTemplate.getConnectionFactory().getConnection().close();
}

// 3. 清理其他资源
cleanupOtherResources();

log.info("资源清理完成");

} catch (Exception e) {
log.error("清理资源失败", e);
throw new RuntimeException("清理资源失败", e);
}
}

/**
* 重启服务
*/
private void restartServices() {
try {
// 重启Spring Boot应用
ConfigurableApplicationContext context = (ConfigurableApplicationContext) applicationContext;
context.refresh();

log.info("服务已重启");

} catch (Exception e) {
log.error("重启服务失败", e);
throw new RuntimeException("重启服务失败", e);
}
}

/**
* 恢复数据
*/
private void recoverData() {
try {
// 1. 恢复数据库数据
recoverDatabaseData();

// 2. 恢复缓存数据
recoverCacheData();

// 3. 恢复其他数据
recoverOtherData();

log.info("数据恢复完成");

} catch (Exception e) {
log.error("数据恢复失败", e);
throw new RuntimeException("数据恢复失败", e);
}
}

/**
* 停止相关服务
*/
private void stopRelatedServices(DeadlockDiagnosticResult diagnosticResult) {
try {
DeadlockScope scope = diagnosticResult.getAnalysis().getScope();

switch (scope) {
case WEB_SERVICE:
stopWebServices();
break;
case MESSAGE_QUEUE:
stopMessageQueueServices();
break;
case BACKGROUND_TASK:
stopBackgroundTaskServices();
break;
default:
stopDefaultServices();
break;
}

log.info("相关服务已停止");

} catch (Exception e) {
log.error("停止相关服务失败", e);
throw new RuntimeException("停止相关服务失败", e);
}
}

/**
* 清理相关资源
*/
private void cleanupRelatedResources(DeadlockDiagnosticResult diagnosticResult) {
try {
DeadlockScope scope = diagnosticResult.getAnalysis().getScope();

switch (scope) {
case WEB_SERVICE:
cleanupWebResources();
break;
case MESSAGE_QUEUE:
cleanupMessageQueueResources();
break;
case BACKGROUND_TASK:
cleanupBackgroundTaskResources();
break;
default:
cleanupDefaultResources();
break;
}

log.info("相关资源已清理");

} catch (Exception e) {
log.error("清理相关资源失败", e);
throw new RuntimeException("清理相关资源失败", e);
}
}

/**
* 重启相关服务
*/
private void restartRelatedServices(DeadlockDiagnosticResult diagnosticResult) {
try {
DeadlockScope scope = diagnosticResult.getAnalysis().getScope();

switch (scope) {
case WEB_SERVICE:
restartWebServices();
break;
case MESSAGE_QUEUE:
restartMessageQueueServices();
break;
case BACKGROUND_TASK:
restartBackgroundTaskServices();
break;
default:
restartDefaultServices();
break;
}

log.info("相关服务已重启");

} catch (Exception e) {
log.error("重启相关服务失败", e);
throw new RuntimeException("重启相关服务失败", e);
}
}

/**
* 恢复服务状态
*/
private void recoverServiceStatus(DeadlockDiagnosticResult diagnosticResult) {
try {
// 1. 恢复服务状态
recoverServiceState();

// 2. 恢复连接状态
recoverConnectionState();

// 3. 恢复缓存状态
recoverCacheState();

log.info("服务状态已恢复");

} catch (Exception e) {
log.error("恢复服务状态失败", e);
throw new RuntimeException("恢复服务状态失败", e);
}
}

/**
* 清理死锁状态
*/
private void cleanupDeadlockState(DeadlockDiagnosticResult diagnosticResult) {
try {
// 1. 清理死锁线程状态
cleanupDeadlockThreadState(diagnosticResult);

// 2. 清理锁状态
cleanupLockState(diagnosticResult);

// 3. 清理资源状态
cleanupResourceState(diagnosticResult);

log.info("死锁状态已清理");

} catch (Exception e) {
log.error("清理死锁状态失败", e);
throw new RuntimeException("清理死锁状态失败", e);
}
}

/**
* 重新初始化服务
*/
private void reinitializeServices(DeadlockDiagnosticResult diagnosticResult) {
try {
// 1. 重新初始化服务
reinitializeService();

// 2. 重新初始化连接
reinitializeConnections();

// 3. 重新初始化缓存
reinitializeCache();

log.info("服务已重新初始化");

} catch (Exception e) {
log.error("重新初始化服务失败", e);
throw new RuntimeException("重新初始化服务失败", e);
}
}

/**
* 启动监控
*/
private void startMonitoring(DeadlockDiagnosticResult diagnosticResult) {
try {
// 启动死锁监控
recoveryMonitor.startMonitoring(diagnosticResult);

log.info("监控已启动");

} catch (Exception e) {
log.error("启动监控失败", e);
throw new RuntimeException("启动监控失败", e);
}
}

/**
* 记录状态
*/
private void recordStatus(DeadlockDiagnosticResult diagnosticResult) {
try {
// 记录死锁状态
recoveryMonitor.recordDeadlockStatus(diagnosticResult);

log.info("状态已记录");

} catch (Exception e) {
log.error("记录状态失败", e);
throw new RuntimeException("记录状态失败", e);
}
}

/**
* 验证恢复结果
*/
private boolean verifyRecovery(RecoveryAction action) {
try {
// 1. 验证服务状态
boolean serviceStatus = verifyServiceStatus();

// 2. 验证数据状态
boolean dataStatus = verifyDataStatus();

// 3. 验证连接状态
boolean connectionStatus = verifyConnectionStatus();

return serviceStatus && dataStatus && connectionStatus;

} catch (Exception e) {
log.error("验证恢复结果失败", e);
return false;
}
}

/**
* 验证服务状态
*/
private boolean verifyServiceStatus() {
try {
// 检查服务是否正常运行
return applicationContext.isActive();
} catch (Exception e) {
log.error("验证服务状态失败", e);
return false;
}
}

/**
* 验证数据状态
*/
private boolean verifyDataStatus() {
try {
// 检查数据库连接
if (dataSource != null) {
Connection connection = dataSource.getConnection();
boolean isValid = connection.isValid(5);
connection.close();
return isValid;
}
return true;
} catch (Exception e) {
log.error("验证数据状态失败", e);
return false;
}
}

/**
* 验证连接状态
*/
private boolean verifyConnectionStatus() {
try {
// 检查Redis连接
if (redisTemplate != null) {
redisTemplate.opsForValue().get("test");
return true;
}
return true;
} catch (Exception e) {
log.error("验证连接状态失败", e);
return false;
}
}

/**
* 记录恢复信息
*/
private void recordRecoveryInfo(RecoveryResult result) {
try {
// 记录恢复信息到数据库
recoveryMonitor.recordRecoveryInfo(result);

log.info("恢复信息已记录");

} catch (Exception e) {
log.error("记录恢复信息失败", e);
}
}

/**
* 发送恢复通知
*/
private void sendRecoveryNotification(RecoveryResult result) {
try {
// 发送恢复通知
RecoveryNotification notification = new RecoveryNotification();
notification.setRecoveryTime(LocalDateTime.now());
notification.setSuccess(result.isSuccess());
notification.setStrategy(result.getStrategy());
notification.setAction(result.getAction());

alertService.sendRecoveryNotification(notification);

} catch (Exception e) {
log.error("发送恢复通知失败", e);
}
}

// 其他辅助方法的实现...
private void cleanupOtherResources() {
// 清理其他资源的实现
}

private void recoverDatabaseData() {
// 恢复数据库数据的实现
}

private void recoverCacheData() {
// 恢复缓存数据的实现
}

private void recoverOtherData() {
// 恢复其他数据的实现
}

private void stopWebServices() {
// 停止Web服务的实现
}

private void stopMessageQueueServices() {
// 停止消息队列服务的实现
}

private void stopBackgroundTaskServices() {
// 停止后台任务服务的实现
}

private void stopDefaultServices() {
// 停止默认服务的实现
}

private void cleanupWebResources() {
// 清理Web资源的实现
}

private void cleanupMessageQueueResources() {
// 清理消息队列资源的实现
}

private void cleanupBackgroundTaskResources() {
// 清理后台任务资源的实现
}

private void cleanupDefaultResources() {
// 清理默认资源的实现
}

private void restartWebServices() {
// 重启Web服务的实现
}

private void restartMessageQueueServices() {
// 重启消息队列服务的实现
}

private void restartBackgroundTaskServices() {
// 重启后台任务服务的实现
}

private void restartDefaultServices() {
// 重启默认服务的实现
}

private void recoverServiceState() {
// 恢复服务状态的实现
}

private void recoverConnectionState() {
// 恢复连接状态的实现
}

private void recoverCacheState() {
// 恢复缓存状态的实现
}

private void cleanupDeadlockThreadState(DeadlockDiagnosticResult diagnosticResult) {
// 清理死锁线程状态的实现
}

private void cleanupLockState(DeadlockDiagnosticResult diagnosticResult) {
// 清理锁状态的实现
}

private void cleanupResourceState(DeadlockDiagnosticResult diagnosticResult) {
// 清理资源状态的实现
}

private void reinitializeService() {
// 重新初始化服务的实现
}

private void reinitializeConnections() {
// 重新初始化连接的实现
}

private void reinitializeCache() {
// 重新初始化缓存的实现
}
}

四、运维手册与最佳实践

4.1 死锁处理运维手册

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/**
 * Operations manual for deadlock handling, emitted as INFO log lines.
 *
 * <p>Each public method prints one checklist: a banner line followed by
 * numbered sections, each with its bullet items.
 */
@Component
@Slf4j
public class DeadlockOperationsManual {

    /**
     * Logs one checklist section: the heading first, then every bullet in order.
     *
     * @param heading the numbered section heading
     * @param bullets the bullet lines belonging to that section
     */
    private void logSection(String heading, String... bullets) {
        log.info(heading);
        for (String bullet : bullets) {
            log.info(bullet);
        }
    }

    /**
     * Logs the end-to-end deadlock handling process: detection, diagnosis,
     * emergency handling, recovery and prevention.
     */
    public void handleDeadlockProcess() {
        log.info("=== 死锁处理流程 ===");

        // 1. Detection
        logSection("1. 发现死锁",
                " - 监控告警",
                " - 用户反馈",
                " - 系统异常",
                " - 性能下降");

        // 2. Diagnosis and analysis
        logSection("2. 诊断分析",
                " - 死锁检测",
                " - 死锁分析",
                " - 根因分析",
                " - 影响评估");

        // 3. Emergency handling
        logSection("3. 应急处理",
                " - 服务降级",
                " - 流量切换",
                " - 线程中断",
                " - 资源释放");

        // 4. Fault recovery
        logSection("4. 故障恢复",
                " - 服务重启",
                " - 数据恢复",
                " - 状态同步",
                " - 业务恢复");

        // 5. Prevention
        logSection("5. 预防措施",
                " - 代码审查",
                " - 压力测试",
                " - 监控完善",
                " - 流程规范");
    }

    /**
     * Logs the deadlock diagnosis checklist: check, analyze, locate, plan.
     */
    public void deadlockDiagnosisSteps() {
        log.info("=== 死锁诊断步骤 ===");

        // 1. Check for a deadlock
        logSection("1. 检查死锁",
                " - 使用jstack命令",
                " - 使用jconsole工具",
                " - 使用VisualVM工具",
                " - 使用自定义检测工具");

        // 2. Analyze the deadlock
        logSection("2. 分析死锁",
                " - 分析死锁线程",
                " - 分析死锁锁",
                " - 分析死锁原因",
                " - 分析死锁影响");

        // 3. Locate the problem
        logSection("3. 定位问题",
                " - 定位死锁代码",
                " - 定位死锁逻辑",
                " - 定位死锁场景",
                " - 定位死锁触发条件");

        // 4. Draw up plans
        logSection("4. 制定方案",
                " - 制定应急方案",
                " - 制定恢复方案",
                " - 制定预防方案",
                " - 制定优化方案");
    }

    /**
     * Logs the emergency handling checklist: respond, act, stabilize, record.
     */
    public void emergencyHandlingSteps() {
        log.info("=== 应急处理步骤 ===");

        // 1. Respond immediately
        logSection("1. 立即响应",
                " - 确认死锁发生",
                " - 评估影响范围",
                " - 启动应急流程",
                " - 通知相关人员");

        // 2. Act quickly
        logSection("2. 快速处理",
                " - 中断死锁线程",
                " - 释放死锁资源",
                " - 恢复服务响应",
                " - 监控系统状态");

        // 3. Stabilize the system
        logSection("3. 稳定系统",
                " - 检查系统状态",
                " - 验证服务功能",
                " - 确认数据一致性",
                " - 监控性能指标");

        // 4. Record what happened
        logSection("4. 记录信息",
                " - 记录死锁信息",
                " - 记录处理过程",
                " - 记录恢复结果",
                " - 记录经验教训");
    }

    /**
     * Logs the fault recovery checklist: evaluate, execute, verify, monitor.
     */
    public void faultRecoverySteps() {
        log.info("=== 故障恢复步骤 ===");

        // 1. Evaluate the recovery strategy
        logSection("1. 评估恢复策略",
                " - 评估死锁严重程度",
                " - 评估业务影响范围",
                " - 评估恢复时间要求",
                " - 评估恢复资源需求");

        // 2. Execute the recovery
        logSection("2. 执行恢复操作",
                " - 停止相关服务",
                " - 清理相关资源",
                " - 重启相关服务",
                " - 恢复相关数据");

        // 3. Verify the result
        logSection("3. 验证恢复结果",
                " - 验证服务状态",
                " - 验证数据状态",
                " - 验证连接状态",
                " - 验证功能状态");

        // 4. Monitor the recovery
        logSection("4. 监控恢复过程",
                " - 监控服务状态",
                " - 监控性能指标",
                " - 监控错误日志",
                " - 监控用户反馈");
    }

    /**
     * Logs the prevention checklist at code, architecture, operations and
     * management level.
     */
    public void preventionMeasures() {
        log.info("=== 预防措施 ===");

        // 1. Code level
        logSection("1. 代码层面",
                " - 统一锁的获取顺序",
                " - 使用锁超时机制",
                " - 减少锁的持有时间",
                " - 使用无锁数据结构");

        // 2. Architecture level
        logSection("2. 架构层面",
                " - 设计合理的锁粒度",
                " - 避免嵌套锁的使用",
                " - 使用读写锁分离",
                " - 使用分段锁技术");

        // 3. Operations level
        logSection("3. 运维层面",
                " - 完善监控告警",
                " - 定期压力测试",
                " - 建立故障处理流程",
                " - 制定应急预案");

        // 4. Management level
        logSection("4. 管理层面",
                " - 建立代码审查制度",
                " - 建立培训体系",
                " - 建立知识库",
                " - 建立经验分享机制");
    }
}

4.2 最佳实践总结

4.2.1 死锁预防最佳实践

  1. 锁顺序统一:确保所有线程以相同的顺序获取锁
  2. 锁超时机制:使用锁超时,避免无限等待
  3. 锁粒度优化:合理控制锁的粒度,减少锁竞争
  4. 无锁设计:在合适的场景使用无锁数据结构,减少对锁的依赖

4.2.2 死锁检测最佳实践

  1. 实时监控:实时监控线程状态和锁竞争
  2. 定期检测:定期检测死锁,及时发现问题
  3. 自动告警:检测到死锁后自动触发告警,以便快速响应
  4. 详细分析:详细分析死锁原因和影响

4.2.3 应急处理最佳实践

  1. 快速响应:快速响应死锁,减少影响
  2. 分级处理:根据严重程度分级处理
  3. 最小影响:最小化对业务的影响
  4. 完整记录:完整记录处理过程

4.2.4 故障恢复最佳实践

  1. 策略评估:评估恢复策略,选择合适方案
  2. 分步执行:分步执行恢复操作,确保安全
  3. 结果验证:验证恢复结果,确保正确
  4. 持续监控:持续监控恢复过程,及时调整

五、总结

线程死锁是生产环境中的严重故障,需要从诊断、应急处理、故障恢复等多个维度进行综合治理。通过完善的死锁诊断工具、快速的应急处理机制、可靠的故障恢复服务和完善的运维手册,可以构建一个完整的死锁处理体系。

关键要点:

  1. 快速诊断:快速诊断死锁,分析原因和影响
  2. 应急处理:快速应急处理,减少业务影响
  3. 故障恢复:可靠故障恢复,确保系统稳定
  4. 预防措施:完善预防措施,避免死锁发生
  5. 运维手册:建立运维手册,规范处理流程

通过本文的实践指导,读者可以构建一个完善的死锁处理体系,为生产环境的稳定运行提供强有力的技术支撑。