1. Docker Swarm集群概述

Docker Swarm是Docker官方的集群管理和编排工具,提供了高可用、负载均衡、服务发现、网络管理等企业级功能。本文将详细介绍Docker Swarm集群管理、服务发现、网络管理、故障恢复和高级运维功能的完整解决方案。

1.1 核心功能

  1. 集群管理: Docker Swarm集群创建和管理
  2. 服务发现: 自动服务发现和负载均衡
  3. 网络管理: 覆盖网络和网络隔离
  4. 故障恢复: 自动故障检测和恢复
  5. 高级运维: 滚动更新、配置管理、监控告警

1.2 技术架构

Manager节点 → Worker节点 → 服务实例
    ↓            ↓            ↓
集群管理    → 任务调度    → 容器运行
    ↓            ↓            ↓
服务发现    → 负载均衡    → 网络通信

2. Docker Swarm配置

2.1 Docker Swarm配置类

示例代码(Java):
/**
 * Spring configuration for the Docker Swarm integration.
 *
 * <p>Reads the {@code docker.swarm.*} connection settings and exposes the properties
 * holder, the Docker client and the two Swarm service objects as beans.
 */
@Configuration
public class DockerSwarmConfig {

    /** Value of the {@code docker.swarm.manager-host} property. */
    @Value("${docker.swarm.manager-host}")
    private String managerHost;

    /** Value of the {@code docker.swarm.manager-port} property. */
    @Value("${docker.swarm.manager-port}")
    private int managerPort;

    /** Value of the {@code docker.swarm.network-name} property. */
    @Value("${docker.swarm.network-name}")
    private String networkName;

    /** Value of the {@code docker.swarm.service-prefix} property. */
    @Value("${docker.swarm.service-prefix}")
    private String servicePrefix;

    /**
     * Builds the properties holder from the four injected connection settings.
     *
     * @return properties populated with manager host/port, network name and service prefix
     */
    @Bean
    public DockerSwarmProperties dockerSwarmProperties() {
        return DockerSwarmProperties.builder()
                .servicePrefix(servicePrefix)
                .networkName(networkName)
                .managerPort(managerPort)
                .managerHost(managerHost)
                .build();
    }

    /**
     * Creates the Docker client that targets {@code tcp://<managerHost>:<managerPort>}.
     *
     * @return the Docker client used by the Swarm services
     */
    @Bean
    public DockerClient dockerSwarmClient() {
        String dockerHost = new StringBuilder("tcp://")
                .append(managerHost)
                .append(':')
                .append(managerPort)
                .toString();
        return DockerClientBuilder.getInstance()
                .withDockerHost(dockerHost)
                .build();
    }

    /**
     * Creates the Swarm service facade from the client and properties beans.
     *
     * @return the Swarm service
     */
    @Bean
    public DockerSwarmService dockerSwarmService() {
        DockerClient client = dockerSwarmClient();
        DockerSwarmProperties swarmProperties = dockerSwarmProperties();
        return new DockerSwarmService(client, swarmProperties);
    }

    /**
     * Creates the Swarm network service from the client and properties beans.
     *
     * @return the Swarm network service
     */
    @Bean
    public DockerSwarmNetworkService dockerSwarmNetworkService() {
        DockerClient client = dockerSwarmClient();
        DockerSwarmProperties swarmProperties = dockerSwarmProperties();
        return new DockerSwarmNetworkService(client, swarmProperties);
    }
}

/**
* Docker Swarm配置属性
*/
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DockerSwarmProperties {
private String managerHost;
private int managerPort;
private String networkName;
private String servicePrefix;

// 集群配置
private int maxReplicas = 10;
private int minReplicas = 1;
private int updateDelay = 10;
private int updateFailureAction = 0; // 0=continue, 1=pause

// 网络配置
private String networkDriver = "overlay";
private boolean networkAttachable = true;
private boolean networkEncrypted = true;

// 服务配置
private int serviceTimeout = 300;
private int healthCheckInterval = 30;
private int healthCheckTimeout = 10;
private int healthCheckRetries = 3;
}

2.2 应用配置

配置示例(application.yml):
# application.yml
# All Swarm settings live under a single docker.swarm tree; a YAML document must not
# repeat the same top-level key.
docker:
  swarm:
    # Connection settings (injected into DockerSwarmConfig through @Value)
    manager-host: localhost
    # NOTE(review): 2377 is the Swarm cluster-management port. The Docker Engine API that a
    # Java client connects to usually listens on 2375 (plain) or 2376 (TLS) - confirm.
    manager-port: 2377
    network-name: myapp-network
    service-prefix: myapp

    # NOTE(review): the sections below are not read by any @Value in DockerSwarmConfig;
    # binding them presumably needs @ConfigurationProperties or extra @Value fields - confirm.

    # Cluster settings
    cluster:
      max-replicas: 10
      min-replicas: 1
      update-delay: 10
      update-failure-action: 0

    # Network settings
    network:
      driver: overlay
      attachable: true
      encrypted: true

    # Service settings
    service:
      timeout: 300
      health-check-interval: 30
      health-check-timeout: 10
      health-check-retries: 3

3. Docker Swarm服务

3.1 Docker Swarm服务

示例代码(Java):
/**
 * Facade over the Docker client for Swarm cluster and service operations: cluster
 * init/join plus service create, update, scale, remove and query.
 *
 * <p>Deliberately not annotated with {@code @Service}: {@code DockerSwarmConfig} already
 * registers this class through its {@code dockerSwarmService()} {@code @Bean} method, and
 * a second, component-scanned definition under the same bean name would clash with it.
 *
 * <p>NOTE(review): {@code log} is not declared in this class; it is presumably meant to be
 * supplied by Lombok's {@code @Slf4j} - confirm.
 */
public class DockerSwarmService {

    /** Address the local node listens on for Swarm cluster traffic. */
    private static final String LISTEN_ADDR = "0.0.0.0:2377";

    /** Port suffix appended to a host when building a Swarm cluster address. */
    private static final String SWARM_PORT_SUFFIX = ":2377";

    private final DockerClient dockerClient;
    private final DockerSwarmProperties properties;

    /**
     * @param dockerClient client used for every Docker API call
     * @param properties   Swarm settings (manager host, network name, ...)
     */
    public DockerSwarmService(DockerClient dockerClient, DockerSwarmProperties properties) {
        this.dockerClient = dockerClient;
        this.properties = properties;
    }

    /**
     * Initializes a Swarm cluster on the connected Docker host.
     *
     * @return a success result carrying the value returned by the init command, or an
     *         error result when the node is already part of a Swarm or the call fails
     */
    public SwarmInitResult initSwarm() {
        try {
            if (isSwarmInitialized()) {
                return SwarmInitResult.error("Swarm集群已初始化");
            }

            InitSwarmCmd initCmd = dockerClient.initSwarmCmd()
                    .withListenAddr(LISTEN_ADDR)
                    .withAdvertiseAddr(properties.getManagerHost() + SWARM_PORT_SUFFIX);

            String swarmId = initCmd.exec();

            log.info("Swarm集群初始化成功: swarmId={}", swarmId);

            return SwarmInitResult.success(swarmId);

        } catch (Exception e) {
            log.error("Swarm集群初始化失败", e);
            return SwarmInitResult.error("集群初始化失败: " + e.getMessage());
        }
    }

    /**
     * Joins the connected Docker host to an existing Swarm cluster.
     *
     * <p>The manager address is passed as the remote address to contact. It is no longer
     * used as this node's advertise address: that field describes the joining node itself,
     * and without a remote address the daemon has no manager to join.
     *
     * @param joinToken      join token issued by the cluster
     * @param managerAddress host of an existing manager; port 2377 is appended
     * @return success, or an error result when already joined or the call fails
     */
    public SwarmJoinResult joinSwarm(String joinToken, String managerAddress) {
        try {
            if (isSwarmInitialized()) {
                return SwarmJoinResult.error("节点已加入Swarm集群");
            }

            JoinSwarmCmd joinCmd = dockerClient.joinSwarmCmd()
                    .withListenAddr(LISTEN_ADDR)
                    .withRemoteAddrs(Collections.singletonList(managerAddress + SWARM_PORT_SUFFIX))
                    .withJoinToken(joinToken);

            joinCmd.exec();

            log.info("节点加入Swarm集群成功: managerAddress={}", managerAddress);

            return SwarmJoinResult.success();

        } catch (Exception e) {
            log.error("节点加入Swarm集群失败: managerAddress={}", managerAddress, e);
            return SwarmJoinResult.error("加入集群失败: " + e.getMessage());
        }
    }

    /**
     * Creates a replicated Swarm service attached to the configured network.
     *
     * @param serviceRequest name, image, replica count, environment and labels
     * @return a description of the created service with status {@code "CREATED"}
     * @throws BusinessException when the Docker call fails
     */
    public SwarmService createService(SwarmServiceRequest serviceRequest) {
        try {
            // 1. Build the service spec.
            ServiceSpec serviceSpec = ServiceSpec.builder()
                    .name(serviceRequest.getServiceName())
                    .taskTemplate(TaskSpec.builder()
                            .containerSpec(ContainerSpec.builder()
                                    .image(serviceRequest.getImageName())
                                    .env(serviceRequest.getEnvironment())
                                    .labels(serviceRequest.getLabels())
                                    .build())
                            .restartPolicy(RestartPolicy.builder()
                                    .condition(RestartPolicyCondition.ON_FAILURE)
                                    .delay(10L)
                                    .maxAttempts(3L)
                                    .build())
                            .build())
                    .mode(replicatedMode(serviceRequest.getReplicas()))
                    .updateConfig(defaultUpdateConfig())
                    .networks(Collections.singletonList(
                            NetworkAttachmentConfig.builder()
                                    .target(properties.getNetworkName())
                                    .build()))
                    .build();

            // 2. Create the service.
            CreateServiceResponse response = dockerClient.createServiceCmd(serviceSpec).exec();
            String serviceId = response.getId();

            log.info("Swarm服务创建成功: serviceName={}, serviceId={}",
                    serviceRequest.getServiceName(), serviceId);

            return SwarmService.builder()
                    .serviceId(serviceId)
                    .serviceName(serviceRequest.getServiceName())
                    .imageName(serviceRequest.getImageName())
                    .replicas(serviceRequest.getReplicas())
                    .status("CREATED")
                    .createTime(System.currentTimeMillis())
                    .build();

        } catch (Exception e) {
            log.error("Swarm服务创建失败: serviceName={}",
                    serviceRequest.getServiceName(), e);
            throw new BusinessException("服务创建失败", e);
        }
    }

    /**
     * Replaces image, environment, labels and replica count of an existing service while
     * keeping its name, restart policy and networks.
     *
     * <p>When the request carries no image, the current image is kept instead of writing a
     * {@code null} image into the new spec.
     *
     * @param serviceId     id of the service to update
     * @param updateRequest new image, environment, labels and replica count
     * @return success, or an error result carrying the failure message
     */
    public SwarmUpdateResult updateService(String serviceId, SwarmUpdateRequest updateRequest) {
        try {
            // 1. Load the current service.
            Service service = dockerClient.inspectServiceCmd(serviceId).exec();
            ServiceSpec currentSpec = service.getSpec();

            String image = updateRequest.getImageName() != null
                    ? updateRequest.getImageName()
                    : currentSpec.taskTemplate().containerSpec().image();

            // 2. Build the new spec.
            ServiceSpec newSpec = ServiceSpec.builder()
                    .name(currentSpec.name())
                    .taskTemplate(TaskSpec.builder()
                            .containerSpec(ContainerSpec.builder()
                                    .image(image)
                                    .env(updateRequest.getEnvironment())
                                    .labels(updateRequest.getLabels())
                                    .build())
                            .restartPolicy(currentSpec.taskTemplate().restartPolicy())
                            .build())
                    .mode(replicatedMode(updateRequest.getReplicas()))
                    .updateConfig(defaultUpdateConfig())
                    .networks(currentSpec.networks())
                    .build();

            // 3. Apply it.
            applySpec(serviceId, service, newSpec);

            log.info("Swarm服务更新成功: serviceId={}", serviceId);

            return SwarmUpdateResult.success();

        } catch (Exception e) {
            log.error("Swarm服务更新失败: serviceId={}", serviceId, e);
            return SwarmUpdateResult.error("服务更新失败: " + e.getMessage());
        }
    }

    /**
     * Changes the replica count of a service, keeping the rest of its spec.
     *
     * @param serviceId id of the service to scale
     * @param replicas  target replica count; must not be negative
     * @return success with the requested count, or an error result
     */
    public SwarmScaleResult scaleService(String serviceId, int replicas) {
        // Reject an impossible replica count before touching the cluster.
        if (replicas < 0) {
            return SwarmScaleResult.error("服务扩缩容失败: 副本数不能为负数");
        }

        try {
            // 1. Load the current service.
            Service service = dockerClient.inspectServiceCmd(serviceId).exec();
            ServiceSpec currentSpec = service.getSpec();

            // 2. Build the new spec.
            ServiceSpec newSpec = ServiceSpec.builder()
                    .name(currentSpec.name())
                    .taskTemplate(currentSpec.taskTemplate())
                    .mode(replicatedMode(replicas))
                    .updateConfig(currentSpec.updateConfig())
                    .networks(currentSpec.networks())
                    .build();

            // 3. Apply it.
            applySpec(serviceId, service, newSpec);

            log.info("Swarm服务扩缩容成功: serviceId={}, replicas={}", serviceId, replicas);

            return SwarmScaleResult.success(replicas);

        } catch (Exception e) {
            log.error("Swarm服务扩缩容失败: serviceId={}", serviceId, e);
            return SwarmScaleResult.error("服务扩缩容失败: " + e.getMessage());
        }
    }

    /**
     * Removes a service.
     *
     * @param serviceId id of the service to remove
     * @return {@code true} when the remove call completed, {@code false} when it threw
     */
    public boolean removeService(String serviceId) {
        try {
            dockerClient.removeServiceCmd(serviceId).exec();

            log.info("Swarm服务删除成功: serviceId={}", serviceId);

            return true;

        } catch (Exception e) {
            log.error("Swarm服务删除失败: serviceId={}", serviceId, e);
            return false;
        }
    }

    /**
     * Lists all services known to the cluster.
     *
     * @return one entry per service; an empty list when the listing fails
     */
    public List<SwarmService> getServices() {
        try {
            List<Service> services = dockerClient.listServicesCmd().exec();

            return services.stream()
                    .map(service -> SwarmService.builder()
                            .serviceId(service.getId())
                            .serviceName(service.getSpec().name())
                            .imageName(service.getSpec().taskTemplate().containerSpec().image())
                            .replicas(replicaCount(service.getSpec()))
                            // NOTE(review): status is filled with the service name, which
                            // looks like a copy/paste slip - confirm the intended source.
                            .status(service.getSpec().name())
                            .createTime(service.getCreatedAt().getTime())
                            .build())
                    .collect(Collectors.toList());

        } catch (Exception e) {
            log.error("获取Swarm服务列表失败", e);
            return new ArrayList<>();
        }
    }

    /**
     * Loads the details of one service.
     *
     * <p>NOTE(review): if {@code getServiceStatus()} can be {@code null} for an inspected
     * service, the dereference below throws and this method returns {@code null} - confirm.
     *
     * @param serviceId id of the service to inspect
     * @return the service details, or {@code null} when the lookup fails
     */
    public SwarmServiceDetail getServiceDetail(String serviceId) {
        try {
            Service service = dockerClient.inspectServiceCmd(serviceId).exec();

            return SwarmServiceDetail.builder()
                    .serviceId(serviceId)
                    .serviceName(service.getSpec().name())
                    .imageName(service.getSpec().taskTemplate().containerSpec().image())
                    .replicas(replicaCount(service.getSpec()))
                    .runningReplicas(service.getServiceStatus().runningReplicas())
                    .desiredReplicas(service.getServiceStatus().desiredReplicas())
                    .createTime(service.getCreatedAt().getTime())
                    .updateTime(service.getUpdatedAt().getTime())
                    .build();

        } catch (Exception e) {
            log.error("获取Swarm服务详情失败: serviceId={}", serviceId, e);
            return null;
        }
    }

    /**
     * Reports whether the connected node already belongs to a Swarm.
     *
     * @return {@code true} when the Swarm inspect call returns a value; {@code false} when
     *         it returns {@code null} or throws
     */
    private boolean isSwarmInitialized() {
        try {
            SwarmInfo swarmInfo = dockerClient.inspectSwarmCmd().exec();
            return swarmInfo != null;
        } catch (Exception e) {
            // Best effort: a failed inspect is treated as "not initialized", but the
            // cause is recorded so connectivity problems are not hidden.
            log.debug("Swarm状态检查失败", e);
            return false;
        }
    }

    /** Builds a replicated service mode with the given replica count. */
    private static ServiceMode replicatedMode(int replicas) {
        return ServiceMode.builder()
                .replicated(ReplicatedService.builder()
                        .replicas((long) replicas)
                        .build())
                .build();
    }

    /** Builds the rolling-update policy used for created and updated services. */
    private static UpdateConfig defaultUpdateConfig() {
        return UpdateConfig.builder()
                .parallelism(1L)
                .delay(10L)
                .failureAction(UpdateConfigFailureAction.CONTINUE)
                .build();
    }

    /**
     * Returns the replica count of a spec, or 0 when it has no replicated mode (for
     * example a global-mode service), so one such service cannot fail a whole listing.
     */
    private static int replicaCount(ServiceSpec spec) {
        ServiceMode mode = spec.mode();
        if (mode == null || mode.replicated() == null || mode.replicated().replicas() == null) {
            return 0;
        }
        return mode.replicated().replicas().intValue();
    }

    /**
     * Sends a service update with the version index of the inspected service; the Engine
     * API requires that index on every service update to guard against conflicting writes.
     */
    private void applySpec(String serviceId, Service current, ServiceSpec newSpec) {
        dockerClient.updateServiceCmd(serviceId, newSpec)
                .withVersion(current.getVersion().getIndex())
                .exec();
    }
}

/**
 * Outcome of a Swarm cluster initialization attempt.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmInitResult {
    /** {@code true} when initialization succeeded. */
    private boolean success;
    /** Id reported for the new Swarm; only set on success. */
    private String swarmId;
    /** Failure description; only set on error. */
    private String message;

    /**
     * Creates a successful result.
     *
     * @param swarmId id reported for the new Swarm
     * @return a result with {@code success = true} and the given id
     */
    public static SwarmInitResult success(String swarmId) {
        return SwarmInitResult.builder()
                .swarmId(swarmId)
                .success(true)
                .build();
    }

    /**
     * Creates a failed result.
     *
     * @param message failure description
     * @return a result with {@code success = false} and the given message
     */
    public static SwarmInitResult error(String message) {
        return SwarmInitResult.builder()
                .message(message)
                .success(false)
                .build();
    }
}

/**
 * Outcome of an attempt to join a Swarm cluster.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmJoinResult {
    /** {@code true} when the node joined the cluster. */
    private boolean success;
    /** Failure description; only set on error. */
    private String message;

    /**
     * Creates a successful result.
     *
     * @return a result with {@code success = true}
     */
    public static SwarmJoinResult success() {
        return SwarmJoinResult.builder()
                .success(true)
                .build();
    }

    /**
     * Creates a failed result.
     *
     * @param message failure description
     * @return a result with {@code success = false} and the given message
     */
    public static SwarmJoinResult error(String message) {
        return SwarmJoinResult.builder()
                .message(message)
                .success(false)
                .build();
    }
}

/**
 * Request describing a Swarm service to create; consumed by
 * DockerSwarmService.createService.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmServiceRequest {
// Name given to the new service.
private String serviceName;
// Image the service's containers run.
private String imageName;
// Replica count for the replicated service mode.
private int replicas;
// Passed to the container spec as its environment.
private Map<String, String> environment;
// Passed to the container spec as its labels.
private Map<String, String> labels;
}

/**
 * Summary of a Swarm service, returned by service creation and service listing.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmService {
// Id of the service as reported by Docker.
private String serviceId;
// Name of the service.
private String serviceName;
// Image the service's containers run.
private String imageName;
// Replica count taken from the service spec or the create request.
private int replicas;
// Status label; createService sets it to "CREATED".
private String status;
// Creation time in epoch milliseconds.
private long createTime;
}

/**
 * Detailed view of one Swarm service, built by DockerSwarmService.getServiceDetail.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmServiceDetail {
// Id of the inspected service.
private String serviceId;
// Name from the service spec.
private String serviceName;
// Image from the service's container spec.
private String imageName;
// Replica count from the spec's replicated mode.
private int replicas;
// Running replica count taken from the service status.
private int runningReplicas;
// Desired replica count taken from the service status.
private int desiredReplicas;
// Creation time in epoch milliseconds.
private long createTime;
// Last update time in epoch milliseconds.
private long updateTime;
}

/**
 * Request describing the new state of an existing service; consumed by
 * DockerSwarmService.updateService.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmUpdateRequest {
// Image to run after the update.
private String imageName;
// Replica count to run after the update.
private int replicas;
// Environment written into the updated container spec.
private Map<String, String> environment;
// Labels written into the updated container spec.
private Map<String, String> labels;
}

/**
 * Outcome of a service update.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmUpdateResult {
    /** {@code true} when the update was applied. */
    private boolean success;
    /** Failure description; only set on error. */
    private String message;

    /**
     * Creates a successful result.
     *
     * @return a result with {@code success = true}
     */
    public static SwarmUpdateResult success() {
        return SwarmUpdateResult.builder()
                .success(true)
                .build();
    }

    /**
     * Creates a failed result.
     *
     * @param message failure description
     * @return a result with {@code success = false} and the given message
     */
    public static SwarmUpdateResult error(String message) {
        return SwarmUpdateResult.builder()
                .message(message)
                .success(false)
                .build();
    }
}

/**
 * Outcome of a service scale operation.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmScaleResult {
    /** {@code true} when the scale request was applied. */
    private boolean success;
    /** Requested replica count; only set on success. */
    private int replicas;
    /** Failure description; only set on error. */
    private String message;

    /**
     * Creates a successful result.
     *
     * @param replicas replica count that was requested
     * @return a result with {@code success = true} and the given count
     */
    public static SwarmScaleResult success(int replicas) {
        return SwarmScaleResult.builder()
                .replicas(replicas)
                .success(true)
                .build();
    }

    /**
     * Creates a failed result.
     *
     * @param message failure description
     * @return a result with {@code success = false} and the given message
     */
    public static SwarmScaleResult error(String message) {
        return SwarmScaleResult.builder()
                .message(message)
                .success(false)
                .build();
    }
}

4. Docker Swarm网络服务

4.1 Docker Swarm网络服务

示例代码(Java):
/**
 * Network operations for a Swarm cluster: create, remove and query networks, and attach
 * or detach services.
 *
 * <p>Deliberately not annotated with {@code @Service}: {@code DockerSwarmConfig} already
 * registers this class through its {@code dockerSwarmNetworkService()} {@code @Bean}
 * method, and a second, component-scanned definition under the same bean name would clash
 * with it.
 *
 * <p>NOTE(review): {@code log} is not declared in this class; it is presumably meant to be
 * supplied by Lombok's {@code @Slf4j} - confirm.
 */
public class DockerSwarmNetworkService {

    /** Driver used when a network request does not name one. */
    private static final String DEFAULT_DRIVER = "overlay";

    private final DockerClient dockerClient;
    private final DockerSwarmProperties properties;

    /**
     * @param dockerClient client used for every Docker API call
     * @param properties   Swarm settings
     */
    public DockerSwarmNetworkService(DockerClient dockerClient, DockerSwarmProperties properties) {
        this.dockerClient = dockerClient;
        this.properties = properties;
    }

    /**
     * Creates a network from the request, defaulting the driver to {@code overlay} when
     * the request leaves it null or blank.
     *
     * @param networkRequest name, driver, options and labels of the network
     * @return a description of the created network, with scope {@code "swarm"}
     * @throws BusinessException when the Docker call fails
     */
    public SwarmNetwork createOverlayNetwork(SwarmNetworkRequest networkRequest) {
        try {
            String requestedDriver = networkRequest.getDriver();
            String driver = (requestedDriver == null || requestedDriver.trim().isEmpty())
                    ? DEFAULT_DRIVER
                    : requestedDriver;

            // 1. Build the network configuration.
            NetworkCreateConfig networkConfig = NetworkCreateConfig.builder()
                    .name(networkRequest.getNetworkName())
                    .driver(driver)
                    .options(networkRequest.getOptions())
                    .labels(networkRequest.getLabels())
                    .build();

            // 2. Create the network.
            CreateNetworkResponse response = dockerClient.createNetworkCmd()
                    .withNetworkCreateConfig(networkConfig)
                    .exec();

            String networkId = response.getId();

            log.info("覆盖网络创建成功: networkName={}, networkId={}",
                    networkRequest.getNetworkName(), networkId);

            return SwarmNetwork.builder()
                    .networkId(networkId)
                    .networkName(networkRequest.getNetworkName())
                    .driver(driver)
                    .scope("swarm")
                    .createTime(System.currentTimeMillis())
                    .build();

        } catch (Exception e) {
            log.error("覆盖网络创建失败: networkName={}",
                    networkRequest.getNetworkName(), e);
            throw new BusinessException("网络创建失败", e);
        }
    }

    /**
     * Removes a network.
     *
     * @param networkId id of the network to remove
     * @return {@code true} when the remove call completed, {@code false} when it threw
     */
    public boolean removeNetwork(String networkId) {
        try {
            dockerClient.removeNetworkCmd(networkId).exec();

            log.info("网络删除成功: networkId={}", networkId);

            return true;

        } catch (Exception e) {
            log.error("网络删除失败: networkId={}", networkId, e);
            return false;
        }
    }

    /**
     * Lists the networks whose scope is {@code "swarm"}.
     *
     * @return one entry per swarm-scoped network; an empty list when the listing fails
     */
    public List<SwarmNetwork> getNetworks() {
        try {
            List<Network> networks = dockerClient.listNetworksCmd().exec();

            return networks.stream()
                    .filter(network -> "swarm".equals(network.getScope()))
                    .map(network -> SwarmNetwork.builder()
                            .networkId(network.getId())
                            .networkName(network.getName())
                            .driver(network.getDriver())
                            .scope(network.getScope())
                            // NOTE(review): this records the time of the query, not the
                            // network's real creation time - confirm the intended source.
                            .createTime(System.currentTimeMillis())
                            .build())
                    .collect(Collectors.toList());

        } catch (Exception e) {
            log.error("获取网络列表失败", e);
            return new ArrayList<>();
        }
    }

    /**
     * Loads the details of one network.
     *
     * @param networkId id of the network to inspect
     * @return the network details, or {@code null} when the lookup fails
     */
    public SwarmNetworkDetail getNetworkDetail(String networkId) {
        try {
            Network network = dockerClient.inspectNetworkCmd().withNetworkId(networkId).exec();

            return SwarmNetworkDetail.builder()
                    .networkId(networkId)
                    .networkName(network.getName())
                    .driver(network.getDriver())
                    .scope(network.getScope())
                    .ipam(network.getIpam())
                    .options(network.getOptions())
                    .labels(network.getLabels())
                    // NOTE(review): time of the query, not the real creation time.
                    .createTime(System.currentTimeMillis())
                    .build();

        } catch (Exception e) {
            log.error("获取网络详情失败: networkId={}", networkId, e);
            return null;
        }
    }

    /**
     * Attaches a service to a network. A service that already lists the network as a
     * target is left unchanged, so the same attachment is never added twice.
     *
     * @param serviceId id of the service
     * @param networkId id of the network to attach
     * @return {@code true} when the service is attached afterwards, {@code false} on failure
     */
    public boolean connectServiceToNetwork(String serviceId, String networkId) {
        try {
            // 1. Load the current service.
            Service service = dockerClient.inspectServiceCmd(serviceId).exec();
            ServiceSpec currentSpec = service.getSpec();

            // 2. Add the attachment unless it is already present.
            List<NetworkAttachmentConfig> networks = currentNetworks(currentSpec);
            boolean alreadyAttached = networks.stream()
                    .anyMatch(network -> networkId.equals(network.target()));
            if (!alreadyAttached) {
                networks.add(NetworkAttachmentConfig.builder()
                        .target(networkId)
                        .build());

                // 3./4. Rebuild the spec and update the service.
                applySpec(serviceId, service, withNetworks(currentSpec, networks));
            }

            log.info("服务连接到网络成功: serviceId={}, networkId={}", serviceId, networkId);

            return true;

        } catch (Exception e) {
            log.error("服务连接到网络失败: serviceId={}, networkId={}", serviceId, networkId, e);
            return false;
        }
    }

    /**
     * Detaches a service from a network.
     *
     * @param serviceId id of the service
     * @param networkId id of the network to detach
     * @return {@code true} when the update completed, {@code false} on failure
     */
    public boolean disconnectServiceFromNetwork(String serviceId, String networkId) {
        try {
            // 1. Load the current service.
            Service service = dockerClient.inspectServiceCmd(serviceId).exec();
            ServiceSpec currentSpec = service.getSpec();

            // 2. Drop the attachment that targets the given network.
            List<NetworkAttachmentConfig> networks = currentNetworks(currentSpec).stream()
                    .filter(network -> !networkId.equals(network.target()))
                    .collect(Collectors.toList());

            // 3./4. Rebuild the spec and update the service.
            applySpec(serviceId, service, withNetworks(currentSpec, networks));

            log.info("服务断开网络连接成功: serviceId={}, networkId={}", serviceId, networkId);

            return true;

        } catch (Exception e) {
            log.error("服务断开网络连接失败: serviceId={}, networkId={}", serviceId, networkId, e);
            return false;
        }
    }

    /** Returns a mutable copy of the spec's attachments; empty when the spec has none. */
    private static List<NetworkAttachmentConfig> currentNetworks(ServiceSpec spec) {
        List<NetworkAttachmentConfig> existing = spec.networks();
        return existing == null ? new ArrayList<>() : new ArrayList<>(existing);
    }

    /**
     * Copies name, task template, mode and update config of a spec and swaps in the given
     * attachments. Other spec fields are not carried over, as in the original listing.
     */
    private static ServiceSpec withNetworks(ServiceSpec currentSpec,
                                            List<NetworkAttachmentConfig> networks) {
        return ServiceSpec.builder()
                .name(currentSpec.name())
                .taskTemplate(currentSpec.taskTemplate())
                .mode(currentSpec.mode())
                .updateConfig(currentSpec.updateConfig())
                .networks(networks)
                .build();
    }

    /**
     * Sends a service update with the version index of the inspected service; the Engine
     * API requires that index on every service update to guard against conflicting writes.
     */
    private void applySpec(String serviceId, Service current, ServiceSpec newSpec) {
        dockerClient.updateServiceCmd(serviceId, newSpec)
                .withVersion(current.getVersion().getIndex())
                .exec();
    }
}

/**
 * Request describing a network to create; consumed by
 * DockerSwarmNetworkService.createOverlayNetwork.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmNetworkRequest {
// Name given to the new network.
private String networkName;
// Network driver passed to the create configuration.
private String driver;
// Driver options passed to the create configuration.
private Map<String, String> options;
// Labels passed to the create configuration.
private Map<String, String> labels;
}

/**
 * Summary of a Swarm network, returned by network creation and network listing.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmNetwork {
// Id of the network as reported by Docker.
private String networkId;
// Name of the network.
private String networkName;
// Driver of the network.
private String driver;
// Scope of the network; createOverlayNetwork sets it to "swarm".
private String scope;
// Epoch milliseconds; the network service fills this with the current time when it builds the object.
private long createTime;
}

/**
 * Detailed view of one network, built by DockerSwarmNetworkService.getNetworkDetail.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmNetworkDetail {
// Id of the inspected network.
private String networkId;
// Name of the network.
private String networkName;
// Driver of the network.
private String driver;
// Scope of the network.
private String scope;
// IPAM information as returned by the inspected network's getIpam(); kept untyped.
private Object ipam;
// Options of the network.
private Map<String, String> options;
// Labels of the network.
private Map<String, String> labels;
// Epoch milliseconds; getNetworkDetail fills this with the current time of the lookup.
private long createTime;
}

5. Docker Swarm监控服务

5.1 Docker Swarm监控服务

示例代码(Java):
/**
 * Periodically inspects all Swarm services and raises alerts for unhealthy ones; also
 * aggregates cluster-wide replica statistics.
 *
 * <p>NOTE(review): {@code log} is not declared in this class; it is presumably meant to be
 * supplied by Lombok's {@code @Slf4j} - confirm.
 */
@Service
public class DockerSwarmMonitoringService {

    /** Desired replica counts above this value raise a {@code HIGH_REPLICA_COUNT} alert. */
    private static final int HIGH_REPLICA_THRESHOLD = 10;

    private final DockerSwarmService swarmService;
    private final DockerSwarmAlertService alertService;

    /**
     * @param swarmService source of service listings and details
     * @param alertService sink for raised alerts
     */
    public DockerSwarmMonitoringService(DockerSwarmService swarmService,
                                        DockerSwarmAlertService alertService) {
        this.swarmService = swarmService;
        this.alertService = alertService;
    }

    /**
     * Checks health and replica counts of every service. Scheduled at a fixed rate of
     * 30 seconds; services whose details cannot be loaded are skipped.
     */
    @Scheduled(fixedRate = 30000)
    public void monitorSwarmServices() {
        try {
            List<SwarmService> services = swarmService.getServices();

            for (SwarmService service : services) {
                SwarmServiceDetail serviceDetail = swarmService.getServiceDetail(service.getServiceId());

                if (serviceDetail != null) {
                    checkServiceHealth(serviceDetail);
                    checkServiceReplicas(serviceDetail);
                }
            }

        } catch (Exception e) {
            log.error("监控Swarm服务状态失败", e);
        }
    }

    /**
     * Raises at most one availability alert per service:
     * {@code SERVICE_DOWN} when replicas are desired but none is running, otherwise
     * {@code SERVICE_SCALING} when fewer replicas run than desired. A service scaled to
     * zero on purpose (desired = 0) raises no alert.
     *
     * @param serviceDetail details of the service to check
     */
    private void checkServiceHealth(SwarmServiceDetail serviceDetail) {
        try {
            int running = serviceDetail.getRunningReplicas();
            int desired = serviceDetail.getDesiredReplicas();

            if (desired > 0 && running == 0) {
                sendAlert("SERVICE_DOWN", "HIGH",
                        String.format("服务%s无运行副本", serviceDetail.getServiceName()),
                        serviceDetail);
            } else if (running < desired) {
                sendAlert("SERVICE_SCALING", "MEDIUM",
                        String.format("服务%s副本数不足: 运行%d个,期望%d个",
                                serviceDetail.getServiceName(), running, desired),
                        serviceDetail);
            }

        } catch (Exception e) {
            log.error("检查服务健康状态失败: serviceId={}",
                    serviceDetail.getServiceId(), e);
        }
    }

    /**
     * Raises a {@code HIGH_REPLICA_COUNT} alert when the desired replica count exceeds
     * {@link #HIGH_REPLICA_THRESHOLD}.
     *
     * @param serviceDetail details of the service to check
     */
    private void checkServiceReplicas(SwarmServiceDetail serviceDetail) {
        try {
            if (serviceDetail.getDesiredReplicas() > HIGH_REPLICA_THRESHOLD) {
                sendAlert("HIGH_REPLICA_COUNT", "LOW",
                        String.format("服务%s副本数过多: %d个",
                                serviceDetail.getServiceName(),
                                serviceDetail.getDesiredReplicas()),
                        serviceDetail);
            }

        } catch (Exception e) {
            log.error("检查服务副本状态失败: serviceId={}",
                    serviceDetail.getServiceId(), e);
        }
    }

    /**
     * Aggregates service count, configured replicas and running replicas across the
     * cluster. Services whose details cannot be loaded count as zero running replicas.
     *
     * @return the statistics, or {@code null} when aggregation fails
     */
    public SwarmClusterStats getClusterStats() {
        try {
            List<SwarmService> services = swarmService.getServices();

            int totalServices = services.size();
            int totalReplicas = services.stream()
                    .mapToInt(SwarmService::getReplicas)
                    .sum();

            int runningReplicas = 0;
            for (SwarmService service : services) {
                SwarmServiceDetail detail = swarmService.getServiceDetail(service.getServiceId());
                if (detail != null) {
                    runningReplicas += detail.getRunningReplicas();
                }
            }

            return SwarmClusterStats.builder()
                    .totalServices(totalServices)
                    .totalReplicas(totalReplicas)
                    .runningReplicas(runningReplicas)
                    .timestamp(System.currentTimeMillis())
                    .build();

        } catch (Exception e) {
            log.error("获取Swarm集群统计信息失败", e);
            return null;
        }
    }

    /** Builds an alert for the given service, stamped with the current time, and sends it. */
    private void sendAlert(String type, String severity, String message,
                           SwarmServiceDetail serviceDetail) {
        alertService.sendAlert(SwarmAlert.builder()
                .type(type)
                .severity(severity)
                .message(message)
                .serviceId(serviceDetail.getServiceId())
                .timestamp(System.currentTimeMillis())
                .build());
    }
}

/**
 * Cluster-wide statistics produced by DockerSwarmMonitoringService.getClusterStats.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmClusterStats {
// Number of services returned by the service listing.
private int totalServices;
// Sum of the replica counts of all listed services.
private int totalReplicas;
// Sum of the running replica counts of all services whose details could be loaded.
private int runningReplicas;
// Time the statistics were computed, in epoch milliseconds.
private long timestamp;
}

/**
 * Alert raised by the monitoring service and handed to the alert service.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SwarmAlert {
// Alert type; the monitoring service uses "SERVICE_DOWN", "SERVICE_SCALING" and "HIGH_REPLICA_COUNT".
private String type;
// Severity; the monitoring service uses "HIGH", "MEDIUM" and "LOW".
private String severity;
// Human-readable description of the problem.
private String message;
// Id of the service the alert refers to.
private String serviceId;
// Time the alert was created, in epoch milliseconds.
private long timestamp;
}

6. Docker Swarm控制器

6.1 Docker Swarm控制器

示例代码(Java):
/**
 * REST endpoints under {@code /swarm} for cluster initialization, service management,
 * network management and cluster statistics.
 *
 * <p>Every endpoint answers with a JSON map that always contains a {@code success} flag.
 * Unexpected exceptions are answered with HTTP 500 and a {@code message}.
 *
 * <p>NOTE(review): {@code log} is not declared in this class; it is presumably meant to be
 * supplied by Lombok's {@code @Slf4j} - confirm.
 */
@RestController
@RequestMapping("/swarm")
public class DockerSwarmController {

    @Autowired
    private DockerSwarmService swarmService;

    @Autowired
    private DockerSwarmNetworkService networkService;

    @Autowired
    private DockerSwarmMonitoringService monitoringService;

    /**
     * Initializes the Swarm cluster.
     *
     * @return HTTP 200 with {@code success}, {@code swarmId} and {@code message}
     */
    @PostMapping("/init")
    public ResponseEntity<Map<String, Object>> initSwarm() {
        try {
            SwarmInitResult result = swarmService.initSwarm();

            Map<String, Object> response = newBody(result.isSuccess());
            response.put("swarmId", result.getSwarmId());
            response.put("message", result.getMessage());

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("初始化Swarm集群失败", e);
            return serverError("初始化集群失败: " + e.getMessage());
        }
    }

    /**
     * Creates a Swarm service.
     *
     * @param request description of the service to create
     * @return HTTP 200 with the created {@code service}, or HTTP 500 when creation fails
     */
    @PostMapping("/service/create")
    public ResponseEntity<Map<String, Object>> createService(@RequestBody SwarmServiceRequest request) {
        try {
            SwarmService service = swarmService.createService(request);

            Map<String, Object> response = newBody(true);
            response.put("service", service);
            response.put("message", "服务创建成功");

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("创建Swarm服务失败", e);
            return serverError("创建服务失败: " + e.getMessage());
        }
    }

    /**
     * Scales a Swarm service.
     *
     * @param serviceId id of the service to scale
     * @param replicas  target replica count
     * @return HTTP 200 with {@code success}, {@code replicas} and {@code message}
     */
    @PostMapping("/service/scale")
    public ResponseEntity<Map<String, Object>> scaleService(
            @RequestParam String serviceId,
            @RequestParam int replicas) {
        try {
            SwarmScaleResult result = swarmService.scaleService(serviceId, replicas);

            Map<String, Object> response = newBody(result.isSuccess());
            response.put("replicas", result.getReplicas());
            response.put("message", result.getMessage());

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("扩缩容Swarm服务失败: serviceId={}", serviceId, e);
            return serverError("扩缩容失败: " + e.getMessage());
        }
    }

    /**
     * Lists all Swarm services.
     *
     * @return HTTP 200 with the {@code services} list
     */
    @GetMapping("/services")
    public ResponseEntity<Map<String, Object>> getServices() {
        try {
            List<SwarmService> services = swarmService.getServices();

            Map<String, Object> response = newBody(true);
            response.put("services", services);

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("获取Swarm服务列表失败", e);
            return serverError("获取服务列表失败: " + e.getMessage());
        }
    }

    /**
     * Returns the details of one service.
     *
     * @param serviceId id of the service
     * @return HTTP 200 with {@code serviceDetail}; {@code success} is {@code false} and a
     *         {@code message} is added when the service layer returned no details
     */
    @GetMapping("/service/{serviceId}")
    public ResponseEntity<Map<String, Object>> getServiceDetail(@PathVariable String serviceId) {
        try {
            SwarmServiceDetail serviceDetail = swarmService.getServiceDetail(serviceId);

            Map<String, Object> response = newBody(serviceDetail != null);
            response.put("serviceDetail", serviceDetail);
            if (serviceDetail == null) {
                // The service layer signals a failed lookup with null; tell the caller why.
                response.put("message", "获取服务详情失败");
            }

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("获取Swarm服务详情失败: serviceId={}", serviceId, e);
            return serverError("获取服务详情失败: " + e.getMessage());
        }
    }

    /**
     * Creates an overlay network.
     *
     * @param request description of the network to create
     * @return HTTP 200 with the created {@code network}, or HTTP 500 when creation fails
     */
    @PostMapping("/network/create")
    public ResponseEntity<Map<String, Object>> createNetwork(@RequestBody SwarmNetworkRequest request) {
        try {
            SwarmNetwork network = networkService.createOverlayNetwork(request);

            Map<String, Object> response = newBody(true);
            response.put("network", network);
            response.put("message", "网络创建成功");

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("创建覆盖网络失败", e);
            return serverError("创建网络失败: " + e.getMessage());
        }
    }

    /**
     * Lists the Swarm networks.
     *
     * @return HTTP 200 with the {@code networks} list
     */
    @GetMapping("/networks")
    public ResponseEntity<Map<String, Object>> getNetworks() {
        try {
            List<SwarmNetwork> networks = networkService.getNetworks();

            Map<String, Object> response = newBody(true);
            response.put("networks", networks);

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("获取Swarm网络列表失败", e);
            return serverError("获取网络列表失败: " + e.getMessage());
        }
    }

    /**
     * Returns cluster-wide statistics.
     *
     * @return HTTP 200 with {@code stats}, or HTTP 500 when the monitoring service could
     *         not compute them (it signals that by returning {@code null})
     */
    @GetMapping("/stats")
    public ResponseEntity<Map<String, Object>> getClusterStats() {
        try {
            SwarmClusterStats stats = monitoringService.getClusterStats();
            if (stats == null) {
                // Previously reported as success=true with null stats.
                return serverError("获取统计信息失败");
            }

            Map<String, Object> response = newBody(true);
            response.put("stats", stats);

            return ResponseEntity.ok(response);

        } catch (Exception e) {
            log.error("获取Swarm集群统计信息失败", e);
            return serverError("获取统计信息失败: " + e.getMessage());
        }
    }

    /** Creates a response map pre-filled with the {@code success} flag. */
    private static Map<String, Object> newBody(boolean success) {
        Map<String, Object> body = new HashMap<>();
        body.put("success", success);
        return body;
    }

    /** Builds the HTTP 500 answer used for failures: {@code success=false} plus a message. */
    private static ResponseEntity<Map<String, Object>> serverError(String message) {
        Map<String, Object> body = newBody(false);
        body.put("message", message);
        return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(body);
    }
}

7. 总结

通过Docker Swarm集群自动化运维的实现,我们成功构建了一个企业级的容器编排平台。关键特性包括:

7.1 核心优势

  1. 集群管理: Docker Swarm集群创建和管理
  2. 服务发现: 自动服务发现和负载均衡
  3. 网络管理: 覆盖网络和网络隔离
  4. 故障恢复: 自动故障检测和恢复
  5. 高级运维: 滚动更新、配置管理、监控告警

7.2 最佳实践

  1. 集群设计: 合理的集群架构和节点规划
  2. 服务编排: 高效的服务编排和管理
  3. 网络设计: 安全的网络隔离和通信
  4. 监控告警: 全面的集群监控和告警机制
  5. 运维管理: 完善的运维管理工具

这套Docker Swarm集群自动化运维方案不仅能够提供企业级的容器编排能力,还包含了服务发现、网络管理、故障恢复等核心功能,是现代云原生应用的重要基础设施。