1. Architecture Overview

High availability is a baseline requirement for modern Internet applications. Nginx, acting as a high-performance load balancer, combined with Keepalived for virtual-IP failover, forms a two-node high-availability scheme (configured below as an active/standby MASTER/BACKUP pair) that markedly improves system stability and reliability.

1.1 Architecture Diagram

Internet
    |
    v
Keepalived VIP (virtual IP)
    |
    v
Nginx MASTER / BACKUP pair
    |
    v
Backend Servers (application cluster)

1.2 Core Components

  • Nginx: load balancing and reverse proxying
  • Keepalived: high availability and failover
  • Health checks: service status monitoring
  • Virtual IP: a single, stable entry point
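
With all four pieces in place, the whole chain can be smoke-tested from any client on the network. A minimal sketch, assuming the VIP 192.168.1.100 and the /health endpoint configured later in this article:

#!/bin/bash
# smoke_test.sh - confirm the VIP answers and requests reach a backend
VIP="192.168.1.100"

# Whichever node currently holds the VIP should answer on port 80
curl -s -o /dev/null -w "HTTP %{http_code} via ${VIP}\n" "http://${VIP}/health"

# A few repeated requests should all succeed regardless of which backend serves them
for i in $(seq 1 5); do
    curl -s "http://${VIP}/health"
done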

2. Nginx Load Balancing Configuration

2.1 Main Configuration File

# /etc/nginx/nginx.conf
user nginx;
worker_processes auto;                  # match the number of CPU cores
error_log /var/log/nginx/error.log warn;
pid /var/run/nginx.pid;

# Events module
events {
    worker_connections 1024;            # max connections per worker process
    use epoll;                          # epoll event model (Linux)
    multi_accept on;                    # accept multiple connections at once
}

# HTTP module
http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    # Log format
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for" '
                    'rt=$request_time uct="$upstream_connect_time" '
                    'uht="$upstream_header_time" urt="$upstream_response_time"';

    access_log /var/log/nginx/access.log main;

    # Performance tuning
    sendfile on;                        # zero-copy file transfer
    tcp_nopush on;                      # send headers in one packet
    tcp_nodelay on;                     # disable Nagle's algorithm
    keepalive_timeout 65;               # keep-alive timeout (seconds)
    keepalive_requests 1000;            # max requests per keep-alive connection
    types_hash_max_size 2048;           # MIME types hash table size
    client_max_body_size 10m;           # max client request body size

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_proxied any;
    gzip_comp_level 6;
    gzip_types
        text/plain
        text/css
        text/xml
        text/javascript
        application/json
        application/javascript
        application/xml+rss
        application/atom+xml
        image/svg+xml;

    # Upstream server group
    upstream backend_servers {
        # Balancing algorithms: round robin (default), ip_hash, least_conn, hash
        least_conn;                     # pick the server with the fewest active connections

        # Backend servers
        server 192.168.1.10:8080 weight=3 max_fails=3 fail_timeout=30s;
        server 192.168.1.11:8080 weight=3 max_fails=3 fail_timeout=30s;
        server 192.168.1.12:8080 weight=2 max_fails=3 fail_timeout=30s;
        server 192.168.1.13:8080 weight=2 max_fails=3 fail_timeout=30s backup;

        # Upstream keep-alive connection pool
        keepalive 32;
        keepalive_requests 100;
        keepalive_timeout 60s;
    }

    # Health checks: open-source nginx only supports the passive checks
    # configured above (max_fails / fail_timeout). Active upstream health
    # checks require NGINX Plus or a third-party module such as
    # nginx_upstream_check_module; the /health location below serves as
    # the probe target for Keepalived and external monitors.

    # Main server
    server {
        listen 80;
        server_name example.com www.example.com;

        # Security headers
        add_header X-Frame-Options DENY;
        add_header X-Content-Type-Options nosniff;
        add_header X-XSS-Protection "1; mode=block";

        # Per-vhost logs
        access_log /var/log/nginx/example.com.access.log main;
        error_log /var/log/nginx/example.com.error.log;

        # Health check endpoint
        location /health {
            access_log off;
            default_type text/plain;    # sets the body's Content-Type (add_header would duplicate the header)
            return 200 "healthy\n";
        }

        # Status page
        location /nginx_status {
            stub_status on;
            access_log off;
            allow 127.0.0.1;            # needed by the local check_nginx.sh probe
            allow 192.168.1.0/24;       # internal network only
            deny all;
        }

        # Main application proxy
        location / {
            proxy_pass http://backend_servers;

            # Required for the upstream keepalive pool to take effect
            proxy_http_version 1.1;
            proxy_set_header Connection "";

            # Proxy headers
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;

            # Timeouts
            proxy_connect_timeout 5s;
            proxy_send_timeout 60s;
            proxy_read_timeout 60s;

            # Buffering
            proxy_buffering on;
            proxy_buffer_size 4k;
            proxy_buffers 8 4k;
            proxy_busy_buffers_size 8k;

            # Retry on errors
            proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
            proxy_next_upstream_tries 3;
            proxy_next_upstream_timeout 10s;
        }

        # Static files
        location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
            expires 1y;
            add_header Cache-Control "public, immutable";
            try_files $uri @backend;
        }

        location @backend {
            proxy_pass http://backend_servers;
            proxy_set_header Host $host;
        }
    }
}
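
After deploying this file, the usual verification loop is to validate the syntax, reload, and probe the endpoints it defines; a quick sketch (run on the Nginx host itself, which the /nginx_status allow list permits):

# Validate syntax, then apply without dropping live connections
nginx -t && systemctl reload nginx

# Probe the endpoints defined above
curl -s http://localhost/health
curl -s http://localhost/nginx_status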

2.2 Load Balancing Algorithms in Detail

# 1. Round robin (the default)
upstream round_robin_pool {
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

# 2. Weighted round robin
upstream weighted_pool {
    server 192.168.1.10:8080 weight=3;  # receives 3/6 of requests
    server 192.168.1.11:8080 weight=2;  # receives 2/6 of requests
    server 192.168.1.12:8080 weight=1;  # receives 1/6 of requests
}

# 3. IP hash (session affinity)
upstream ip_hash_pool {
    ip_hash;                            # a hash of the client IP picks the server
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

# 4. Least connections
upstream least_conn_pool {
    least_conn;                         # pick the server with the fewest active connections
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

# 5. Consistent hashing
upstream consistent_hash_pool {
    hash $request_uri consistent;       # ketama-style consistent hash on the request URI
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}
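
To observe an algorithm in action, point proxy_pass at one of these pools and tally which backend answers each request. The sketch below assumes each backend echoes its own identity in an X-Backend response header, which is a hypothetical convention, not something Nginx adds for you; any per-server marker (a hostname in the body, for example) works the same way:

#!/bin/bash
# Tally 100 requests by backend, assuming a hypothetical X-Backend response header
for i in $(seq 1 100); do
    curl -s -D - -o /dev/null http://192.168.1.100/ | grep -i '^x-backend'
done | sort | uniq -c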

3. Keepalived High Availability Configuration

3.1 Master Node Configuration

# /etc/keepalived/keepalived.conf (MASTER node)
! Configuration File for keepalived

# Global settings
global_defs {
    # Mail notifications
    notification_email {
        admin@example.com
        ops@example.com
    }
    notification_email_from keepalived@example.com
    smtp_server 192.168.1.100
    smtp_connect_timeout 30

    # Router ID, must be unique per node
    router_id LVS_DEVEL

    # Run check scripts as root, with script-security checks enabled
    script_user root
    enable_script_security
}

# Health check script
vrrp_script chk_nginx {
    script "/etc/keepalived/check_nginx.sh"  # path to the check script
    interval 2      # run every 2 seconds
    weight -20      # on failure, drop priority by 20: 100 - 20 < 90,
                    # so the BACKUP (priority 90) wins the next election
    fall 3          # consecutive failures before marking down
    rise 2          # consecutive successes before marking up
}

# VRRP instance
vrrp_instance VI_1 {
    state MASTER                # initial state: MASTER/BACKUP
    interface eth0              # network interface
    virtual_router_id 51        # must match across the pair
    priority 100                # higher number wins the election
    advert_int 1                # advertisement interval (seconds)

    # Authentication
    authentication {
        auth_type PASS
        auth_pass 1234          # shared password (max 8 characters)
    }

    # Virtual IP
    virtual_ipaddress {
        192.168.1.100/24 dev eth0
    }

    # Track the nginx check
    track_script {
        chk_nginx
    }

    # State-transition hooks
    notify_master "/etc/keepalived/master.sh"
    notify_backup "/etc/keepalived/backup.sh"
    notify_fault "/etc/keepalived/fault.sh"
}

# Virtual server (optional, only for LVS load balancing)
virtual_server 192.168.1.100 80 {
    delay_loop 6                # health check interval
    lb_algo rr                  # scheduling algorithm
    lb_kind DR                  # forwarding mode (direct routing)
    persistence_timeout 50      # session persistence (seconds)

    # Real servers
    real_server 192.168.1.10 8080 {
        weight 1
        TCP_CHECK {
            connect_timeout 3
            retry 3             # nb_get_retry in older keepalived releases
            delay_before_retry 3
        }
    }

    real_server 192.168.1.11 8080 {
        weight 1
        TCP_CHECK {
            connect_timeout 3
            retry 3
            delay_before_retry 3
        }
    }
}
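
Once keepalived runs on both nodes, it is worth confirming that the VIP actually landed on the MASTER and that VRRP advertisements are flowing; a quick sketch:

# The VIP should appear on eth0 of the MASTER (and only there)
ip -4 addr show eth0 | grep 192.168.1.100

# VRRP advertisements should arrive once per second, matching advert_int
tcpdump -i eth0 -c 5 vrrp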

3.2 Backup Node Configuration

# /etc/keepalived/keepalived.conf (BACKUP node)
! Configuration File for keepalived

global_defs {
    notification_email {
        admin@example.com
        ops@example.com
    }
    notification_email_from keepalived@example.com
    smtp_server 192.168.1.100
    smtp_connect_timeout 30
    router_id LVS_DEVEL_BACKUP
    script_user root
    enable_script_security
}

# Health check script
vrrp_script chk_nginx {
    script "/etc/keepalived/check_nginx.sh"
    interval 2
    weight -20      # keep in step with the MASTER's setting
    fall 3
    rise 2
}

# VRRP instance
vrrp_instance VI_1 {
    state BACKUP                # this is the standby node
    interface eth0
    virtual_router_id 51        # same as on the MASTER
    priority 90                 # lower than the MASTER's 100
    advert_int 1

    authentication {
        auth_type PASS
        auth_pass 1234
    }

    virtual_ipaddress {
        192.168.1.100/24 dev eth0
    }

    track_script {
        chk_nginx
    }

    notify_master "/etc/keepalived/master.sh"
    notify_backup "/etc/keepalived/backup.sh"
    notify_fault "/etc/keepalived/fault.sh"
}

3.3 Health Check Script

#!/bin/bash
# /etc/keepalived/check_nginx.sh

# Nginx health check script
# Exit code: 0 = healthy, 1 = unhealthy

# Is the nginx process running?
if ! pgrep -x "nginx" > /dev/null; then
    echo "Nginx process not found"
    exit 1
fi

# Is nginx listening on port 80?
if ! ss -tln | grep -q ":80 "; then
    echo "Nginx port 80 not listening"
    exit 1
fi

# Is the status page reachable? (the /nginx_status location must allow 127.0.0.1)
if ! curl -s -f http://localhost/nginx_status > /dev/null; then
    echo "Nginx status page not accessible"
    exit 1
fi

# Are the backends healthy?
backend_servers=("192.168.1.10:8080" "192.168.1.11:8080" "192.168.1.12:8080")
healthy_count=0

for server in "${backend_servers[@]}"; do
    if curl -s -f --connect-timeout 3 "http://$server/health" > /dev/null; then
        ((healthy_count++))
    fi
done

# Treat fewer than 2 healthy backends (out of 3) as unhealthy
if [ $healthy_count -lt 2 ]; then
    echo "Insufficient healthy backend servers: $healthy_count/3"
    exit 1
fi

echo "Nginx health check passed"
exit 0
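
Keepalived refuses to run scripts it considers insecure (enable_script_security is set above), so the script should be root-owned, executable, and tested by hand before relying on it:

chown root:root /etc/keepalived/check_nginx.sh
chmod 700 /etc/keepalived/check_nginx.sh

# Dry run: exit code 0 means healthy, 1 triggers the priority drop
/etc/keepalived/check_nginx.sh; echo "exit code: $?"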

3.4 State Transition Notification Scripts

#!/bin/bash
# /etc/keepalived/master.sh
# Runs when this node becomes MASTER

echo "$(date): Becoming MASTER" >> /var/log/keepalived.log

# Make sure nginx is up
systemctl start nginx

# Mail notification
echo "Keepalived MASTER node activated on $(hostname)" | \
    mail -s "Keepalived MASTER Alert" admin@example.com

# Syslog entry
logger "Keepalived: This node is now MASTER"

#!/bin/bash
# /etc/keepalived/backup.sh
# Runs when this node becomes BACKUP

echo "$(date): Becoming BACKUP" >> /var/log/keepalived.log

# Optionally stop nginx (leaving it running allows faster failback)
# systemctl stop nginx

# Mail notification
echo "Keepalived BACKUP node activated on $(hostname)" | \
    mail -s "Keepalived BACKUP Alert" admin@example.com

# Syslog entry
logger "Keepalived: This node is now BACKUP"

#!/bin/bash
# /etc/keepalived/fault.sh
# Runs when this node enters the FAULT state

echo "$(date): Node FAULT detected" >> /var/log/keepalived.log

# Try to recover nginx
systemctl restart nginx

# Mail alert
echo "Keepalived FAULT detected on $(hostname)" | \
    mail -s "Keepalived FAULT Alert" admin@example.com

# Syslog entry
logger "Keepalived: FAULT detected on this node"

4. Java Application Health Checks

4.1 Spring Boot Health Check Endpoint

import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.sql.Connection;
import java.util.HashMap;
import java.util.Map;

import javax.sql.DataSource;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

@RestController
@RequestMapping("/health")
public class HealthCheckController {

    @Autowired
    private RedisTemplate<String, Object> redisTemplate;

    @Autowired
    private DataSource dataSource;

    /**
     * Basic liveness check. Also mapped to the bare /health path,
     * which is what check_nginx.sh and monitor.sh poll.
     */
    @GetMapping({"", "/basic"})
    public ResponseEntity<Map<String, Object>> basicHealth() {
        Map<String, Object> health = new HashMap<>();
        health.put("status", "UP");
        health.put("timestamp", System.currentTimeMillis());
        health.put("service", "backend-service");

        return ResponseEntity.ok(health);
    }

    /**
     * Detailed health check: database, Redis, JVM and threads.
     */
    @GetMapping("/detailed")
    public ResponseEntity<Map<String, Object>> detailedHealth() {
        Map<String, Object> health = new HashMap<>();
        health.put("status", "UP");
        health.put("timestamp", System.currentTimeMillis());

        health.put("database", checkDatabase());
        health.put("redis", checkRedis());
        health.put("jvm", checkJVM());
        health.put("threadPool", checkThreadPool());

        return ResponseEntity.ok(health);
    }

    /**
     * Validate a connection from the pool and measure the round trip.
     */
    private Map<String, Object> checkDatabase() {
        Map<String, Object> db = new HashMap<>();
        long start = System.currentTimeMillis();
        try (Connection connection = dataSource.getConnection()) {
            boolean isValid = connection.isValid(5); // 5-second validation timeout
            db.put("status", isValid ? "UP" : "DOWN");
            db.put("responseTime", System.currentTimeMillis() - start);
        } catch (Exception e) {
            db.put("status", "DOWN");
            db.put("error", e.getMessage());
        }
        return db;
    }

    /**
     * Round-trip a read against Redis. A missing key is fine;
     * we only care that the call itself succeeds.
     */
    private Map<String, Object> checkRedis() {
        Map<String, Object> redis = new HashMap<>();
        try {
            long start = System.currentTimeMillis();
            redisTemplate.opsForValue().get("health:check");
            redis.put("status", "UP");
            redis.put("responseTime", System.currentTimeMillis() - start);
        } catch (Exception e) {
            redis.put("status", "DOWN");
            redis.put("error", e.getMessage());
        }
        return redis;
    }

    /**
     * JVM memory statistics.
     */
    private Map<String, Object> checkJVM() {
        Map<String, Object> jvm = new HashMap<>();

        Runtime runtime = Runtime.getRuntime();
        long totalMemory = runtime.totalMemory();
        long freeMemory = runtime.freeMemory();
        long usedMemory = totalMemory - freeMemory;
        long maxMemory = runtime.maxMemory();

        jvm.put("totalMemory", totalMemory);
        jvm.put("freeMemory", freeMemory);
        jvm.put("usedMemory", usedMemory);
        jvm.put("maxMemory", maxMemory);
        jvm.put("memoryUsage", (double) usedMemory / maxMemory);

        return jvm;
    }

    /**
     * Thread counts from the JVM's ThreadMXBean.
     */
    private Map<String, Object> checkThreadPool() {
        Map<String, Object> threadPool = new HashMap<>();

        ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
        threadPool.put("threadCount", threadBean.getThreadCount());
        threadPool.put("peakThreadCount", threadBean.getPeakThreadCount());

        return threadPool;
    }
}
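
With the application listening on 8080, both endpoints can be exercised directly; the bare /health path is the one the check and monitor scripts in this article poll:

# Liveness only (what check_nginx.sh and monitor.sh poll)
curl -s http://192.168.1.10:8080/health

# Full dependency report: database, Redis, JVM, threads
curl -s http://192.168.1.10:8080/health/detailed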

4.2 Application Configuration

# application.yml
server:
  port: 8080
  servlet:
    context-path: /

# Actuator health endpoints
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics
  endpoint:
    health:
      show-details: always
  health:
    redis:
      enabled: true
    db:
      enabled: true

# Logging
logging:
  level:
    com.example: DEBUG
  pattern:
    console: "%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"
    file: "%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"
  file:
    name: /var/log/app/application.log
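
Since Actuator endpoints are exposed here as well, the probes could equally target Spring's built-in aggregate instead of the custom controller; a sketch, assuming the default /actuator base path:

# Built-in aggregate: {"status":"UP"} plus per-component details
curl -s http://192.168.1.10:8080/actuator/health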

5. Deployment Scripts

5.1 Installation Script

#!/bin/bash
# install_nginx_keepalived.sh

set -e

echo "Installing Nginx and Keepalived..."

# Update system packages
yum update -y

# Install the EPEL repository (provides nginx on CentOS)
yum install -y epel-release

# Install Nginx
yum install -y nginx

# Install Keepalived
yum install -y keepalived

# Supporting tools
yum install -y curl wget net-tools

# Enable both services at boot
systemctl enable nginx
systemctl enable keepalived

# Create the expected directories
mkdir -p /etc/keepalived/scripts
mkdir -p /var/log/keepalived

# Make the check/notify scripts executable once they have been copied in
# (guarded so set -e does not abort when the directory is still empty)
chmod +x /etc/keepalived/scripts/*.sh 2>/dev/null || true

echo "Installation complete!"
echo "Configure Nginx and Keepalived, then start the services."

5.2 Startup Script

#!/bin/bash
# start_services.sh

echo "Starting Nginx and Keepalived..."

# Validate configuration first
echo "Checking Nginx configuration..."
if ! nginx -t; then
    echo "Nginx configuration is invalid; please fix it first"
    exit 1
fi

echo "Checking Keepalived configuration..."
# -t/--config-test is available in keepalived 2.x and later
if ! keepalived -t; then
    echo "Keepalived configuration is invalid; please fix it first"
    exit 1
fi

# Start the services
echo "Starting Nginx..."
systemctl start nginx

echo "Starting Keepalived..."
systemctl start keepalived

# Report status
echo "Checking service status..."
systemctl status nginx --no-pager
systemctl status keepalived --no-pager

echo "Services started!"

5.3 Monitoring Script

#!/bin/bash
# monitor.sh

# Monitor Nginx and Keepalived status

LOG_FILE="/var/log/monitor.log"
DATE=$(date '+%Y-%m-%d %H:%M:%S')

# Check Nginx
check_nginx() {
    if systemctl is-active --quiet nginx; then
        echo "[$DATE] Nginx: RUNNING" >> $LOG_FILE
    else
        echo "[$DATE] Nginx: STOPPED" >> $LOG_FILE
        # Try to restart it
        systemctl restart nginx
        echo "[$DATE] Nginx: RESTARTED" >> $LOG_FILE
    fi
}

# Check Keepalived
check_keepalived() {
    if systemctl is-active --quiet keepalived; then
        echo "[$DATE] Keepalived: RUNNING" >> $LOG_FILE
    else
        echo "[$DATE] Keepalived: STOPPED" >> $LOG_FILE
        # Try to restart it
        systemctl restart keepalived
        echo "[$DATE] Keepalived: RESTARTED" >> $LOG_FILE
    fi
}

# Check whether this node currently holds the virtual IP
check_vip() {
    VIP="192.168.1.100"
    if ip -4 addr show | grep -q "inet $VIP/"; then
        echo "[$DATE] VIP: ACTIVE" >> $LOG_FILE
    else
        echo "[$DATE] VIP: INACTIVE" >> $LOG_FILE
    fi
}

# Check the backends
check_backend() {
    BACKEND_SERVERS=("192.168.1.10:8080" "192.168.1.11:8080" "192.168.1.12:8080")

    for server in "${BACKEND_SERVERS[@]}"; do
        if curl -s -f --connect-timeout 3 "http://$server/health" > /dev/null; then
            echo "[$DATE] Backend $server: HEALTHY" >> $LOG_FILE
        else
            echo "[$DATE] Backend $server: UNHEALTHY" >> $LOG_FILE
        fi
    done
}

# Run all checks
check_nginx
check_keepalived
check_vip
check_backend

echo "[$DATE] Monitor check completed" >> $LOG_FILE

6. Performance Tuning

6.1 Nginx Performance Tuning

# nginx.conf performance tuning
worker_processes auto;              # match the number of CPU cores
worker_cpu_affinity auto;           # pin workers to cores
worker_rlimit_nofile 65535;         # per-worker file descriptor limit

events {
    worker_connections 65535;       # max connections per worker
    use epoll;                      # epoll event model (Linux)
    multi_accept on;                # accept multiple connections at once
    accept_mutex off;               # let the kernel distribute new connections
}

http {
    # Connection tuning
    keepalive_timeout 75;           # client keep-alive timeout
    keepalive_requests 1000;        # max requests per keep-alive connection
    tcp_nopush on;                  # send headers in one packet
    tcp_nodelay on;                 # disable Nagle's algorithm

    # Buffer tuning
    client_body_buffer_size 128k;   # request body buffer
    client_max_body_size 10m;       # max request body size
    client_header_buffer_size 1k;   # request header buffer
    large_client_header_buffers 4 4k;  # buffers for oversized headers

    # Proxy tuning
    proxy_connect_timeout 5s;       # upstream connect timeout
    proxy_send_timeout 60s;         # send timeout
    proxy_read_timeout 60s;         # read timeout
    proxy_buffering on;             # buffer upstream responses
    proxy_buffer_size 4k;           # buffer for the response header
    proxy_buffers 8 4k;             # buffers for the response body
    proxy_busy_buffers_size 8k;     # buffers busy sending to the client

    # Upstream connection pool
    upstream backend_servers {
        least_conn;
        server 192.168.1.10:8080 weight=3 max_fails=3 fail_timeout=30s;
        server 192.168.1.11:8080 weight=3 max_fails=3 fail_timeout=30s;
        server 192.168.1.12:8080 weight=2 max_fails=3 fail_timeout=30s;

        # Pool settings (remember proxy_http_version 1.1 and an empty
        # Connection header in the proxy location, or these are ignored)
        keepalive 32;               # idle upstream connections to keep
        keepalive_requests 100;     # max requests per upstream connection
        keepalive_timeout 60s;      # idle timeout for pooled connections
    }
}
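
Whether these values help can only be decided by measuring. A quick before/after comparison with ApacheBench against the VIP, assuming the ab tool (httpd-tools package) is installed:

# 10,000 requests at 100 concurrent connections through the VIP
ab -n 10000 -c 100 http://192.168.1.100/

# Compare 'Requests per second' and the latency percentiles between runs,
# and watch /nginx_status for active connection counts while it runs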

6.2 System Tuning

#!/bin/bash
# system_optimization.sh

echo "Applying system optimizations..."

# 1. Kernel parameters
# BBR needs kernel >= 4.9; load the module and pair it with the fq qdisc
modprobe tcp_bbr

cat >> /etc/sysctl.conf << EOF
# Network tuning
net.core.somaxconn = 65535
net.core.netdev_max_backlog = 5000
net.core.default_qdisc = fq
net.ipv4.tcp_max_syn_backlog = 65535
net.ipv4.tcp_fin_timeout = 10
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_sack = 1
net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_congestion_control = bbr

# System-wide file descriptor ceiling
# (do not lower fs.nr_open; its default of 1048576 is already ample)
fs.file-max = 1048576
EOF

# Apply kernel parameters
sysctl -p

# 2. Per-user file descriptor and process limits
cat >> /etc/security/limits.conf << EOF
* soft nofile 65535
* hard nofile 65535
* soft nproc 65535
* hard nproc 65535
EOF

# 3. systemd override for nginx (limits.conf does not apply to services)
mkdir -p /etc/systemd/system/nginx.service.d
cat > /etc/systemd/system/nginx.service.d/override.conf << EOF
[Service]
LimitNOFILE=65535
LimitNPROC=65535
EOF

# 4. Reload systemd units
systemctl daemon-reload

echo "System optimization complete!"

7. Troubleshooting

7.1 Diagnosing Common Issues

#!/bin/bash
# troubleshoot.sh

echo "Starting diagnostics..."

# 1. Service status
echo "=== Service status ==="
systemctl status nginx --no-pager
systemctl status keepalived --no-pager

# 2. Listening ports
echo "=== Listening ports ==="
ss -tlnp | grep -E ":(80|443|8080)"

# 3. Virtual IP
echo "=== Virtual IP ==="
ip addr show | grep "192.168.1.100"

# 4. Keepalived log
echo "=== Keepalived log ==="
journalctl -u keepalived -n 20 --no-pager

# 5. Nginx error log
echo "=== Nginx error log ==="
tail -20 /var/log/nginx/error.log

# 6. Backend services
echo "=== Backend services ==="
for server in 192.168.1.10:8080 192.168.1.11:8080 192.168.1.12:8080; do
    echo "Checking $server..."
    curl -s -f --connect-timeout 3 "http://$server/health" || echo "unreachable"
done

# 7. Network connectivity
echo "=== Network connectivity ==="
ping -c 3 192.168.1.100
ping -c 3 192.168.1.10
ping -c 3 192.168.1.11
ping -c 3 192.168.1.12

echo "Diagnostics complete!"

8. Summary

Combining Nginx and Keepalived gives us a highly available load-balancing tier. Its key properties:

8.1 Technical Strengths

  1. High availability: a failover pair, so a single node failure does not interrupt service
  2. Load balancing: multiple algorithms for distributing requests intelligently
  3. Health checks: continuous service monitoring with automatic failover
  4. Performance: connection pooling and buffer tuning raise throughput

8.2 Deployment Checklist

  1. Configuration consistency: keep the MASTER and BACKUP configurations in step
  2. Network planning: choose the virtual IP and network topology deliberately
  3. Monitoring and alerting: wire up thorough monitoring and notification
  4. Failover drills: test failover regularly (a drill sketch follows this list)
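
A minimal failover drill under the two-node layout from section 3: probe the VIP continuously from a third host, stop keepalived on the MASTER, and confirm that the gap in responses lasts no more than a few advert_int intervals:

#!/bin/bash
# failover_drill.sh - run from a client host, not from the HA pair
VIP="192.168.1.100"

# Continuous probe: expect at most a brief gap while the BACKUP takes over
while true; do
    curl -s -o /dev/null -m 2 -w "$(date '+%T') %{http_code}\n" "http://${VIP}/health"
    sleep 1
done

# Meanwhile, on the MASTER:  systemctl stop keepalived
# Afterwards, on the BACKUP: ip addr show eth0 | grep 192.168.1.100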

This high-availability architecture not only meets the demands of a production environment, it also provides a strong safeguard for stable system operation.