第195集CPU使用率与内存使用率监控实战
|字数总计:7.7k|阅读时长:35分钟|阅读量:
1. 系统资源监控概述
CPU使用率和内存使用率是系统性能监控的核心指标,通过实时监控这些关键资源的使用情况,可以及时发现性能瓶颈、预防系统故障、优化资源配置。本文将详细介绍CPU和内存监控的方法、工具使用以及生产级监控方案。
1.1 系统资源监控的重要性
- 性能保障: 确保系统运行在最佳性能状态
- 故障预防: 提前发现资源瓶颈,预防系统崩溃
- 容量规划: 为系统扩容提供数据支持
- 成本优化: 合理配置资源,降低运营成本
- 用户体验: 保证应用响应速度和稳定性
1.2 核心监控指标
CPU监控指标
- CPU使用率: 处理器使用百分比
- 平均负载: 系统负载平均值(load average)
- 上下文切换: 进程切换频率
- 中断处理: 硬件中断处理情况
- CPU队列: 等待CPU的进程数量
内存监控指标
- 内存使用率: 物理内存使用百分比
- 交换分区: 虚拟内存使用情况
- 缓存命中率: 内存缓存效率
- 内存泄漏: 内存使用异常增长
- 页面交换: 内存页面交换频率
1.3 监控层次
- 硬件层: CPU核心、内存条状态
- 内核层: 内核资源使用情况
- 进程层: 单个进程资源占用
- 应用层: 应用程序性能指标
- 业务层: 业务相关性能指标
2. CPU使用率监控
2.1 CPU监控工具详解
2.1.1 top命令详解
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
| top  # Interactive process viewer with default settings.
top -o %CPU  # Sort the process list by the %CPU field.
top -o %MEM  # Sort the process list by the %MEM field.
top -u username  # Show only processes owned by user "username".
top -d 2  # Refresh the display every 2 seconds.
top -H  # Threads mode: list individual threads instead of whole processes.
top -b -n 1 > cpu_info.txt  # Batch mode, a single iteration, saved to cpu_info.txt.
|
2.1.2 htop命令详解
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
# Install htop with the package manager that matches the distribution.
# (These were two alternative commands; merged on one line, yum would be asked
# to install packages named "apt" and "install".)
yum install htop  # RHEL / CentOS
apt install htop  # Debian / Ubuntu
htop  # Interactive process viewer with default settings.
htop -s PERCENT_CPU  # Sort by the PERCENT_CPU column.
htop -s PERCENT_MEM  # Sort by the PERCENT_MEM column.
htop -u username  # Show only processes owned by user "username".
htop -d 5  # Update delay; htop counts this in tenths of a second (5 = 0.5 s).
# NOTE(review): in htop 3.x -H is --highlight-changes, not a thread toggle as
# in top; confirm against the installed htop version.
htop -H
|
2.1.3 iostat命令详解
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# iostat ships in the sysstat package; install it with the matching package
# manager. (These were two alternative commands; merged on one line, yum would
# be asked to install packages named "apt" and "install".)
yum install sysstat  # RHEL / CentOS
apt install sysstat  # Debian / Ubuntu
iostat -c  # CPU utilisation report only.
iostat -c -x  # CPU report with the extended-statistics flag set.
iostat -c 2 5  # CPU report every 2 seconds, 5 reports in total.
iostat -c -x 1  # CPU report every second until interrupted.
|
2.1.4 vmstat命令详解
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| vmstat  # One report of averages since boot.
vmstat 2 5  # Sample every 2 seconds, 5 reports in total.
vmstat -s  # Table of event counters and memory statistics.
vmstat -m  # Slab allocator information.
vmstat -d  # Per-disk statistics.
|
2.2 CPU监控脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
| #!/bin/bash
# File that log_message appends to.
LOG_FILE="/var/log/cpu_monitor.log"
# CPU usage percentage above which check_cpu_alert logs a warning.
ALERT_CPU_THRESHOLD=80
# Seconds main sleeps between two checks.
CHECK_INTERVAL=60
#######################################
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the log file
# Arguments: $1 - message text
# Outputs:   "YYYY-mm-dd HH:MM:SS - <message>" appended to LOG_FILE
#######################################
log_message() {
  # LOG_FILE is quoted so paths containing spaces work; printf avoids echo's
  # option/backslash handling on arbitrary message text.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG_FILE"
}
#######################################
# Print the overall CPU utilisation reported by one batch iteration of top.
# The value is 100 minus the idle percentage, so it covers user, system,
# iowait, etc. (the previous parse returned the user-mode share only and broke
# when top printed "%Cpu(s):100.0 us" without a separating space).
# Outputs:   percentage with one decimal, e.g. "8.3"; nothing if top's
#            Cpu(s) line cannot be parsed
#######################################
get_cpu_usage() {
  # LC_ALL=C keeps the decimal separator a dot. The pattern accepts both
  # "91.7 id" (current procps-ng) and "91.7%id" (older procps).
  LC_ALL=C top -bn1 | LC_ALL=C awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9.]+[ %]*id/)) {
        idle = substr($0, RSTART, RLENGTH)
        sub(/[ %]*id$/, "", idle)
        printf "%.1f\n", 100 - idle
      }
      exit
    }'
}
#######################################
# Print the 1, 5 and 15 minute load averages.
# Outputs:   "<1min>,<5min>,<15min>" on one line, e.g. "0.10,0.20,0.30"
#######################################
get_cpu_load() {
  # A single uptime call keeps the three values from the same sample.
  # LC_ALL=C pins the "load average:" label and the dot decimal separator, so
  # splitting on commas is safe.
  LC_ALL=C uptime | awk -F'load average:[ ]*' 'NF > 1 { gsub(/[ ]/, "", $2); print $2 }'
}
#######################################
# Print the number of logical CPUs listed in cpuinfo.
# Arguments: $1 - cpuinfo file to read (optional, default /proc/cpuinfo)
# Outputs:   the count on stdout
#######################################
get_cpu_cores() {
  local cpuinfo_file=${1:-/proc/cpuinfo}
  # Anchor the match: an unanchored "processor" also matches model names such
  # as "Common KVM processor" and would inflate the count.
  grep -c '^processor' "$cpuinfo_file"
}
#######################################
# Log a warning plus the top CPU consumers when usage exceeds the threshold.
# Globals:   ALERT_CPU_THRESHOLD (read)
# Calls:     get_cpu_usage, log_message
# Outputs:   log lines via log_message; nothing when usage is at or below the
#            threshold or could not be determined
#######################################
check_cpu_alert() {
  local cpu_usage line
  cpu_usage=$(get_cpu_usage)
  # awk does the float comparison (no dependency on bc); an empty reading
  # evaluates as 0 and therefore never alerts.
  if awk -v usage="$cpu_usage" -v limit="$ALERT_CPU_THRESHOLD" \
      'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    log_message "WARNING: CPU使用率过高: ${cpu_usage}%"
    log_message "CPU使用率最高的进程:"
    # IFS= and -r keep each ps line intact (leading blanks, backslashes).
    while IFS= read -r line; do
      log_message " $line"
    done < <(ps aux --sort=-%cpu | head -10)
  fi
}
#######################################
# Log a warning when the 1-minute load average exceeds twice the core count.
# Calls:     get_cpu_load, get_cpu_cores, log_message
# Outputs:   one warning via log_message when the limit is exceeded
#######################################
check_load_alert() {
  local load_1min cpu_cores load_threshold
  load_1min=$(get_cpu_load | cut -d',' -f1)
  cpu_cores=$(get_cpu_cores)
  # Without a positive core count the threshold would be 0 and every sample
  # would alert, so skip the check instead.
  if ! [[ $cpu_cores =~ ^[1-9][0-9]*$ ]]; then
    return 0
  fi
  load_threshold=$(( cpu_cores * 2 ))
  if awk -v load="$load_1min" -v limit="$load_threshold" \
      'BEGIN { exit !(load + 0 > limit + 0) }'; then
    log_message "WARNING: CPU负载过高: ${load_1min} (CPU核心数: ${cpu_cores})"
  fi
}
#######################################
# Print a human-readable CPU summary: model, cores, frequency, cache,
# current usage, load averages and the top CPU consumers.
# Calls:     get_cpu_cores, get_cpu_usage, get_cpu_load
# Outputs:   the report on stdout
#######################################
get_cpu_details() {
  local load_1min load_5min load_15min
  # Strip only the "key : " prefix; deleting every space (as before) turned
  # "Intel(R) Core(TM) i7" into "Intel(R)Core(TM)i7".
  local field_awk='index($0, key) == 1 { sub(/^[^:]*:[ \t]*/, ""); print; exit }'

  echo "=== CPU详细信息 ==="
  echo "CPU型号: $(awk -v key="model name" "$field_awk" /proc/cpuinfo)"
  echo "CPU核心数: $(get_cpu_cores)"
  echo "CPU频率: $(awk -v key="cpu MHz" "$field_awk" /proc/cpuinfo)"
  echo "CPU缓存: $(awk -v key="cache size" "$field_awk" /proc/cpuinfo)"

  echo -e "\n=== CPU使用率统计 ==="
  echo "当前CPU使用率: $(get_cpu_usage)%"

  echo -e "\n=== CPU负载统计 ==="
  IFS=, read -r load_1min load_5min load_15min <<< "$(get_cpu_load)"
  echo "1分钟负载: ${load_1min}"
  echo "5分钟负载: ${load_5min}"
  echo "15分钟负载: ${load_15min}"

  echo -e "\n=== CPU进程统计 ==="
  ps aux --sort=-%cpu | head -10
}
#######################################
# Monitor loop: run both alert checks, log one status line, sleep, repeat.
# Never returns; stop it by terminating the process.
# Globals:   CHECK_INTERVAL (read) - seconds between iterations
# Calls:     check_cpu_alert, check_load_alert, get_cpu_usage, get_cpu_load,
#            log_message
#######################################
main() {
  local cpu_usage load_1min
  log_message "CPU监控启动"
  while true; do
    check_cpu_alert
    check_load_alert
    cpu_usage=$(get_cpu_usage)
    load_1min=$(get_cpu_load | cut -d',' -f1)
    log_message "CPU使用率: ${cpu_usage}%, 负载: ${load_1min}"
    sleep "$CHECK_INTERVAL"
  done
}
main  # Script entry point: start the CPU monitor loop defined in main().
|
2.3 CPU性能分析
2.3.1 CPU热点分析
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
| #!/bin/bash
# CPU hotspot report: top processes and threads, CPU time split, interrupts
# and a short vmstat sample. Each title and its command sit on separate lines;
# merged on one line, echo printed the command text instead of running it.
echo "=== CPU热点分析 ==="

echo "1. CPU使用率最高的10个进程:"
ps aux --sort=-%cpu | head -11  # 11 = header line + 10 processes

echo -e "\n2. CPU使用率最高的10个线程:"
ps -eLf --sort=-%cpu | head -11

echo -e "\n3. CPU使用模式分析:"
# Take one iostat sample and keep only the value row that follows the
# "avg-cpu" header; filtering with grep -v let the banner and blank lines
# through. Columns: %user %nice %system %iowait %steal %idle.
cpu_sample=$(LC_ALL=C iostat -c 1 1 | awk '/avg-cpu/ { getline; print; exit }')
read -r user_pct _ system_pct iowait_pct _ <<< "$cpu_sample"
echo "用户态CPU使用率:"
echo "用户态: ${user_pct}%"
echo "系统态CPU使用率:"
echo "系统态: ${system_pct}%"
echo "等待I/O的CPU使用率:"
echo "I/O等待: ${iowait_pct}%"

echo -e "\n4. CPU中断分析:"
head -20 /proc/interrupts

echo -e "\n5. CPU上下文切换分析:"
vmstat 1 5 | tail -4
|
2.3.2 CPU瓶颈诊断
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
| #!/bin/bash
# CPU bottleneck checks: usage, load versus core count, run/blocked queues,
# context switches and the interrupt total.
echo "=== CPU瓶颈诊断 ==="

echo "1. CPU使用率检查:"
# Usage is 100 - idle from top's summary line (covers user, system, iowait).
cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
  /Cpu\(s\)/ {
    if (match($0, /[0-9.]+[ %]*id/)) {
      idle = substr($0, RSTART, RLENGTH)
      sub(/[ %]*id$/, "", idle)
      printf "%.1f\n", 100 - idle
    }
    exit
  }')
echo "当前CPU使用率: ${cpu_usage}%"
if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
  echo "WARNING: CPU使用率过高"
fi

echo -e "\n2. CPU负载检查:"
load_1min=$(LC_ALL=C uptime \
  | awk -F'load average:[ ]*' 'NF > 1 { split($2, parts, /,[ ]*/); print parts[1] }')
# Anchored so model names containing "processor" are not counted.
cpu_cores=$(grep -c '^processor' /proc/cpuinfo)
echo "1分钟负载: $load_1min"
echo "CPU核心数: $cpu_cores"
if awk -v load="$load_1min" -v cores="$cpu_cores" 'BEGIN { exit !(load + 0 > cores + 0) }'; then
  echo "WARNING: CPU负载过高"
fi

# One vmstat sample feeds checks 3 and 4 (columns: r=1, b=2, cs=12).
vmstat_row=$(vmstat 1 1 | tail -1)

echo -e "\n3. CPU队列检查:"
awk '{ print "运行队列长度: " $1; print "阻塞队列长度: " $2 }' <<< "$vmstat_row"

echo -e "\n4. 上下文切换检查:"
awk '{ print "上下文切换次数: " $12 }' <<< "$vmstat_row"

echo -e "\n5. 中断处理检查:"
awk '/^intr / { print "中断总数: " $2 }' /proc/stat
|
3. 内存使用率监控
3.1 内存监控工具详解
3.1.1 free命令详解
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| free  # Memory and swap totals in the default unit.
free -m  # Values in mebibytes.
free -g  # Values in gibibytes.
free -h  # Human-readable units chosen per value.
watch -n 1 free -h  # Re-run "free -h" every second.
|
3.1.2 top命令内存监控
1 2 3 4 5 6 7 8
top -o %MEM  # Sort the process list by the %MEM field.
# Current procps-ng prints "MiB Mem :" (space before the colon) while older
# releases print "Mem:"; match both so the summary line is always found.
top -n 1 -b | grep -E "Mem ?:"
top -n 1 -b | grep "Swap:"  # Swap summary line of one batch iteration.
|
3.1.3 vmstat命令内存监控
1 2 3 4 5 6 7 8 9 10 11
| vmstat -s  # Table of event counters and memory statistics.
vmstat  # One report of averages since boot.
vmstat 2  # New report every 2 seconds until interrupted.
vmstat -m  # Slab allocator information.
|
3.2 内存监控脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
| #!/bin/bash
# File that log_message appends to.
LOG_FILE="/var/log/memory_monitor.log"
# Memory usage percentage above which check_memory_alert logs a warning.
ALERT_MEMORY_THRESHOLD=80
# Seconds main sleeps between two checks.
CHECK_INTERVAL=60
#######################################
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the log file
# Arguments: $1 - message text
# Outputs:   "YYYY-mm-dd HH:MM:SS - <message>" appended to LOG_FILE
#######################################
log_message() {
  # Quoted path and printf keep arbitrary paths and message text intact.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG_FILE"
}
#######################################
# Print physical memory usage (used / total from free) as a percentage.
# Outputs:   percentage with two decimals, e.g. "25.00"; "0" if free reports
#            a zero total
#######################################
get_memory_usage() {
  # LC_ALL=C: localized free output renames the "Mem:" row, which made the
  # old grep return nothing. awk replaces the bc pipeline and guards the
  # division.
  LC_ALL=C free | LC_ALL=C awk '
    /^Mem:/ {
      if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
      else print "0"
      exit
    }'
}
#######################################
# Print swap usage (used / total from free) as a percentage.
# Outputs:   percentage with two decimals, e.g. "25.00"; "0" when no swap is
#            configured (total is 0)
#######################################
get_swap_usage() {
  # The zero-total case is handled inside awk, so an empty or missing value
  # can no longer break an unquoted [ ... -eq 0 ] test.
  LC_ALL=C free | LC_ALL=C awk '
    /^Swap:/ {
      if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
      else print "0"
      exit
    }'
}
#######################################
# Print total, used, available and cached memory plus swap usage, in the
# human-readable units produced by "free -h".
# Outputs:   the summary on stdout
#######################################
get_memory_details() {
  local label col2 col3 col4 col5 col6 col7
  local total_mem used_mem cached_mem available_mem swap_total swap_used
  # One free call gives a consistent snapshot (previously six separate calls).
  # Mem row columns: total used free shared buff/cache available.
  while read -r label col2 col3 col4 col5 col6 col7; do
    case "$label" in
      Mem:)
        total_mem=$col2
        used_mem=$col3
        cached_mem=$col6
        available_mem=$col7
        ;;
      Swap:)
        swap_total=$col2
        swap_used=$col3
        ;;
    esac
  done < <(LC_ALL=C free -h)

  echo "=== 内存详细信息 ==="
  echo "总内存: $total_mem"
  echo "已使用: $used_mem"
  echo "可用内存: $available_mem"
  echo "缓存: $cached_mem"
  echo "交换分区: $swap_used / $swap_total"
}
#######################################
# Log a warning plus the top memory consumers when usage exceeds the
# threshold.
# Globals:   ALERT_MEMORY_THRESHOLD (read)
# Calls:     get_memory_usage, log_message
# Outputs:   log lines via log_message; nothing at or below the threshold
#######################################
check_memory_alert() {
  local memory_usage line
  memory_usage=$(get_memory_usage)
  # awk compares the floats; an empty reading counts as 0 and never alerts.
  if awk -v usage="$memory_usage" -v limit="$ALERT_MEMORY_THRESHOLD" \
      'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    log_message "WARNING: 内存使用率过高: ${memory_usage}%"
    log_message "内存使用率最高的进程:"
    while IFS= read -r line; do
      log_message " $line"
    done < <(ps aux --sort=-%mem | head -10)
  fi
}
#######################################
# Log a warning when swap usage exceeds 10 percent.
# Calls:     get_swap_usage, log_message
#######################################
check_swap_alert() {
  local swap_usage
  swap_usage=$(get_swap_usage)
  if awk -v usage="$swap_usage" 'BEGIN { exit !(usage + 0 > 10) }'; then
    log_message "WARNING: 交换分区使用率过高: ${swap_usage}%"
  fi
}
#######################################
# Log a reminder to review memory trends once the system has been up for more
# than 24 hours. This is an uptime-based hint only; it does not measure any
# process's memory growth.
# Arguments: $1 - uptime file to read (optional, default /proc/uptime)
# Calls:     log_message
# Returns:   non-zero if the uptime file cannot be read
#######################################
check_memory_leak() {
  local uptime_file=${1:-/proc/uptime}
  local uptime_hours
  # The first field of the uptime file is seconds since boot.
  uptime_hours=$(LC_ALL=C awk '{ printf "%.2f", $1 / 3600; exit }' "$uptime_file") || return 1
  if awk -v hours="$uptime_hours" 'BEGIN { exit !(hours + 0 > 24) }'; then
    log_message "系统运行时间: ${uptime_hours}小时,建议检查内存使用趋势"
  fi
}
#######################################
# Print memory and swap usage, swap-in/swap-out rates from vmstat and the top
# memory consumers.
# Calls:     get_memory_usage, get_swap_usage
# Outputs:   the report on stdout
#######################################
get_memory_stats() {
  echo "=== 内存使用统计 ==="
  echo "内存使用率: $(get_memory_usage)%"
  echo "交换分区使用率: $(get_swap_usage)%"

  echo -e "\n=== 页面交换统计 ==="
  # One vmstat sample supplies both values (columns: si=7, so=8).
  vmstat 1 1 | tail -1 | awk '{ print "页面换入: " $7; print "页面换出: " $8 }'

  echo -e "\n=== 内存使用最高的进程 ==="
  ps aux --sort=-%mem | head -10
}
#######################################
# Monitor loop: run the memory, swap and uptime checks, log one status line,
# sleep, repeat. Never returns; stop it by terminating the process.
# Globals:   CHECK_INTERVAL (read) - seconds between iterations
# Calls:     check_memory_alert, check_swap_alert, check_memory_leak,
#            get_memory_usage, get_swap_usage, log_message
#######################################
main() {
  local memory_usage swap_usage
  log_message "内存监控启动"
  while true; do
    check_memory_alert
    check_swap_alert
    check_memory_leak
    memory_usage=$(get_memory_usage)
    swap_usage=$(get_swap_usage)
    log_message "内存使用率: ${memory_usage}%, 交换分区使用率: ${swap_usage}%"
    sleep "$CHECK_INTERVAL"
  done
}
main  # Script entry point: start the memory monitor loop defined in main().
|
3.3 内存性能分析
3.3.1 内存使用分析
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| #!/bin/bash
# Memory usage report: free summary, top consumers, /proc/meminfo excerpts
# and swap activity. Titles and commands are on separate lines; merged on one
# line, echo printed the command text instead of running it.
echo "=== 内存使用分析 ==="

echo "1. 内存使用概况:"
free -h

echo -e "\n2. 内存使用率最高的10个进程:"
ps aux --sort=-%mem | head -11  # 11 = header line + 10 processes

echo -e "\n3. 内存使用详情:"
head -20 /proc/meminfo

echo -e "\n4. 内存映射分析:"
grep -E "(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree)" /proc/meminfo

echo -e "\n5. 页面交换分析:"
# vmstat columns: si=7 (swapped in), so=8 (swapped out).
vmstat 1 1 | tail -1 | awk '{print "页面换入: " $7 ", 页面换出: " $8}'
|
3.3.2 内存泄漏检测
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
| #!/bin/bash
# Snapshot for leak investigation: current memory, top consumers, meminfo
# excerpts, swap usage and buddy-allocator fragmentation data. It prints one
# point-in-time snapshot; run it repeatedly to see a trend.
echo "=== 内存泄漏检测 ==="

echo "1. 内存使用趋势检查:"
echo "当前内存使用:"
# LC_ALL=C keeps the "Mem:" / "Swap:" row labels stable across locales.
LC_ALL=C free -h | grep "Mem:"

echo -e "\n2. 内存使用率最高的进程:"
ps aux --sort=-%mem | head -10

echo -e "\n3. 内存映射检查:"
grep -E "(MemTotal|MemFree|MemAvailable|Buffers|Cached)" /proc/meminfo

echo -e "\n4. 交换分区使用检查:"
LC_ALL=C free -h | grep "Swap:"

echo -e "\n5. 内存碎片检查:"
cat /proc/buddyinfo
|
4. 综合监控方案
4.1 系统资源综合监控
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
| #!/bin/bash
# File that log_message appends to.
LOG_FILE="/var/log/system_resource.log"
# CPU usage percentage above which check_resource_alerts logs a warning.
ALERT_CPU_THRESHOLD=80
# Memory usage percentage above which check_resource_alerts logs a warning.
ALERT_MEMORY_THRESHOLD=80
# Seconds main sleeps between two checks.
CHECK_INTERVAL=30
#######################################
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the log file
# Arguments: $1 - message text
# Outputs:   "YYYY-mm-dd HH:MM:SS - <message>" appended to LOG_FILE
#######################################
log_message() {
  # Quoted path and printf keep arbitrary paths and message text intact.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG_FILE"
}
#######################################
# Sample CPU, memory, root-filesystem usage and the 1-minute load average.
# Outputs:   "<cpu%>,<mem%>,<disk%>,<load1>" on one line,
#            e.g. "10.0,25.00,42,0.50"
#######################################
get_system_resources() {
  local cpu_usage memory_usage disk_usage load_1min

  # CPU usage = 100 - idle from top's summary line (the old parse returned
  # the user-mode share only).
  cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9.]+[ %]*id/)) {
        idle = substr($0, RSTART, RLENGTH)
        sub(/[ %]*id$/, "", idle)
        printf "%.1f\n", 100 - idle
      }
      exit
    }')

  # LC_ALL=C keeps the "Mem:" label and dot decimals regardless of locale.
  memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
    /^Mem:/ {
      if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
      else print "0"
      exit
    }')

  # -P forces one line per filesystem, so column 5 is always the percentage
  # even when the device name is long.
  disk_usage=$(LC_ALL=C df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

  load_1min=$(LC_ALL=C uptime \
    | awk -F'load average:[ ]*' 'NF > 1 { split($2, parts, /,[ ]*/); print parts[1] }')

  printf '%s,%s,%s,%s\n' "$cpu_usage" "$memory_usage" "$disk_usage" "$load_1min"
}
#######################################
# Take one resource sample and log a warning for every exceeded limit:
# CPU and memory against their thresholds, disk above 80 percent, and the
# 1-minute load above twice the core count.
# Globals:   ALERT_CPU_THRESHOLD, ALERT_MEMORY_THRESHOLD (read)
# Calls:     get_system_resources, log_message
#######################################
check_resource_alerts() {
  local cpu_usage memory_usage disk_usage load_1min cpu_cores load_threshold
  # Float comparison via awk (no bc needed); empty values compare as 0.
  local gt_awk='BEGIN { exit !(value + 0 > limit + 0) }'

  IFS=, read -r cpu_usage memory_usage disk_usage load_1min <<< "$(get_system_resources)"

  if awk -v value="$cpu_usage" -v limit="$ALERT_CPU_THRESHOLD" "$gt_awk"; then
    log_message "WARNING: CPU使用率过高: ${cpu_usage}%"
  fi
  if awk -v value="$memory_usage" -v limit="$ALERT_MEMORY_THRESHOLD" "$gt_awk"; then
    log_message "WARNING: 内存使用率过高: ${memory_usage}%"
  fi
  # Only compare when df produced an integer; an empty value used to make the
  # unquoted [ ... -gt 80 ] test fail with a syntax error.
  if [[ $disk_usage =~ ^[0-9]+$ ]] && (( disk_usage > 80 )); then
    log_message "WARNING: 磁盘使用率过高: ${disk_usage}%"
  fi

  # Anchored match so model names containing "processor" are not counted;
  # fall back to 1 core if cpuinfo is unreadable or lists none.
  cpu_cores=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null) || cpu_cores=1
  load_threshold=$(( cpu_cores * 2 ))
  if awk -v value="$load_1min" -v limit="$load_threshold" "$gt_awk"; then
    log_message "WARNING: 系统负载过高: ${load_1min} (CPU核心数: ${cpu_cores})"
  fi
}
#######################################
# Print a one-shot resource report: headline percentages followed by CPU,
# memory, disk and load details.
# Calls:     get_system_resources
# Outputs:   the report on stdout
#######################################
generate_resource_report() {
  local cpu_usage memory_usage disk_usage load_1min

  echo "=== 系统资源使用报告 ==="
  echo "生成时间: $(date)"

  IFS=, read -r cpu_usage memory_usage disk_usage load_1min <<< "$(get_system_resources)"
  echo "CPU使用率: ${cpu_usage}%"
  echo "内存使用率: ${memory_usage}%"
  echo "磁盘使用率: ${disk_usage}%"
  echo "系统负载: ${load_1min}"

  echo -e "\n=== 详细资源信息 ==="
  echo "CPU信息:"
  echo " 核心数: $(grep -c '^processor' /proc/cpuinfo)"
  # Strip only the "model name : " prefix; deleting every space would glue the
  # words of the model name together.
  echo " 型号: $(awk '/^model name/ { sub(/^[^:]*:[ \t]*/, ""); print; exit }' /proc/cpuinfo)"
  echo "内存信息:"
  LC_ALL=C free -h | awk '/^Mem:/ {print " 总内存: " $2 ", 已使用: " $3 ", 可用: " $7}'
  echo "磁盘信息:"
  # -P keeps each filesystem on one line so the column numbers are stable.
  LC_ALL=C df -hP / | awk 'NR == 2 {print " 总容量: " $2 ", 已使用: " $3 ", 可用: " $4 ", 使用率: " $5}'
  echo "系统负载:"
  LC_ALL=C uptime | awk -F'load average:' '{print " " $2}'
}
#######################################
# Monitor loop: run the alert checks, log one status line, sleep, repeat.
# Never returns; stop it by terminating the process.
# Globals:   CHECK_INTERVAL (read) - seconds between iterations
# Calls:     check_resource_alerts, get_system_resources, log_message
#######################################
main() {
  local cpu_usage memory_usage disk_usage load_1min
  log_message "系统资源监控启动"
  while true; do
    check_resource_alerts
    IFS=, read -r cpu_usage memory_usage disk_usage load_1min <<< "$(get_system_resources)"
    log_message "资源使用情况 - CPU: ${cpu_usage}%, 内存: ${memory_usage}%, 磁盘: ${disk_usage}%, 负载: ${load_1min}"
    sleep "$CHECK_INTERVAL"
  done
}
main  # Script entry point: start the resource monitor loop defined in main().
|
4.2 实时监控仪表板
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
| #!/bin/bash
# Clear the terminal before the dashboard is redrawn.
# (The one-line form "{ clear }" lacked a terminator before "}" and did not
# parse.)
clear_screen() {
  clear
}
#######################################
# Redraw the dashboard once: headline CPU / memory / disk / load figures, the
# top five CPU and memory consumers, and static hardware details.
# Calls:     clear_screen
# Outputs:   the dashboard on stdout
#######################################
show_dashboard() {
  local cpu_usage memory_usage disk_usage load_info
  local rule="=========================================="

  clear_screen
  echo "$rule"
  echo " 系统资源监控仪表板"
  echo "$rule"
  echo "更新时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo ""

  # CPU usage = 100 - idle from top's summary line (the old parse showed the
  # user-mode share only).
  cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9.]+[ %]*id/)) {
        idle = substr($0, RSTART, RLENGTH)
        sub(/[ %]*id$/, "", idle)
        printf "%.1f\n", 100 - idle
      }
      exit
    }')
  echo "CPU使用率: ${cpu_usage}%"

  # LC_ALL=C keeps the "Mem:" label and dot decimals regardless of locale.
  memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
    /^Mem:/ {
      if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
      else print "0"
      exit
    }')
  echo "内存使用率: ${memory_usage}%"

  # -P keeps each filesystem on one line so column 5 is the percentage.
  disk_usage=$(LC_ALL=C df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  echo "磁盘使用率: ${disk_usage}%"

  load_info=$(LC_ALL=C uptime | awk -F'load average:[ ]*' 'NF > 1 { print $2 }')
  echo "系统负载: ${load_info}"
  echo ""

  echo "$rule"
  echo " CPU使用率最高的进程"
  echo "$rule"
  ps aux --sort=-%cpu | head -6  # header line + 5 processes
  echo ""

  echo "$rule"
  echo " 内存使用率最高的进程"
  echo "$rule"
  ps aux --sort=-%mem | head -6
  echo ""

  echo "$rule"
  echo " 系统资源详细信息"
  echo "$rule"
  echo "CPU信息:"
  echo " 核心数: $(grep -c '^processor' /proc/cpuinfo)"
  # Strip only the "model name : " prefix so the name keeps its spaces.
  echo " 型号: $(awk '/^model name/ { sub(/^[^:]*:[ \t]*/, ""); print; exit }' /proc/cpuinfo)"
  echo "内存信息:"
  LC_ALL=C free -h | awk '/^Mem:/ {print " 总内存: " $2 ", 已使用: " $3 ", 可用: " $7}'
  echo "磁盘信息:"
  LC_ALL=C df -hP / | awk 'NR == 2 {print " 总容量: " $2 ", 已使用: " $3 ", 可用: " $4 ", 使用率: " $5}'
  echo ""
  echo "按 Ctrl+C 退出监控"
}
# Redraw the dashboard every 5 seconds until the process is interrupted.
# Calls: show_dashboard
main() {
  while true; do
    show_dashboard
    sleep 5
  done
}
main  # Script entry point: start the dashboard refresh loop defined in main().
|
5. 性能优化策略
5.1 CPU性能优化
5.1.1 CPU优化脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
| #!/bin/bash
# CPU optimisation helper: when overall usage is above 80 percent, print the
# data needed to find the cause plus generic advice. It only reports; it does
# not change any system setting.
echo "=== CPU性能优化 ==="

# Usage is 100 - idle from top's summary line (covers user, system, iowait).
cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
  /Cpu\(s\)/ {
    if (match($0, /[0-9.]+[ %]*id/)) {
      idle = substr($0, RSTART, RLENGTH)
      sub(/[ %]*id$/, "", idle)
      printf "%.1f\n", 100 - idle
    }
    exit
  }')
echo "当前CPU使用率: ${cpu_usage}%"

if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
  echo "CPU使用率过高,开始优化..."

  echo "1. 检查CPU使用率最高的进程:"
  ps aux --sort=-%cpu | head -10

  echo -e "\n2. 检查系统负载:"
  uptime

  # One vmstat sample feeds steps 3 and 4 (columns: r=1, b=2, cs=12).
  vmstat_row=$(vmstat 1 1 | tail -1)

  echo -e "\n3. 检查CPU队列:"
  awk '{print "运行队列: " $1 ", 阻塞队列: " $2}' <<< "$vmstat_row"

  echo -e "\n4. 检查上下文切换:"
  awk '{print "上下文切换: " $12}' <<< "$vmstat_row"

  echo -e "\n5. 优化建议:"
  echo " - 检查是否有CPU密集型进程"
  echo " - 考虑增加CPU核心数"
  echo " - 优化应用程序性能"
  echo " - 检查是否有死循环或无限递归"
else
  echo "CPU使用率正常"
fi
|
5.1.2 CPU调优参数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
| #!/bin/bash
# CPU tuning helper.
#
# sched_rt_period_us and sched_rt_runtime_us only control real-time
# throttling: how many microseconds of each period SCHED_FIFO/SCHED_RR tasks
# may consume. They do not select the scheduler, CPU affinity, interrupt
# priority or cache behaviour. The earlier revision wrote "CFS", "0" and "1"
# into them: the first two are rejected by the kernel and the last would
# starve real-time tasks. This version labels the values correctly, writes the
# documented kernel defaults, and points to the right tools for the rest.
echo "=== CPU调优参数 ==="

rt_runtime_file=/proc/sys/kernel/sched_rt_runtime_us
rt_period_file=/proc/sys/kernel/sched_rt_period_us

echo "1. 当前CPU调优参数:"
echo "实时任务运行时间上限 (sched_rt_runtime_us): $(cat "$rt_runtime_file")"
echo "实时任务调度周期 (sched_rt_period_us): $(cat "$rt_period_file")"

echo -e "\n2. CPU调度优化:"
# Period first: the kernel rejects a runtime that is larger than the period.
echo "设置 sched_rt_period_us 为内核默认值 1000000"
echo 1000000 > "$rt_period_file" \
  || echo "ERROR: 写入 ${rt_period_file} 失败 (可能需要root权限)" >&2
echo "设置 sched_rt_runtime_us 为内核默认值 950000"
echo 950000 > "$rt_runtime_file" \
  || echo "ERROR: 写入 ${rt_runtime_file} 失败 (可能需要root权限)" >&2

echo -e "\n3. CPU亲和性优化:"
echo "进程CPU亲和性请使用 taskset 设置, 例如: taskset -cp 0-3 <PID>"

echo -e "\n4. 中断处理优化:"
echo "中断亲和性请通过 /proc/irq/<IRQ>/smp_affinity 设置"

echo -e "\n5. CPU缓存优化:"
echo "sched_rt_* 参数与CPU缓存无关, 此步骤不修改任何内核参数"
|
5.2 内存性能优化
5.2.1 内存优化脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
| #!/bin/bash
# Memory optimisation helper: when usage is above 80 percent, print the data
# needed to find the cause plus generic advice. It only reports; it does not
# change any system setting.
echo "=== 内存性能优化 ==="

# LC_ALL=C keeps the "Mem:" label and dot decimals regardless of locale; awk
# replaces the bc pipeline and guards against a zero total.
memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
  /^Mem:/ {
    if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
    else print "0"
    exit
  }')
echo "当前内存使用率: ${memory_usage}%"

if awk -v usage="$memory_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
  echo "内存使用率过高,开始优化..."

  echo "1. 检查内存使用率最高的进程:"
  ps aux --sort=-%mem | head -10

  echo -e "\n2. 检查内存使用详情:"
  head -20 /proc/meminfo

  echo -e "\n3. 检查交换分区使用:"
  LC_ALL=C free -h | grep "Swap:"

  echo -e "\n4. 检查页面交换:"
  # vmstat columns: si=7 (swapped in), so=8 (swapped out).
  vmstat 1 1 | tail -1 | awk '{print "页面换入: " $7 ", 页面换出: " $8}'

  echo -e "\n5. 优化建议:"
  echo " - 检查是否有内存泄漏"
  echo " - 考虑增加内存容量"
  echo " - 优化应用程序内存使用"
  echo " - 检查是否有内存碎片"
else
  echo "内存使用率正常"
fi
|
5.2.2 内存调优参数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
| #!/bin/bash
# Memory tuning helper: show the current vm.* settings, then apply new ones.
# Each title and its write sit on separate lines; merged on one line, the
# title text itself was written into the /proc/sys file.
echo "=== 内存调优参数 ==="

# Write one value to /proc/sys/vm/<name> and report a failed write on stderr.
#   $1 - parameter name under /proc/sys/vm
#   $2 - value to write
set_vm_param() {
  local name=$1 value=$2
  if ! echo "$value" > "/proc/sys/vm/${name}"; then
    echo "ERROR: 写入 /proc/sys/vm/${name} 失败 (可能需要root权限)" >&2
    return 1
  fi
}

echo "1. 当前内存调优参数:"
echo "内存过度分配: $(cat /proc/sys/vm/overcommit_memory)"
echo "内存交换策略: $(cat /proc/sys/vm/swappiness)"
echo "内存回收策略: $(cat /proc/sys/vm/dirty_ratio)"

echo -e "\n2. 内存过度分配优化:"
echo "设置内存过度分配策略"
# NOTE(review): mode 1 means "always overcommit"; confirm this suits the
# workload before applying it to a production host.
set_vm_param overcommit_memory 1

echo -e "\n3. 内存交换优化:"
echo "设置内存交换策略"
set_vm_param swappiness 10

echo -e "\n4. 内存回收优化:"
echo "设置内存回收策略"
set_vm_param dirty_ratio 15

echo -e "\n5. 内存缓存优化:"
echo "设置内存缓存策略"
set_vm_param dirty_background_ratio 5
|
6. 故障诊断与处理
6.1 CPU故障诊断
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
| #!/bin/bash
# CPU fault diagnosis: when overall usage is above 90 percent, print the top
# consumers, load, queues, context switches, interrupts and generic advice.
echo "=== CPU故障诊断 ==="

# Usage is 100 - idle from top's summary line (covers user, system, iowait).
cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
  /Cpu\(s\)/ {
    if (match($0, /[0-9.]+[ %]*id/)) {
      idle = substr($0, RSTART, RLENGTH)
      sub(/[ %]*id$/, "", idle)
      printf "%.1f\n", 100 - idle
    }
    exit
  }')
echo "1. CPU使用率: ${cpu_usage}%"

if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 90) }'; then
  echo "WARNING: CPU使用率过高"

  echo "2. CPU使用率最高的进程:"
  ps aux --sort=-%cpu | head -10

  echo -e "\n3. 系统负载:"
  uptime

  # One vmstat sample feeds steps 4 and 5 (columns: r=1, b=2, cs=12).
  vmstat_row=$(vmstat 1 1 | tail -1)

  echo -e "\n4. CPU队列:"
  awk '{print "运行队列: " $1 ", 阻塞队列: " $2}' <<< "$vmstat_row"

  echo -e "\n5. 上下文切换:"
  awk '{print "上下文切换: " $12}' <<< "$vmstat_row"

  echo -e "\n6. 中断处理:"
  head -10 /proc/interrupts

  echo -e "\n7. 故障处理建议:"
  echo " - 检查是否有CPU密集型进程"
  echo " - 考虑增加CPU核心数"
  echo " - 优化应用程序性能"
  echo " - 检查是否有死循环或无限递归"
  echo " - 考虑使用负载均衡"
else
  echo "CPU使用率正常"
fi
|
6.2 内存故障诊断
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
| #!/bin/bash
# Memory fault diagnosis: when usage is above 90 percent, print the top
# consumers, meminfo, swap usage, swap activity and generic advice.
echo "=== 内存故障诊断 ==="

# LC_ALL=C keeps the "Mem:" label and dot decimals regardless of locale; awk
# replaces the bc pipeline and guards against a zero total.
memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
  /^Mem:/ {
    if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
    else print "0"
    exit
  }')
echo "1. 内存使用率: ${memory_usage}%"

if awk -v usage="$memory_usage" 'BEGIN { exit !(usage + 0 > 90) }'; then
  echo "WARNING: 内存使用率过高"

  echo "2. 内存使用率最高的进程:"
  ps aux --sort=-%mem | head -10

  echo -e "\n3. 内存使用详情:"
  head -20 /proc/meminfo

  echo -e "\n4. 交换分区使用:"
  LC_ALL=C free -h | grep "Swap:"

  echo -e "\n5. 页面交换:"
  # vmstat columns: si=7 (swapped in), so=8 (swapped out).
  vmstat 1 1 | tail -1 | awk '{print "页面换入: " $7 ", 页面换出: " $8}'

  echo -e "\n6. 内存泄漏检查:"
  # Header line skipped; ps aux columns: PID=2, %MEM=4.
  ps aux --sort=-%mem | head -5 \
    | awk 'NR > 1 { print "进程 " $2 " 内存使用: " $4 "%" }'

  echo -e "\n7. 故障处理建议:"
  echo " - 检查是否有内存泄漏"
  echo " - 考虑增加内存容量"
  echo " - 优化应用程序内存使用"
  echo " - 检查是否有内存碎片"
  echo " - 考虑使用内存压缩"
else
  echo "内存使用率正常"
fi
|
7. 监控告警系统
7.1 告警配置
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
| #!/bin/bash
# Recipient passed to mail by send_email_alert.
ALERT_EMAIL="admin@company.com"
# Webhook URL that send_slack_alert posts to.
ALERT_WEBHOOK="https://hooks.slack.com/services/xxx"
# File that log_alert appends to.
LOG_FILE="/var/log/resource_alerts.log"

# Limits compared by the check_*_alert functions: the first three are
# percentages, LOAD_THRESHOLD is a 1-minute load average.
CPU_THRESHOLD=80
MEMORY_THRESHOLD=80
DISK_THRESHOLD=80
LOAD_THRESHOLD=2
#######################################
# Append one timestamped line to the alert log.
# Globals:   LOG_FILE (read) - path of the log file
# Arguments: $1 - message text
# Outputs:   "YYYY-mm-dd HH:MM:SS - <message>" appended to LOG_FILE
#######################################
log_alert() {
  # Quoted path and printf keep arbitrary paths and message text intact.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG_FILE"
}
#######################################
# Send an alert by e-mail and record the outcome in the alert log.
# Globals:   ALERT_EMAIL (read) - recipient address
# Arguments: $1 - subject, $2 - message body
# Calls:     log_alert
# Returns:   0 when mail succeeded, 1 otherwise
#######################################
send_email_alert() {
  local subject=$1
  local message=$2
  # Log success only when mail really exited 0; previously the success line
  # was written even if the mail command failed or was missing.
  if printf '%s\n' "$message" | mail -s "$subject" "$ALERT_EMAIL"; then
    log_alert "邮件告警发送: $subject"
  else
    log_alert "邮件告警发送失败: $subject"
    return 1
  fi
}
#######################################
# Post an alert to the Slack webhook and record the outcome in the alert log.
# Globals:   ALERT_WEBHOOK (read) - webhook URL
# Arguments: $1 - message text
# Calls:     log_alert
# Returns:   0 when curl succeeded, 1 otherwise
#######################################
send_slack_alert() {
  local message=$1
  local escaped
  # Escape backslashes, double quotes and newlines so any message yields
  # valid JSON (backslashes first, so later escapes are not doubled).
  escaped=${message//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}

  # -f turns HTTP errors into a non-zero exit, -sS hides the progress meter
  # but keeps error text, --max-time stops a hung request from stalling the
  # monitor loop.
  if curl -fsS --max-time 10 -X POST \
      -H 'Content-type: application/json' \
      --data "{\"text\":\"${escaped}\"}" \
      "$ALERT_WEBHOOK" > /dev/null; then
    log_alert "Slack告警发送: $message"
  else
    log_alert "Slack告警发送失败: $message"
    return 1
  fi
}
#######################################
# Send e-mail and Slack alerts when overall CPU usage exceeds CPU_THRESHOLD.
# Usage is 100 minus the idle share of top's summary line (the old parse
# compared the user-mode share only).
# Globals:   CPU_THRESHOLD (read)
# Calls:     send_email_alert, send_slack_alert
#######################################
check_cpu_alert() {
  local cpu_usage alert_msg
  cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9.]+[ %]*id/)) {
        idle = substr($0, RSTART, RLENGTH)
        sub(/[ %]*id$/, "", idle)
        printf "%.1f\n", 100 - idle
      }
      exit
    }')
  # awk compares the floats; an empty reading counts as 0 and never alerts.
  if awk -v usage="$cpu_usage" -v limit="$CPU_THRESHOLD" \
      'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    alert_msg="WARNING: CPU使用率过高: ${cpu_usage}%"
    send_email_alert "CPU使用率告警" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi
}
#######################################
# Send e-mail and Slack alerts when memory usage (used / total from free)
# exceeds MEMORY_THRESHOLD.
# Globals:   MEMORY_THRESHOLD (read)
# Calls:     send_email_alert, send_slack_alert
#######################################
check_memory_alert() {
  local memory_usage alert_msg
  # LC_ALL=C keeps the "Mem:" label and dot decimals regardless of locale.
  memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
    /^Mem:/ {
      if ($2 > 0) printf "%.2f\n", $3 * 100 / $2
      else print "0"
      exit
    }')
  if awk -v usage="$memory_usage" -v limit="$MEMORY_THRESHOLD" \
      'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    alert_msg="WARNING: 内存使用率过高: ${memory_usage}%"
    send_email_alert "内存使用率告警" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi
}
#######################################
# Send e-mail and Slack alerts when root-filesystem usage exceeds
# DISK_THRESHOLD.
# Globals:   DISK_THRESHOLD (read)
# Calls:     send_email_alert, send_slack_alert
#######################################
check_disk_alert() {
  local disk_usage alert_msg
  # -P forces one line per filesystem so column 5 is always the percentage.
  disk_usage=$(LC_ALL=C df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  # Compare only when df produced an integer; an empty value used to break
  # the unquoted [ ... -gt ... ] test.
  if [[ $disk_usage =~ ^[0-9]+$ ]] && (( disk_usage > DISK_THRESHOLD )); then
    alert_msg="WARNING: 磁盘使用率过高: ${disk_usage}%"
    send_email_alert "磁盘使用率告警" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi
}
#######################################
# Send e-mail and Slack alerts when the 1-minute load average exceeds
# LOAD_THRESHOLD. The core count is only included in the message.
# NOTE(review): the limit is absolute rather than scaled by the core count,
# so a many-core host may alert at harmless load levels; confirm the intended
# semantics of LOAD_THRESHOLD.
# Globals:   LOAD_THRESHOLD (read)
# Calls:     send_email_alert, send_slack_alert
#######################################
check_load_alert() {
  local load_1min cpu_cores alert_msg
  # LC_ALL=C pins the "load average:" label and dot decimals.
  load_1min=$(LC_ALL=C uptime \
    | awk -F'load average:[ ]*' 'NF > 1 { split($2, parts, /,[ ]*/); print parts[1] }')
  # Anchored so model names containing "processor" are not counted.
  cpu_cores=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
  if awk -v load="$load_1min" -v limit="$LOAD_THRESHOLD" \
      'BEGIN { exit !(load + 0 > limit + 0) }'; then
    alert_msg="WARNING: 系统负载过高: ${load_1min} (CPU核心数: ${cpu_cores})"
    send_email_alert "系统负载告警" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi
}
#######################################
# Alert loop: run the CPU, memory, disk and load checks every 60 seconds.
# Never returns; stop it by terminating the process.
# Calls:     log_alert, check_cpu_alert, check_memory_alert,
#            check_disk_alert, check_load_alert
#######################################
main() {
  log_alert "资源告警检查启动"
  while true; do
    check_cpu_alert
    check_memory_alert
    check_disk_alert
    check_load_alert
    sleep 60
  done
}
main  # Script entry point: start the alert loop defined in main().
|
7.2 监控报告生成
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
| #!/bin/bash
# Report path; the date stamp is taken once, when the script starts.
REPORT_FILE="/var/log/monitoring_report_$(date +%Y%m%d).log"
# Recipient passed to mail by send_report_email.
REPORT_EMAIL="admin@company.com"
#######################################
# Write a full system report (CPU, memory, disk, load, top processes, system
# info, listening sockets, running services) to REPORT_FILE, replacing any
# previous content.
# Globals:   REPORT_FILE (read) - destination path
# Outputs:   the report in REPORT_FILE; a completion line on stdout
# Returns:   non-zero if REPORT_FILE cannot be written
#######################################
generate_report() {
  local cpu_usage cpu_cores cpu_model

  # CPU usage = 100 - idle from top's summary line (the old parse reported
  # the user-mode share only).
  cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9.]+[ %]*id/)) {
        idle = substr($0, RSTART, RLENGTH)
        sub(/[ %]*id$/, "", idle)
        printf "%.1f\n", 100 - idle
      }
      exit
    }')
  # Anchored so model names containing "processor" are not counted.
  cpu_cores=$(grep -c '^processor' /proc/cpuinfo)
  # Strip only the "model name : " prefix so the name keeps its spaces.
  cpu_model=$(awk '/^model name/ { sub(/^[^:]*:[ \t]*/, ""); print; exit }' /proc/cpuinfo)

  # A single redirection for the whole group: the file is opened once and the
  # path is quoted in one place.
  {
    echo "=== 系统资源监控报告 ==="
    echo "生成时间: $(date)"
    echo ""
    echo "=== CPU使用情况 ==="
    echo "CPU使用率: ${cpu_usage}%"
    echo "CPU核心数: ${cpu_cores}"
    echo "CPU型号: ${cpu_model}"
    echo ""
    echo "=== 内存使用情况 ==="
    free -h
    echo ""
    echo "=== 磁盘使用情况 ==="
    df -h
    echo ""
    echo "=== 系统负载 ==="
    uptime
    echo ""
    echo "=== 进程统计 ==="
    echo "CPU使用率最高的进程:"
    ps aux --sort=-%cpu | head -10
    echo ""
    echo "内存使用率最高的进程:"
    ps aux --sort=-%mem | head -10
    echo ""
    echo "=== 系统信息 ==="
    echo "操作系统: $(uname -a)"
    echo "内核版本: $(uname -r)"
    echo "系统启动时间: $(uptime -s)"
    echo "系统运行时间: $(uptime -p)"
    echo ""
    echo "=== 网络信息 ==="
    # netstat (net-tools) is often not installed; fall back to ss.
    if command -v netstat > /dev/null 2>&1; then
      netstat -tuln | head -20
    else
      ss -tuln | head -20
    fi
    echo ""
    echo "=== 服务状态 ==="
    systemctl list-units --type=service --state=running | head -20
    echo ""
  } > "$REPORT_FILE" || return 1

  echo "监控报告生成完成: $REPORT_FILE"
}
#######################################
# Mail the generated report to REPORT_EMAIL.
# Globals:   REPORT_FILE (read), REPORT_EMAIL (read)
# Outputs:   a status line on stdout; failures go to stderr
# Returns:   0 when the mail was handed off, 1 when the report is missing or
#            mail failed
#######################################
send_report_email() {
  if [[ ! -f "$REPORT_FILE" ]]; then
    echo "监控报告文件不存在"
    return 1
  fi
  # Report success only if mail exited 0; previously the completion message
  # was printed unconditionally.
  if mail -s "系统资源监控报告 - $(date +%Y-%m-%d)" "$REPORT_EMAIL" < "$REPORT_FILE"; then
    echo "监控报告邮件发送完成"
  else
    echo "监控报告邮件发送失败" >&2
    return 1
  fi
}
# Generate the report, then e-mail it. The mail step is skipped when report
# generation fails, so a stale or partial file is never sent.
# Calls: generate_report, send_report_email
main() {
  generate_report || return 1
  send_report_email
}
main  # Script entry point: generate the report and mail it via main().
|
8. 总结
通过本文的学习和实践,我们掌握了CPU使用率和内存使用率监控的核心技能:
8.1 核心要点
- 监控工具: 熟练掌握top、htop、iostat、vmstat、free等监控工具
- 监控指标: 理解CPU使用率、内存使用率、系统负载等关键指标
- 性能分析: 能够分析系统性能瓶颈和资源使用情况
- 故障诊断: 具备快速定位和解决系统资源问题的能力
- 优化策略: 掌握系统资源优化和调优方法
8.2 最佳实践
- 持续监控: 建立完善的监控体系,实时掌握系统状态
- 阈值设置: 合理设置告警阈值,避免误报和漏报
- 自动化运维: 实现监控、告警、处理的自动化
- 性能优化: 根据监控数据进行系统优化
- 故障预防: 通过监控数据预防系统故障
8.3 技术价值
- 提升系统稳定性: 通过监控及时发现和解决资源问题
- 优化系统性能: 基于监控数据进行性能调优
- 降低运维成本: 自动化监控减少人工干预
- 保障业务连续性: 确保系统稳定运行
- 支持容量规划: 为系统扩容提供数据支持
通过这套完整的CPU和内存监控解决方案,可以显著提升系统的稳定性和性能,为业务发展提供强有力的技术保障。