第191集操作系统监控与性能分析实战 | 字数总计: 7.9k | 阅读时长: 35分钟 | 阅读量:
1. 操作系统监控概述
操作系统监控是运维工作的基础。通过实时监控系统资源使用情况、性能指标和运行状态,可以及时发现系统问题、预防故障发生、优化系统性能。本文将详细介绍操作系统监控的核心指标、监控工具、性能分析方法以及最佳实践。
1.1 监控的重要性
故障预防 : 提前发现系统异常,预防故障发生
性能优化 : 识别性能瓶颈,优化系统配置
容量规划 : 为系统扩容提供数据支持
问题诊断 : 快速定位和解决系统问题
资源管理 : 合理分配和管理系统资源
1.2 核心监控指标
CPU使用率 : 处理器使用情况和负载
内存使用率 : 内存使用情况和交换分区
磁盘I/O : 磁盘读写性能和空间使用
网络I/O : 网络流量和连接状态
进程状态 : 进程运行状态和资源占用
系统负载 : 系统整体负载情况
1.3 监控层次
硬件层 : CPU、内存、磁盘、网络硬件状态
内核层 : 内核参数、系统调用、中断处理
应用层 : 应用程序性能、资源使用
业务层 : 业务指标、用户体验
#!/bin/bash
# Article section "2. CPU监控与分析 / 2.1 CPU监控指标" — CPU monitoring script.
#
# Usage: <script> {monitor|analyze}
#   monitor  print CPU usage, load average, core count, model and top consumers
#   analyze  print warnings when usage, load or temperature exceed fixed limits

# Succeeds when $1 is numerically greater than $2.
# awk does the float comparison so the script does not depend on bc.
exceeds() {
  [[ -n "${1:-}" ]] && awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Prints "<total> <idle+iowait>" jiffies from the aggregate "cpu" line.
cpu_snapshot() {
  awk '/^cpu / {
    total = 0
    for (i = 2; i <= 9 && i <= NF; i++) total += $i
    print total, $5 + $6
    exit
  }' /proc/stat
}

# Prints the overall CPU busy percentage measured over a one-second window.
# Parsing field 2 of top's "Cpu(s)" line only reported user-space time.
cpu_usage_percent() {
  local total_a idle_a total_b idle_b
  read -r total_a idle_a < <(cpu_snapshot)
  sleep 1
  read -r total_b idle_b < <(cpu_snapshot)
  awk -v dt="$(( ${total_b:-0} - ${total_a:-0} ))" \
      -v di="$(( ${idle_b:-0} - ${idle_a:-0} ))" \
      'BEGIN { if (dt > 0) printf "%.1f\n", (dt - di) * 100 / dt; else print "0.0" }'
}

# Prints a snapshot of CPU usage, load, core count, model and top 10 processes.
monitor_cpu() {
  echo "=== CPU监控信息 ==="

  echo "CPU使用率:"
  cpu_usage_percent

  echo "CPU负载:"
  uptime | awk -F'load average:' '{print $2}'

  echo "CPU核心数:"
  nproc

  echo "CPU信息:"
  grep -m1 "model name" /proc/cpuinfo | cut -d: -f2

  echo "进程CPU使用率TOP10:"
  ps aux --sort=-%cpu | head -11
}

# Warns when CPU usage > 80%, 1-minute load > core count, or zone0 temp > 70°C.
analyze_cpu_performance() {
  local cpu_usage load_avg cpu_cores temp temp_c
  echo "=== CPU性能分析 ==="

  cpu_usage=$(cpu_usage_percent)
  if exceeds "$cpu_usage" 80; then
    echo "警告: CPU使用率过高 (${cpu_usage}%)"
  fi

  # First field of /proc/loadavg is the 1-minute load average.
  read -r load_avg _ < /proc/loadavg
  cpu_cores=$(nproc)
  if exceeds "$load_avg" "$cpu_cores"; then
    echo "警告: 系统负载过高 ($load_avg > $cpu_cores)"
  fi

  if [[ -r /sys/class/thermal/thermal_zone0/temp ]]; then
    temp=$(< /sys/class/thermal/thermal_zone0/temp)
    temp_c=$(( temp / 1000 ))   # sysfs reports millidegrees Celsius
    if (( temp_c > 70 )); then
      echo "警告: CPU温度过高 (${temp_c}°C)"
    fi
  fi
}

main() {
  case "${1:-}" in
    monitor) monitor_cpu ;;
    analyze) analyze_cpu_performance ;;
    *)
      echo "Usage: $0 {monitor|analyze}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "2.2 CPU性能分析工具" — wrappers around CPU analysis tools.
#
# Usage: <script> {top|htop|vmstat|iostat|sar|perf}
# Optional tools print an install hint instead of failing when they are missing.

# Runs a command if it is installed, otherwise prints a hint.
# Arguments: $1 - hint printed when the tool is missing; $2... - command line.
run_or_hint() {
  local hint=$1
  shift
  if command -v "$1" >/dev/null 2>&1; then
    "$@"
  else
    echo "$hint"
  fi
}

monitor_with_top() {
  echo "使用top命令监控CPU:"
  top -bn1 | head -20
}

monitor_with_htop() {
  echo "使用htop命令监控CPU:"
  # NOTE(review): "-n 1" is kept from the article; confirm the installed htop
  # release accepts it.
  run_or_hint "htop未安装,请先安装: apt-get install htop" htop -n 1
}

monitor_with_vmstat() {
  echo "使用vmstat命令监控CPU:"
  vmstat 1 5
}

monitor_with_iostat() {
  echo "使用iostat命令监控CPU:"
  run_or_hint "iostat未安装,请先安装: apt-get install sysstat" iostat -c 1 5
}

monitor_with_sar() {
  echo "使用sar命令监控CPU:"
  run_or_hint "sar未安装,请先安装: apt-get install sysstat" sar -u 1 5
}

analyze_with_perf() {
  echo "使用perf命令分析CPU:"
  run_or_hint "perf未安装,请先安装: apt-get install linux-tools-common" \
    perf stat -a sleep 1
}

main() {
  case "${1:-}" in
    top)    monitor_with_top ;;
    htop)   monitor_with_htop ;;
    vmstat) monitor_with_vmstat ;;
    iostat) monitor_with_iostat ;;
    sar)    monitor_with_sar ;;
    perf)   analyze_with_perf ;;
    *)
      echo "Usage: $0 {top|htop|vmstat|iostat|sar|perf}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "2.3 CPU优化策略" — read-only CPU tuning checks and advice.
#
# Usage: <script> optimize
# Nothing is modified; the script only prints state and suggested commands.

# Runs every check in order, then prints the static suggestion list.
optimize_cpu() {
  echo "=== CPU优化策略 ==="
  check_cpu_governor
  check_cpu_affinity
  check_interrupts
  check_system_calls
  provide_optimization_suggestions
}

# Prints cpu0's frequency governor and suggests "performance" when it differs.
check_cpu_governor() {
  local governor_file=/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
  local governor
  echo "检查CPU频率调节器:"
  if [[ ! -f "$governor_file" ]]; then
    echo "CPU频率调节器信息不可用"
    return
  fi
  governor=$(< "$governor_file")
  echo "当前调节器: $governor"
  # Compare the bare value: a stray trailing space made this test always true.
  if [[ "$governor" != "performance" ]]; then
    echo "建议: 设置为performance模式以提高性能"
    echo "命令: echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
  fi
}

# Lists the processor (psr) each of the first processes last ran on.
check_cpu_affinity() {
  echo "检查CPU亲和性:"
  echo "当前进程CPU亲和性:"
  ps -eo pid,psr,comm | head -10
}

check_interrupts() {
  echo "检查中断处理:"
  echo "中断统计:"
  head -10 /proc/interrupts
}

check_system_calls() {
  echo "检查系统调用:"
  echo "系统调用统计:"
  # NOTE(review): this file holds the SysRq enable mask, not system-call
  # statistics; kept for output compatibility. Consider `perf stat` or
  # `strace -c` for real syscall counts.
  cat /proc/sys/kernel/sysrq
}

provide_optimization_suggestions() {
  echo "=== CPU优化建议 ==="
  echo "1. 使用performance模式的CPU频率调节器"
  echo "2. 设置进程CPU亲和性"
  echo "3. 优化中断处理"
  echo "4. 减少系统调用开销"
  echo "5. 使用CPU缓存优化"
  echo "6. 调整进程优先级"
  echo "7. 使用多线程优化"
  echo "8. 避免CPU密集型操作"
}

main() {
  case "${1:-}" in
    optimize) optimize_cpu ;;
    *)
      echo "Usage: $0 optimize"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "3. 内存监控与分析 / 3.1 内存监控指标" — memory monitoring.
#
# Usage: <script> {monitor|analyze}

# Succeeds when $1 is numerically greater than $2 (float-safe, no bc needed).
exceeds() {
  [[ -n "${1:-}" ]] && awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Prints used/total as a two-decimal percentage for row $1 of `free` output
# (2 = Mem, 3 = Swap). Prints 0.00 when the total is zero, e.g. no swap.
usage_percent_of_row() {
  free | awk -v row="$1" 'NR == row {
    if ($2 > 0) printf "%.2f", $3 * 100 / $2
    else        printf "0.00"
  }'
}

# Prints overall memory state, swap devices and the top 10 memory consumers.
monitor_memory() {
  echo "=== 内存监控信息 ==="

  echo "内存使用情况:"
  free -h

  echo "内存详细信息:"
  head -20 /proc/meminfo

  echo "内存使用率:"
  printf '%s%%\n' "$(usage_percent_of_row 2)"

  echo "交换分区使用情况:"
  swapon -s

  echo "进程内存使用TOP10:"
  ps aux --sort=-%mem | head -11
}

# Warns when memory use exceeds 80% or swap use exceeds 10%, then runs the
# leak and fragmentation checks.
analyze_memory_performance() {
  local memory_usage swap_usage
  echo "=== 内存性能分析 ==="

  memory_usage=$(usage_percent_of_row 2)
  if exceeds "$memory_usage" 80; then
    echo "警告: 内存使用率过高 (${memory_usage}%)"
  fi

  swap_usage=$(usage_percent_of_row 3)
  if exceeds "$swap_usage" 10; then
    echo "警告: 交换分区使用率过高 (${swap_usage}%)"
  fi

  check_memory_leak
  check_memory_fragmentation
}

# Prints the largest memory consumers and key /proc/meminfo counters.
check_memory_leak() {
  echo "检查内存泄漏:"
  echo "进程内存增长检查:"
  ps aux --sort=-%mem | head -5
  echo "系统内存统计:"
  grep -E "(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree)" /proc/meminfo
}

# Dumps buddy-allocator and zone information when the kernel exposes them.
check_memory_fragmentation() {
  echo "检查内存碎片:"
  if [[ -f /proc/buddyinfo ]]; then
    echo "内存碎片信息:"
    cat /proc/buddyinfo
  fi
  if [[ -f /proc/zoneinfo ]]; then
    echo "内存区域信息:"
    head -20 /proc/zoneinfo
  fi
}

main() {
  case "${1:-}" in
    monitor) monitor_memory ;;
    analyze) analyze_memory_performance ;;
    *)
      echo "Usage: $0 {monitor|analyze}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "3.2 内存性能分析工具" — wrappers around memory analysis tools.
#
# Usage: <script> {free|vmstat|top|htop|sar|valgrind|pmap}

# Succeeds when the command named by $1 is installed.
have() {
  command -v "$1" >/dev/null 2>&1
}

monitor_with_free() {
  echo "使用free命令监控内存:"
  free -h
}

monitor_with_vmstat() {
  echo "使用vmstat命令监控内存:"
  vmstat 1 5
}

monitor_with_top() {
  echo "使用top命令监控内存:"
  top -bn1 | head -20
}

monitor_with_htop() {
  echo "使用htop命令监控内存:"
  if have htop; then
    # NOTE(review): "-n 1" is kept from the article; confirm the installed
    # htop release accepts it.
    htop -n 1
  else
    echo "htop未安装,请先安装: apt-get install htop"
  fi
}

monitor_with_sar() {
  echo "使用sar命令监控内存:"
  if have sar; then
    sar -r 1 5
  else
    echo "sar未安装,请先安装: apt-get install sysstat"
  fi
}

# Only prints how to use valgrind; it needs a target program to analyse.
analyze_with_valgrind() {
  echo "使用valgrind分析内存:"
  if have valgrind; then
    echo "valgrind可用于内存泄漏检测"
    echo "示例: valgrind --leak-check=full ./your_program"
  else
    echo "valgrind未安装,请先安装: apt-get install valgrind"
  fi
}

# Shows the memory map of this shell process itself ($$).
analyze_with_pmap() {
  echo "使用pmap分析内存:"
  echo "进程内存映射:"
  pmap -x "$$"
}

main() {
  case "${1:-}" in
    free)     monitor_with_free ;;
    vmstat)   monitor_with_vmstat ;;
    top)      monitor_with_top ;;
    htop)     monitor_with_htop ;;
    sar)      monitor_with_sar ;;
    valgrind) analyze_with_valgrind ;;
    pmap)     analyze_with_pmap ;;
    *)
      echo "Usage: $0 {free|vmstat|top|htop|sar|valgrind|pmap}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "3.3 内存优化策略" — read-only memory tuning checks and advice.
#
# Usage: <script> optimize

# Runs every check in order, then prints the static suggestion list.
optimize_memory() {
  echo "=== 内存优化策略 ==="
  check_memory_config
  check_memory_usage_pattern
  check_memory_leaks
  check_memory_fragmentation
  provide_optimization_suggestions
}

# Prints memory-related ulimits and the first vm./kernel. sysctl values.
check_memory_config() {
  echo "检查内存配置:"
  echo "内存限制:"
  ulimit -a | grep memory
  echo "内存参数:"
  # stderr is dropped on purpose: unprivileged `sysctl -a` emits
  # permission-denied noise for keys it cannot read.
  sysctl -a 2>/dev/null | grep -E "(vm\.|kernel\.)" | head -10
}

check_memory_usage_pattern() {
  echo "检查内存使用模式:"
  echo "内存使用趋势:"
  free -h
  echo "内存使用详情:"
  grep -E "(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree)" /proc/meminfo
}

# Prints the top 10 memory consumers and the current used-memory percentage.
check_memory_leaks() {
  echo "检查内存泄漏:"
  echo "进程内存使用TOP10:"
  ps aux --sort=-%mem | head -11
  echo "内存增长趋势:"
  echo "当前内存使用:"
  free | awk 'NR == 2 && $2 > 0 { printf "%.2f%%\n", $3 * 100 / $2 }'
}

check_memory_fragmentation() {
  echo "检查内存碎片:"
  if [[ -f /proc/buddyinfo ]]; then
    echo "内存碎片信息:"
    cat /proc/buddyinfo
  fi
}

provide_optimization_suggestions() {
  echo "=== 内存优化建议 ==="
  echo "1. 增加物理内存"
  echo "2. 优化内存分配策略"
  echo "3. 使用内存池技术"
  echo "4. 减少内存碎片"
  echo "5. 优化缓存使用"
  echo "6. 使用内存压缩"
  echo "7. 调整内存参数"
  echo "8. 监控内存使用"
}

main() {
  case "${1:-}" in
    optimize) optimize_memory ;;
    *)
      echo "Usage: $0 optimize"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "4. 磁盘监控与分析 / 4.1 磁盘监控指标" — disk monitoring script.
#
# Usage: <script> {monitor|analyze}
# WARNING: "monitor" writes and reads back a 1000 MiB scratch file with dd.

# Scratch file used by the dd test; removed on any exit path.
dd_scratch_file=""
trap 'if [[ -n "$dd_scratch_file" ]]; then rm -f -- "$dd_scratch_file"; fi' EXIT

# Succeeds when $1 is numerically greater than $2 (float-safe, no bc needed).
exceeds() {
  [[ -n "${1:-}" ]] && awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Prints the largest await/r_await/w_await value (ms) across all devices.
# Columns are located by header name because their position differs between
# sysstat versions; row 4 of `iostat -x` is the avg-cpu row, not a device.
max_disk_await() {
  LC_ALL=C iostat -dx 1 1 | awk '
    /^Device/ {
      for (i = 1; i <= NF; i++) if ($i ~ /^(r_|w_)?await$/) cols[i] = 1
      in_table = 1
      next
    }
    in_table && NF { for (i in cols) if ($i + 0 > max) max = $i + 0 }
    END { printf "%.2f\n", max + 0 }'
}

# Prints usage, I/O statistics, block devices, a dd benchmark and health data.
monitor_disk() {
  echo "=== 磁盘监控信息 ==="

  echo "磁盘使用情况:"
  df -h

  echo "磁盘I/O统计:"
  if command -v iostat >/dev/null 2>&1; then
    iostat -x 1 3
  else
    echo "iostat未安装,请先安装: apt-get install sysstat"
  fi

  echo "磁盘使用详情:"
  lsblk

  echo "磁盘性能测试:"
  test_disk_performance

  echo "磁盘健康状态:"
  check_disk_health
}

# Sequential write then read of a scratch file in /tmp using dd.
# The labels now match the operations (they were swapped), and the scratch
# file gets an unpredictable name from mktemp.
test_disk_performance() {
  echo "磁盘性能测试:"
  dd_scratch_file=$(mktemp /tmp/disk_test.XXXXXX) || return 1

  echo "测试写入性能:"
  dd if=/dev/zero of="$dd_scratch_file" bs=1M count=1000 2>&1 | grep -E "(copied|MB/s)"

  echo "测试读取性能:"
  dd if="$dd_scratch_file" of=/dev/null bs=1M 2>&1 | grep -E "(copied|MB/s)"

  rm -f -- "$dd_scratch_file"
  dd_scratch_file=""
}

# Prints recent kernel error lines plus temperature and SMART data for /dev/sda.
check_disk_health() {
  echo "磁盘健康检查:"

  echo "磁盘错误检查:"
  dmesg | grep -i "error\|fail" | tail -10

  echo "磁盘温度检查:"
  if command -v hddtemp >/dev/null 2>&1; then
    # hddtemp is installed in this branch, so a failure means the read failed.
    hddtemp /dev/sda 2>/dev/null || echo "无法读取 /dev/sda 的温度"
  else
    echo "hddtemp未安装,请先安装: apt-get install hddtemp"
  fi

  echo "磁盘SMART信息:"
  if command -v smartctl >/dev/null 2>&1; then
    smartctl -a /dev/sda | head -20
  else
    echo "smartctl未安装,请先安装: apt-get install smartmontools"
  fi
}

# Warns when root usage > 80%, iowait > 20% or device await > 100 ms.
analyze_disk_performance() {
  local disk_usage iowait response_time
  echo "=== 磁盘性能分析 ==="

  # Query "/" explicitly: row 2 of plain `df -h` is often a tmpfs/udev mount.
  disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  if exceeds "$disk_usage" 80; then
    echo "警告: 磁盘使用率过高 (${disk_usage}%)"
  fi

  if ! command -v iostat >/dev/null 2>&1; then
    echo "iostat未安装,请先安装: apt-get install sysstat"
    return
  fi

  # The values row follows the "avg-cpu:" header; %iowait is its 4th field.
  iowait=$(LC_ALL=C iostat -c 1 1 | awk '/^avg-cpu/ { getline; print $4; exit }')
  if exceeds "$iowait" 20; then
    echo "警告: 磁盘I/O等待过高 (${iowait}%)"
  fi

  response_time=$(max_disk_await)
  if exceeds "$response_time" 100; then
    echo "警告: 磁盘响应时间过长 (${response_time}ms)"
  fi
}

main() {
  case "${1:-}" in
    monitor) monitor_disk ;;
    analyze) analyze_disk_performance ;;
    *)
      echo "Usage: $0 {monitor|analyze}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "4.2 磁盘性能分析工具" — wrappers around disk analysis tools.
#
# Usage: <script> {iostat|iotop|sar|vmstat|fio|dd}
# WARNING: "fio" and "dd" generate real disk load and need free space.

# Scratch paths created by the benchmarks; removed on any exit path.
scratch_paths=()
cleanup() {
  if (( ${#scratch_paths[@]} > 0 )); then
    rm -rf -- "${scratch_paths[@]}"
  fi
}
trap cleanup EXIT

# Succeeds when the command named by $1 is installed.
have() {
  command -v "$1" >/dev/null 2>&1
}

monitor_with_iostat() {
  echo "使用iostat命令监控磁盘:"
  if have iostat; then
    iostat -x 1 5
  else
    echo "iostat未安装,请先安装: apt-get install sysstat"
  fi
}

monitor_with_iotop() {
  echo "使用iotop命令监控磁盘:"
  if have iotop; then
    # -b selects batch mode so a single iteration prints and returns instead
    # of opening the interactive screen.
    iotop -b -n 1
  else
    echo "iotop未安装,请先安装: apt-get install iotop"
  fi
}

monitor_with_sar() {
  echo "使用sar命令监控磁盘:"
  if have sar; then
    sar -d 1 5
  else
    echo "sar未安装,请先安装: apt-get install sysstat"
  fi
}

monitor_with_vmstat() {
  echo "使用vmstat命令监控磁盘:"
  vmstat 1 5
}

# 60-second 4k random-read test with 4 jobs of 1G each.
# Data files go into a scratch directory under the current directory (same
# filesystem as before) so they are cleaned up instead of left behind.
test_with_fio() {
  local work_dir
  echo "使用fio测试磁盘性能:"
  if ! have fio; then
    echo "fio未安装,请先安装: apt-get install fio"
    return
  fi
  echo "随机读取测试:"
  work_dir=$(mktemp -d "$PWD/fio_test.XXXXXX") || return 1
  scratch_paths+=("$work_dir")
  fio --name=random_read --directory="$work_dir" --ioengine=libaio \
    --iodepth=16 --rw=randread --bs=4k --direct=1 --size=1G --numjobs=4 \
    --runtime=60 --group_reporting
}

# Sequential write then read of a 1000 MiB scratch file in /tmp using dd.
test_with_dd() {
  local test_file
  echo "使用dd测试磁盘性能:"
  test_file=$(mktemp /tmp/disk_test.XXXXXX) || return 1
  scratch_paths+=("$test_file")

  echo "测试写入性能:"
  dd if=/dev/zero of="$test_file" bs=1M count=1000 2>&1 | grep -E "(copied|MB/s)"

  echo "测试读取性能:"
  dd if="$test_file" of=/dev/null bs=1M 2>&1 | grep -E "(copied|MB/s)"
}

main() {
  case "${1:-}" in
    iostat) monitor_with_iostat ;;
    iotop)  monitor_with_iotop ;;
    sar)    monitor_with_sar ;;
    vmstat) monitor_with_vmstat ;;
    fio)    test_with_fio ;;
    dd)     test_with_dd ;;
    *)
      echo "Usage: $0 {iostat|iotop|sar|vmstat|fio|dd}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "4.3 磁盘优化策略" — read-only disk tuning checks and advice.
#
# Usage: <script> optimize

# Runs every check in order, then prints the static suggestion list.
optimize_disk() {
  echo "=== 磁盘优化策略 ==="
  check_disk_config
  check_disk_usage_pattern
  check_disk_performance
  check_disk_health
  provide_optimization_suggestions
}

# Succeeds when iostat is installed; otherwise prints a hint and fails.
require_iostat() {
  command -v iostat >/dev/null 2>&1 && return 0
  echo "iostat未安装,请先安装: apt-get install sysstat"
  return 1
}

check_disk_config() {
  echo "检查磁盘配置:"
  echo "磁盘挂载:"
  mount | grep -E "(ext4|xfs|btrfs)"
  echo "磁盘参数:"
  # stderr is dropped on purpose: unprivileged `sysctl -a` emits
  # permission-denied noise for keys it cannot read.
  sysctl -a 2>/dev/null | grep -E "(vm\.|fs\.)" | head -10
}

check_disk_usage_pattern() {
  echo "检查磁盘使用模式:"
  echo "磁盘使用情况:"
  df -h
  echo "磁盘I/O统计:"
  if require_iostat; then
    iostat -x 1 1
  fi
}

# Prints the worst device await (ms), then total read and write throughput
# (kB/s) summed over all devices. Columns are found by header name: the
# original read fields 10, 6 and 7 of output line 4, which is the avg-cpu
# row and has only six fields.
check_disk_performance() {
  local report
  echo "检查磁盘性能:"
  require_iostat || return 0
  report=$(LC_ALL=C iostat -dx 1 1)

  echo "磁盘响应时间:"
  awk '
    /^Device/ {
      for (i = 1; i <= NF; i++) if ($i ~ /^(r_|w_)?await$/) cols[i] = 1
      in_table = 1
      next
    }
    in_table && NF { for (i in cols) if ($i + 0 > max) max = $i + 0 }
    END { printf "%.2f\n", max + 0 }' <<<"$report"

  echo "磁盘吞吐量:"
  awk '
    /^Device/ {
      for (i = 1; i <= NF; i++) {
        if ($i == "rkB/s") rcol = i
        if ($i == "wkB/s") wcol = i
      }
      in_table = 1
      next
    }
    in_table && NF {
      if (rcol) rd += $rcol
      if (wcol) wr += $wcol
    }
    END { printf "%.2f %.2f\n", rd + 0, wr + 0 }' <<<"$report"
}

check_disk_health() {
  echo "检查磁盘健康:"
  echo "磁盘错误:"
  dmesg | grep -i "error\|fail" | tail -5
  echo "磁盘SMART信息:"
  if command -v smartctl >/dev/null 2>&1; then
    smartctl -a /dev/sda | head -10
  else
    echo "smartctl未安装"
  fi
}

provide_optimization_suggestions() {
  echo "=== 磁盘优化建议 ==="
  echo "1. 使用SSD存储"
  echo "2. 优化文件系统参数"
  echo "3. 使用RAID配置"
  echo "4. 优化磁盘调度算法"
  echo "5. 使用磁盘缓存"
  echo "6. 优化I/O操作"
  echo "7. 定期清理磁盘"
  echo "8. 监控磁盘健康"
}

main() {
  case "${1:-}" in
    optimize) optimize_disk ;;
    *)
      echo "Usage: $0 optimize"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "5. 网络监控与分析 / 5.1 网络监控指标" — network monitoring.
#
# Usage: <script> {monitor|analyze}
# Environment (optional):
#   PING_TARGET         host used for latency/loss probes (default 8.8.8.8)
#   NET_IFACE_PATTERN   ERE of interfaces to report (default "eth0|wlan0")

PING_TARGET=${PING_TARGET:-8.8.8.8}
NET_IFACE_PATTERN=${NET_IFACE_PATTERN:-eth0|wlan0}

# Succeeds when $1 is numerically greater than $2 (float-safe, no bc needed).
exceeds() {
  [[ -n "${1:-}" ]] && awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Lists listening TCP/UDP sockets; falls back to ss where net-tools is absent.
list_listening_sockets() {
  if command -v netstat >/dev/null 2>&1; then
    netstat -tuln
  else
    ss -tuln
  fi
}

# Prints interfaces, sockets, traffic counters, then the probe-based checks.
monitor_network() {
  echo "=== 网络监控信息 ==="

  echo "网络接口状态:"
  ip addr show

  echo "网络连接状态:"
  list_listening_sockets

  echo "网络流量统计:"
  cat /proc/net/dev

  echo "网络性能测试:"
  test_network_performance

  echo "网络健康检查:"
  check_network_health
}

test_network_performance() {
  echo "网络性能测试:"

  echo "测试网络延迟:"
  ping -c 4 "$PING_TARGET"

  echo "测试网络带宽:"
  if command -v iperf3 >/dev/null 2>&1; then
    echo "iperf3可用于带宽测试"
  else
    echo "iperf3未安装,请先安装: apt-get install iperf3"
  fi
}

check_network_health() {
  echo "网络健康检查:"

  echo "网络错误检查:"
  dmesg | grep -i "network\|eth\|wlan" | tail -10

  echo "网络丢包检查:"
  ping -c 10 "$PING_TARGET" | grep -E "(packet loss|rtt)"
}

# Warns when average RTT > 100 ms or packet loss > 5%, from one 10-packet run.
analyze_network_performance() {
  local ping_output latency packet_loss
  echo "=== 网络性能分析 ==="

  ping_output=$(ping -c 10 "$PING_TARGET" 2>&1)

  # "rtt min/avg/max/mdev = a/b/c/d ms": split on "/", the average is field 5.
  latency=$(awk -F'/' '/^(rtt|round-trip)/ { print $5; exit }' <<<"$ping_output")
  if exceeds "$latency" 100; then
    echo "警告: 网络延迟过高 (${latency}ms)"
  fi

  # Match the number directly before "% packet loss": a fixed field index
  # breaks when ping inserts "+N errors", and loss may be fractional.
  packet_loss=$(grep -oE '[0-9.]+% packet loss' <<<"$ping_output" | cut -d'%' -f1)
  if exceeds "$packet_loss" 5; then
    echo "警告: 网络丢包率过高 (${packet_loss}%)"
  fi

  check_network_bandwidth
}

check_network_bandwidth() {
  echo "检查网络带宽:"
  echo "网络接口统计:"
  grep -E "(${NET_IFACE_PATTERN})" /proc/net/dev
}

main() {
  case "${1:-}" in
    monitor) monitor_network ;;
    analyze) analyze_network_performance ;;
    *)
      echo "Usage: $0 {monitor|analyze}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "5.2 网络性能分析工具" — wrappers around network analysis tools.
#
# Usage: <script> {netstat|ss|iftop|nethogs|tcpdump|wireshark|iperf3}

# Succeeds when the command named by $1 is installed.
have() {
  command -v "$1" >/dev/null 2>&1
}

monitor_with_netstat() {
  echo "使用netstat命令监控网络:"
  netstat -tuln
}

monitor_with_ss() {
  echo "使用ss命令监控网络:"
  ss -tuln
}

monitor_with_iftop() {
  echo "使用iftop命令监控网络:"
  if have iftop; then
    # -n only disables DNS lookups and takes no value, so the stray "1" was
    # wrong. -t -s 1 prints one second of text output and exits.
    iftop -n -t -s 1
  else
    echo "iftop未安装,请先安装: apt-get install iftop"
  fi
}

monitor_with_nethogs() {
  echo "使用nethogs命令监控网络:"
  if have nethogs; then
    # Refreshes every second; kept as in the article.
    nethogs -d 1
  else
    echo "nethogs未安装,请先安装: apt-get install nethogs"
  fi
}

# Only prints usage guidance; capturing needs an interface and privileges.
monitor_with_tcpdump() {
  echo "使用tcpdump命令监控网络:"
  if have tcpdump; then
    echo "tcpdump可用于网络包分析"
    echo "示例: tcpdump -i eth0 -n"
  else
    echo "tcpdump未安装,请先安装: apt-get install tcpdump"
  fi
}

monitor_with_wireshark() {
  echo "使用wireshark命令监控网络:"
  if have wireshark; then
    echo "wireshark可用于网络包分析"
  else
    echo "wireshark未安装,请先安装: apt-get install wireshark"
  fi
}

# Only prints usage guidance; a bandwidth test needs a peer running iperf3 -s.
test_with_iperf3() {
  echo "使用iperf3测试网络性能:"
  if have iperf3; then
    echo "iperf3可用于网络带宽测试"
    echo "服务器端: iperf3 -s"
    echo "客户端: iperf3 -c server_ip"
  else
    echo "iperf3未安装,请先安装: apt-get install iperf3"
  fi
}

main() {
  case "${1:-}" in
    netstat)   monitor_with_netstat ;;
    ss)        monitor_with_ss ;;
    iftop)     monitor_with_iftop ;;
    nethogs)   monitor_with_nethogs ;;
    tcpdump)   monitor_with_tcpdump ;;
    wireshark) monitor_with_wireshark ;;
    iperf3)    test_with_iperf3 ;;
    *)
      echo "Usage: $0 {netstat|ss|iftop|nethogs|tcpdump|wireshark|iperf3}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "6. 系统监控工具集成 / 6.1 综合监控脚本" — combined monitor.
#
# Usage: <script> {monitor|analyze}
# Environment (optional):
#   PING_TARGET         host used for the latency probe (default 8.8.8.8)
#   NET_IFACE_PATTERN   ERE of interfaces to report (default "eth0|wlan0")

PING_TARGET=${PING_TARGET:-8.8.8.8}
NET_IFACE_PATTERN=${NET_IFACE_PATTERN:-eth0|wlan0}

# Succeeds when $1 is numerically greater than $2 (float-safe, no bc needed).
exceeds() {
  [[ -n "${1:-}" ]] && awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Prints "<total> <idle+iowait>" jiffies from the aggregate "cpu" line.
cpu_snapshot() {
  awk '/^cpu / {
    total = 0
    for (i = 2; i <= 9 && i <= NF; i++) total += $i
    print total, $5 + $6
    exit
  }' /proc/stat
}

# Prints the overall CPU busy percentage measured over a one-second window.
# Parsing field 2 of top's "Cpu(s)" line only reported user-space time.
cpu_usage_percent() {
  local total_a idle_a total_b idle_b
  read -r total_a idle_a < <(cpu_snapshot)
  sleep 1
  read -r total_b idle_b < <(cpu_snapshot)
  awk -v dt="$(( ${total_b:-0} - ${total_a:-0} ))" \
      -v di="$(( ${idle_b:-0} - ${idle_a:-0} ))" \
      'BEGIN { if (dt > 0) printf "%.1f\n", (dt - di) * 100 / dt; else print "0.0" }'
}

# Prints every subsystem report in a fixed order.
monitor_system() {
  echo "=== 系统监控信息 ==="

  echo "系统基本信息:"
  uname -a
  uptime

  echo "CPU监控:"
  monitor_cpu

  echo "内存监控:"
  monitor_memory

  echo "磁盘监控:"
  monitor_disk

  echo "网络监控:"
  monitor_network

  echo "进程监控:"
  monitor_processes

  echo "系统负载:"
  monitor_load
}

monitor_cpu() {
  echo "CPU使用率:"
  cpu_usage_percent
  echo "CPU负载:"
  uptime | awk -F'load average:' '{print $2}'
  echo "CPU核心数:"
  nproc
}

monitor_memory() {
  echo "内存使用情况:"
  free -h
  echo "内存使用率:"
  free | awk 'NR == 2 && $2 > 0 { printf "%.2f%%\n", $3 * 100 / $2 }'
}

monitor_disk() {
  echo "磁盘使用情况:"
  df -h
  echo "磁盘I/O统计:"
  if command -v iostat >/dev/null 2>&1; then
    iostat -x 1 1
  else
    echo "iostat未安装,请先安装: apt-get install sysstat"
  fi
}

monitor_network() {
  echo "网络接口状态:"
  ip addr show | grep -E "(${NET_IFACE_PATTERN})"
  echo "网络连接状态:"
  # Line count of the listener table (includes its header lines).
  if command -v netstat >/dev/null 2>&1; then
    netstat -tuln | wc -l
  else
    ss -tuln | wc -l
  fi
}

monitor_processes() {
  echo "进程数量:"
  ps aux | wc -l
  echo "进程CPU使用TOP5:"
  ps aux --sort=-%cpu | head -6
  echo "进程内存使用TOP5:"
  ps aux --sort=-%mem | head -6
}

monitor_load() {
  echo "系统负载:"
  uptime
  echo "系统负载详情:"
  cat /proc/loadavg
}

# Warns when load > cores, memory > 80%, root disk > 80% or RTT > 100 ms.
analyze_system_performance() {
  local load_avg cpu_cores memory_usage disk_usage latency
  echo "=== 系统性能分析 ==="

  # First field of /proc/loadavg is the 1-minute load average.
  read -r load_avg _ < /proc/loadavg
  cpu_cores=$(nproc)
  if exceeds "$load_avg" "$cpu_cores"; then
    echo "警告: 系统负载过高 ($load_avg > $cpu_cores)"
  fi

  memory_usage=$(free | awk 'NR == 2 && $2 > 0 { printf "%.2f", $3 * 100 / $2 }')
  if exceeds "$memory_usage" 80; then
    echo "警告: 内存使用率过高 (${memory_usage}%)"
  fi

  # Query "/" explicitly: row 2 of plain `df -h` is often a tmpfs/udev mount.
  disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  if exceeds "$disk_usage" 80; then
    echo "警告: 磁盘使用率过高 (${disk_usage}%)"
  fi

  # "rtt min/avg/max/mdev = a/b/c/d ms": split on "/", the average is field 5.
  # Stays empty when the target is unreachable; exceeds then reports false.
  latency=$(ping -c 4 "$PING_TARGET" 2>/dev/null \
    | awk -F'/' '/^(rtt|round-trip)/ { print $5; exit }')
  if exceeds "$latency" 100; then
    echo "警告: 网络延迟过高 (${latency}ms)"
  fi
}

main() {
  case "${1:-}" in
    monitor) monitor_system ;;
    analyze) analyze_system_performance ;;
    *)
      echo "Usage: $0 {monitor|analyze}"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "6.2 监控告警系统" — threshold checks that emit alerts.
#
# Usage: <script> alert
# Environment (optional):
#   PING_TARGET   host used for the latency probe (default 8.8.8.8)
# send_alert only prints the alert; wire it to a real channel as needed.

PING_TARGET=${PING_TARGET:-8.8.8.8}

# Succeeds when $1 is numerically greater than $2 (float-safe, no bc needed).
exceeds() {
  [[ -n "${1:-}" ]] && awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Prints "<total> <idle+iowait>" jiffies from the aggregate "cpu" line.
cpu_snapshot() {
  awk '/^cpu / {
    total = 0
    for (i = 2; i <= 9 && i <= NF; i++) total += $i
    print total, $5 + $6
    exit
  }' /proc/stat
}

# Runs every alert check in order.
alert_system() {
  echo "=== 系统告警检查 ==="
  check_cpu_alert
  check_memory_alert
  check_disk_alert
  check_network_alert
  check_load_alert
}

# Alerts when CPU busy time over a one-second window exceeds 80%.
# Parsing field 2 of top's "Cpu(s)" line only reported user-space time.
check_cpu_alert() {
  local total_a idle_a total_b idle_b cpu_usage
  read -r total_a idle_a < <(cpu_snapshot)
  sleep 1
  read -r total_b idle_b < <(cpu_snapshot)
  cpu_usage=$(awk -v dt="$(( ${total_b:-0} - ${total_a:-0} ))" \
                  -v di="$(( ${idle_b:-0} - ${idle_a:-0} ))" \
    'BEGIN { if (dt > 0) printf "%.1f", (dt - di) * 100 / dt; else printf "0.0" }')
  if exceeds "$cpu_usage" 80; then
    echo "告警: CPU使用率过高 (${cpu_usage}%)"
    send_alert "CPU使用率过高" "${cpu_usage}%"
  fi
}

# Alerts when used memory exceeds 80% of total.
check_memory_alert() {
  local memory_usage
  memory_usage=$(free | awk 'NR == 2 && $2 > 0 { printf "%.2f", $3 * 100 / $2 }')
  if exceeds "$memory_usage" 80; then
    echo "告警: 内存使用率过高 (${memory_usage}%)"
    send_alert "内存使用率过高" "${memory_usage}%"
  fi
}

# Alerts when the root filesystem is more than 80% full.
check_disk_alert() {
  local disk_usage
  # Query "/" explicitly: row 2 of plain `df -h` is often a tmpfs/udev mount.
  disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  if exceeds "$disk_usage" 80; then
    echo "告警: 磁盘使用率过高 (${disk_usage}%)"
    send_alert "磁盘使用率过高" "${disk_usage}%"
  fi
}

# Alerts when the average round-trip time to PING_TARGET exceeds 100 ms.
check_network_alert() {
  local latency
  # "rtt min/avg/max/mdev = a/b/c/d ms": split on "/", the average is field 5.
  # Stays empty when the target is unreachable; exceeds then reports false.
  latency=$(ping -c 4 "$PING_TARGET" 2>/dev/null \
    | awk -F'/' '/^(rtt|round-trip)/ { print $5; exit }')
  if exceeds "$latency" 100; then
    echo "告警: 网络延迟过高 (${latency}ms)"
    send_alert "网络延迟过高" "${latency}ms"
  fi
}

# Alerts when the 1-minute load average exceeds the number of cores.
check_load_alert() {
  local load_avg cpu_cores
  read -r load_avg _ < /proc/loadavg
  cpu_cores=$(nproc)
  if exceeds "$load_avg" "$cpu_cores"; then
    echo "告警: 系统负载过高 ($load_avg > $cpu_cores)"
    send_alert "系统负载过高" "$load_avg > $cpu_cores"
  fi
}

# Prints one alert line.
# Arguments: $1 - alert type; $2 - offending value.
send_alert() {
  local alert_type=$1
  local alert_value=$2
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "发送告警: $alert_type - $alert_value - $timestamp"
}

main() {
  case "${1:-}" in
    alert) alert_system ;;
    *)
      echo "Usage: $0 alert"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "7. 性能优化最佳实践 / 7.1 系统优化策略" — applies tuning.
#
# Usage: <script> optimize
# WARNING: this modifies the running system through sudo. It sets the CPU
# governor, appends to /etc/sysctl.conf and /etc/security/limits.conf, and
# reloads sysctl settings.

readonly SYSCTL_CONF=/etc/sysctl.conf
readonly LIMITS_CONF=/etc/security/limits.conf

# Appends a line to a file unless that exact line is already present, so
# repeated runs do not pile up duplicates. The line is echoed either way.
# Arguments: $1 - target file; $2 - line to ensure.
append_once() {
  local file=$1 line=$2
  if grep -qxF -- "$line" "$file" 2>/dev/null; then
    echo "$line"
  else
    echo "$line" | sudo tee -a "$file"
  fi
}

# Runs every optimisation step in order.
optimize_system() {
  echo "=== 系统优化策略 ==="
  optimize_cpu
  optimize_memory
  optimize_disk
  optimize_network
  optimize_system_parameters
  optimize_services
}

optimize_cpu() {
  local -a governor_files
  echo "CPU优化:"

  echo "设置CPU频率调节器为performance模式:"
  governor_files=(/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor)
  # An unmatched glob stays literal; skip the write when cpufreq is absent.
  if [[ -e "${governor_files[0]}" ]]; then
    echo performance | sudo tee "${governor_files[@]}"
  else
    echo "CPU频率调节器信息不可用"
  fi

  echo "优化CPU亲和性:"
  echo "建议使用taskset命令设置进程CPU亲和性"

  echo "优化中断处理:"
  echo "建议使用irqbalance服务平衡中断处理"
}

optimize_memory() {
  echo "内存优化:"

  echo "优化内存参数:"
  append_once "$SYSCTL_CONF" "vm.swappiness=10"
  append_once "$SYSCTL_CONF" "vm.vfs_cache_pressure=50"

  echo "优化内存分配:"
  echo "建议使用内存池技术"

  echo "优化缓存使用:"
  echo "建议合理使用系统缓存"
}

# Advice only; nothing is changed.
optimize_disk() {
  echo "磁盘优化:"

  echo "优化磁盘调度算法:"
  echo "建议使用deadline或noop调度算法"

  echo "优化文件系统参数:"
  echo "建议调整文件系统参数"

  echo "优化磁盘缓存:"
  echo "建议使用磁盘缓存技术"
}

optimize_network() {
  echo "网络优化:"

  echo "优化网络参数:"
  append_once "$SYSCTL_CONF" "net.core.rmem_max=16777216"
  append_once "$SYSCTL_CONF" "net.core.wmem_max=16777216"
  append_once "$SYSCTL_CONF" "net.ipv4.tcp_rmem=4096 87380 16777216"
  append_once "$SYSCTL_CONF" "net.ipv4.tcp_wmem=4096 65536 16777216"

  echo "优化网络连接:"
  echo "建议优化TCP连接参数"

  echo "优化网络缓存:"
  echo "建议使用网络缓存技术"
}

optimize_system_parameters() {
  echo "系统参数优化:"

  echo "优化系统限制:"
  append_once "$LIMITS_CONF" "* soft nofile 65536"
  append_once "$LIMITS_CONF" "* hard nofile 65536"

  echo "优化内核参数:"
  append_once "$SYSCTL_CONF" "kernel.pid_max=4194304"
  append_once "$SYSCTL_CONF" "kernel.threads-max=4194304"

  echo "应用系统参数:"
  sudo sysctl -p
}

# Advice only; nothing is changed.
optimize_services() {
  echo "服务优化:"

  echo "优化系统服务:"
  echo "建议禁用不必要的系统服务"

  echo "优化服务配置:"
  echo "建议优化服务配置参数"

  echo "优化服务启动:"
  echo "建议优化服务启动顺序"
}

main() {
  case "${1:-}" in
    optimize) optimize_system ;;
    *)
      echo "Usage: $0 optimize"
      exit 1
      ;;
  esac
}

main "$@"
#!/bin/bash
# Article section "7.2 监控最佳实践" — prints a monitoring best-practice checklist.
#
# Usage: <script> practices
# Purely informational: each function prints one static list.

# Prints every checklist in order.
monitoring_best_practices() {
  echo "=== 监控最佳实践 ==="
  select_monitoring_metrics
  set_monitoring_frequency
  set_alert_thresholds
  store_monitoring_data
  visualize_monitoring_data
  handle_monitoring_alerts
}

select_monitoring_metrics() {
  cat <<'EOF'
监控指标选择:
1. CPU使用率、负载、温度
2. 内存使用率、交换分区使用率
3. 磁盘使用率、I/O性能、健康状态
4. 网络延迟、带宽、丢包率
5. 进程状态、资源占用
6. 系统负载、响应时间
EOF
}

set_monitoring_frequency() {
  cat <<'EOF'
监控频率设置:
1. 实时监控: 1秒间隔
2. 短期监控: 1分钟间隔
3. 中期监控: 5分钟间隔
4. 长期监控: 1小时间隔
5. 历史监控: 1天间隔
EOF
}

set_alert_thresholds() {
  cat <<'EOF'
告警阈值设置:
1. CPU使用率: 80%
2. 内存使用率: 80%
3. 磁盘使用率: 80%
4. 网络延迟: 100ms
5. 系统负载: CPU核心数
6. 进程数量: 1000
EOF
}

store_monitoring_data() {
  cat <<'EOF'
监控数据存储:
1. 使用时间序列数据库
2. 设置数据保留策略
3. 定期清理历史数据
4. 备份重要监控数据
5. 使用数据压缩技术
EOF
}

visualize_monitoring_data() {
  cat <<'EOF'
监控数据可视化:
1. 使用Grafana等可视化工具
2. 创建监控仪表板
3. 设置数据刷新频率
4. 使用图表展示趋势
5. 设置告警可视化
EOF
}

handle_monitoring_alerts() {
  cat <<'EOF'
监控告警处理:
1. 设置告警级别
2. 配置告警通知方式
3. 设置告警抑制规则
4. 建立告警处理流程
5. 定期回顾告警情况
EOF
}

main() {
  case "${1:-}" in
    practices) monitoring_best_practices ;;
    *)
      echo "Usage: $0 practices"
      exit 1
      ;;
  esac
}

main "$@"
8. 最佳实践总结
8.1 监控策略
全面监控 : 监控所有关键系统指标
实时监控 : 实时监控系统状态
历史监控 : 保存历史监控数据
告警监控 : 设置合理的告警阈值
可视化监控 : 使用可视化工具展示监控数据
8.2 性能优化
硬件优化 : 使用高性能硬件
软件优化 : 优化软件配置和参数
系统优化 : 优化系统内核参数
应用优化 : 优化应用程序性能
网络优化 : 优化网络配置和参数
8.3 故障处理
预防为主 : 通过监控预防故障发生
快速响应 : 快速响应和处理故障
根因分析 : 深入分析故障根本原因
持续改进 : 持续改进监控和优化策略
知识积累 : 积累故障处理经验
通过合理的监控策略和优化方法,可以有效提升系统性能,预防故障发生,保障系统稳定运行。