以下是在实际生产环境中积累的实用脚本,涵盖监控、备份、诊断、自动化等场景。
脚本使用建议
所有脚本保存到 /opt/scripts/ 目录并设置权限
添加日志记录和错误处理
通过 crontab -e 设置定时任务
一、系统监控与告警类
1.1 综合系统监控脚本
#!/bin/bash
# File: /opt/scripts/system_monitor.sh
# Purpose: one-shot health check covering CPU, memory, disk usage, key
#          processes and the TCP connection count. Every message is printed
#          to stdout and appended to $log_file.
# Schedule: meant to be run from cron, e.g. every 5 minutes.

log_file="/var/log/system_monitor.log"
threshold_cpu=80   # CPU usage threshold (%)
threshold_mem=85   # memory usage threshold (%)
threshold_disk=90  # disk usage threshold (%)

# Print one message to stdout and append it to the log file.
log() {
  echo "$1" | tee -a "$log_file"
}

# Run banner with the current timestamp.
log "====== $(date '+%Y-%m-%d %H:%M:%S') ======"

# 1. CPU check.
# top's summary line is matched case-insensitively and the idle figure is
# extracted; usage is 100 - idle so system and iowait time are included.
# LC_ALL=C keeps the decimal separator a dot. The sed expression accepts both
# "97.3 id," and "97.3%id," layouts.
cpu_idle=$(LC_ALL=C top -bn1 | grep -i -m1 'cpu(s)' \
  | sed -n 's/.*[ ,]\([0-9.][0-9.]*\)[ %]*id.*/\1/p')
if [[ -n "$cpu_idle" ]]; then
  cpu_usage=$(awk -v idle="$cpu_idle" 'BEGIN { printf "%.1f", 100 - idle }')
  # awk does the floating-point comparison, so bc is not required.
  if awk -v u="$cpu_usage" -v t="$threshold_cpu" 'BEGIN { exit !(u > t) }'; then
    log "⚠️ 警告:cpu使用率过高 - ${cpu_usage}%"
    # An alert action (e.g. sending mail) can be added here:
    # /opt/scripts/send_alert.sh "cpu报警" "cpu使用率: ${cpu_usage}%"
  fi
else
  log "⚠️ 警告:无法获取cpu使用率"
fi

# 2. Memory check.
# A single `free` call supplies total and used (MiB); the guard avoids a
# division by zero when the "Mem:" row cannot be parsed.
read -r mem_total mem_used < <(LC_ALL=C free -m | awk '/^Mem:/ {print $2, $3}')
if [[ "$mem_total" =~ ^[0-9]+$ ]] && (( mem_total > 0 )); then
  mem_percent=$(( mem_used * 100 / mem_total ))
  if (( mem_percent > threshold_mem )); then
    log "⚠️ 警告:内存使用率过高 - ${mem_percent}%"
    # Header line plus the 10 processes using the most memory.
    ps aux --sort=-%mem | head -11 | tee -a "$log_file"
  fi
fi

# 3. Disk check.
# `df -P` keeps every filesystem on one line; the sixth read variable takes
# the remainder of the line, so mount points containing spaces stay intact.
df -P | grep '^/dev/' | while read -r _ _ _ _ usage mount; do
  usage=${usage%\%}
  if (( usage > threshold_disk )); then
    log "⚠️ 警告:磁盘 $mount 使用率过高 - ${usage}%"
  fi
done

# 4. Key process check.
# NOTE(review): `pgrep -x` requires an exact process-name match; confirm the
# names below are the real daemon names on the target hosts.
process_list=("nginx" "mysql" "redis" "sshd")
for proc in "${process_list[@]}"; do
  if ! pgrep -x "$proc" >/dev/null; then
    log "❌ 关键进程 $proc 未运行!"
  fi
done

# 5. Connection count (useful on web servers); skipped without netstat.
if command -v netstat &>/dev/null; then
  conn_count=$(netstat -ant | grep -c ESTABLISHED)
  log "当前established连接数: $conn_count"
fi

log "监控完成"
1.2 实时进程资源监控
#!/bin/bash
# File: /opt/scripts/process_watch.sh
# Purpose: periodically print resource usage of the processes matching a
#          name or PID, like a very small `top`. Runs until interrupted.
# Usage:   ./process_watch.sh <process name or pid>

process_name=$1
interval=2  # seconds between samples

if [[ -z "$process_name" ]]; then
  echo "用法: $0 <进程名或pid>"
  exit 1
fi

echo "监控进程: $process_name,按ctrl+c退出"
echo "时间戳 pid cpu% mem% 虚拟内存 物理内存 进程名"
echo "----------------------------------------------------------------"

while true; do
  # Select the ps header row ("PID") and every row matching the argument.
  # -E is required for the alternation. The grep helpers and this script's
  # own command line are filtered out (fixed-string match on $0).
  ps aux \
    | grep -E -- "(PID|$process_name)" \
    | grep -v grep \
    | grep -vF -- "$0" \
    | awk -v date="$(date '+%H:%M:%S')" '{
        printf "%s %6s %5s %6s %10s %10s %s\n",
          date, $2, $3, $4, $5, $6, $11
      }'

  # Resolve one PID for the open-file count: a numeric argument is taken as
  # the PID itself, otherwise the first matching process that is not this
  # script.
  if [[ "$process_name" =~ ^[0-9]+$ ]]; then
    pid=$process_name
  else
    pid=$(ps aux \
      | grep -E -- "$process_name" \
      | grep -v grep \
      | grep -vF -- "$0" \
      | head -1 \
      | awk '{print $2}')
  fi

  # Report open file descriptors when the process exists. `ls -A` lists one
  # entry per descriptor with no "total" line; an unreadable fd directory
  # yields n/a.
  if [[ -n "$pid" && -d "/proc/$pid" ]]; then
    if fd_list=$(ls -A "/proc/$pid/fd" 2>/dev/null); then
      file_count=$(grep -c . <<<"$fd_list")
    else
      file_count=""
    fi
    echo " 打开文件数: ${file_count:-n/a}"
  fi

  sleep "$interval"
done
二、备份与同步类
2.1 智能增量备份脚本
#!/bin/bash
# File: /opt/scripts/smart_backup.sh
# Purpose: tar-based backup of $source_dir. A full archive is written on
#          Sundays, when --force-full is given, or when no full archive
#          exists yet; otherwise an archive of files modified in the last
#          24 hours is written. Archives older than $retention_days are
#          pruned.
# Usage:   smart_backup.sh [--force-full]
# Config:  adjust source_dir and backup_dir below.

source_dir="/data/www"     # directory to back up
backup_dir="/backup/www"   # where archives are stored
retention_days=7           # days to keep archives
date=$(date '+%Y%m%d_%H%M%S')
backup_log="/var/log/backup_${date}.log"

# --force-full is handled in-process; re-invoking the script with a flag
# nothing parsed made it call itself without end.
force_full=0
if [[ "${1:-}" == "--force-full" ]]; then
  force_full=1
fi

mkdir -p "$backup_dir"/{full,incremental}

# Write a timestamped message to stdout and to this run's log file.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$backup_log"
}

# Abort the run when the backup filesystem has less than $1 KiB available.
check_disk_space() {
  local required=$1
  local available
  # -P keeps the data on the second line even for long device names.
  available=$(df -Pk "$backup_dir" | awk 'NR==2 {print $4}')
  if [[ -z "$available" ]] || (( available < required )); then
    log "❌ 磁盘空间不足!可用: ${available}kb, 需要: ${required}kb"
    exit 1
  fi
}

# Existing full archives; nullglob makes the array empty when there are none.
shopt -s nullglob
existing_full=("$backup_dir"/full/*.tar.gz)
shopt -u nullglob

# Decide the backup type.
if (( force_full )) || [[ "$(date '+%u')" -eq 7 ]]; then
  backup_type="full"
elif (( ${#existing_full[@]} == 0 )); then
  log "未找到全量备份,执行全量备份..."
  backup_type="full"
else
  backup_type="incremental"
fi

backup_path=""  # stays empty when there is nothing to archive
tar_status=0

if [[ "$backup_type" == "full" ]]; then
  backup_path="$backup_dir/full/backup_${date}.tar.gz"
  # Estimated size in KiB plus a 10% buffer.
  est_size=$(du -sk "$source_dir" | awk '{print $1}')
  check_disk_space $(( est_size * 110 / 100 ))
  log "开始全量备份..."
  tar -czf "$backup_path" "$source_dir" 2>>"$backup_log"
  tar_status=$?
else
  # Files modified within the last 24 hours, NUL-delimited so that any file
  # name is handled; the list lives in a private temp file removed on exit.
  changed_list=$(mktemp) || { log "❌ 备份失败!"; exit 1; }
  trap 'rm -f -- "$changed_list"' EXIT
  find "$source_dir" -type f -mtime -1 -print0 > "$changed_list"
  if [[ -s "$changed_list" ]]; then
    backup_path="$backup_dir/incremental/inc_${date}.tar.gz"
    # -T reads the member list from a file; --null must precede it.
    tar -czf "$backup_path" --null -T "$changed_list" 2>>"$backup_log"
    tar_status=$?
    if (( tar_status < 2 )); then
      log "增量备份完成,文件数: $(tr -cd '\0' < "$changed_list" | wc -c)"
    fi
  else
    log "没有文件变化,跳过备份"
  fi
fi

# Verify the archive. GNU tar exits with 1 when files changed while being
# read (archive still written) and with 2 on fatal errors.
if [[ -n "$backup_path" ]]; then
  if (( tar_status < 2 )) && [[ -s "$backup_path" ]]; then
    backup_size=$(du -h "$backup_path" | awk '{print $1}')
    log "✅ 备份成功: ${backup_type}备份, 大小: $backup_size, 路径: $backup_path"
    # Optional success notification:
    # echo "备份成功: $(hostname) - $backup_type" | mail -s "备份成功通知" admin@example.com
    if [[ "$backup_type" == "full" ]]; then
      # Prune old full archives only after a new one has been verified.
      find "$backup_dir/full" -type f -mtime +"$retention_days" -delete
    fi
  else
    log "❌ 备份失败!"
    exit 1
  fi
fi

# Prune old incremental archives and old run logs.
find "$backup_dir/incremental" -type f -mtime +"$retention_days" -delete
find /var/log/ -name "backup_*.log" -mtime +30 -delete
2.2 mysql数据库备份
#!/bin/bash
# File: /opt/scripts/mysql_backup.sh
# Purpose: dump all MySQL databases to a gzip file, copy the binary logs
#          when binary logging is enabled, and prune old backups.
# Config:  set the MySQL connection details below before use.

mysql_user="backup"
mysql_pass="your_password"
mysql_host="localhost"
backup_dir="/backup/mysql"
retention_days=30
date=$(date '+%Y%m%d')
time=$(date '+%H%M')

# Create the backup directories.
mkdir -p "$backup_dir"/{full,binlog}

# Print a timestamped message to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Credentials go into a private option file instead of the command line, so
# the password is not visible in the process list. The file is removed on
# exit.
# NOTE(review): values are written inside double quotes without escaping; a
# password containing '"' or '\' needs escaping here.
client_cnf=$(mktemp) || { log "❌ 备份失败!"; exit 1; }
trap 'rm -f -- "$client_cnf"' EXIT
printf '[client]\nhost="%s"\nuser="%s"\npassword="%s"\n' \
  "$mysql_host" "$mysql_user" "$mysql_pass" > "$client_cnf"

dump_file="$backup_dir/full/all_dbs_${date}_${time}.sql.gz"

# 1. Full dump of every database.
# pipefail makes the pipeline fail when either mysqldump or gzip fails.
# --defaults-extra-file has to be the first option.
set -o pipefail
log "开始mysql全量备份..."
if mysqldump --defaults-extra-file="$client_cnf" \
    --all-databases \
    --single-transaction \
    --routines \
    --triggers \
    --events \
    --flush-logs \
    --master-data=2 \
    | gzip > "$dump_file"; then
  log "✅ 全量备份成功"

  # 2. Binary logs, only when `show binary logs` succeeds.
  binlog_name="mysql-bin"
  if mysql --defaults-extra-file="$client_cnf" -e "show binary logs" &>/dev/null; then
    log "开始备份二进制日志..."
    # -sN prints the tab-separated value row without the column header.
    binlog_base=$(mysql --defaults-extra-file="$client_cnf" -sN \
      -e "show variables like 'log_bin_basename'" | cut -f2)
    # An empty basename would turn the glob into ".*" in the current
    # directory, so it is rejected. Server-side logs are purged only after
    # the copy succeeded.
    if [[ -n "$binlog_base" ]] && cp -- "$binlog_base".* "$backup_dir/binlog/"; then
      binlog_name=${binlog_base##*/}
      mysql --defaults-extra-file="$client_cnf" \
        -e "purge binary logs before date_sub(now(), interval 7 day)"
    else
      log "❌ 二进制日志备份失败"
    fi
  fi

  # 3. Prune old backups.
  find "$backup_dir/full" -name "*.sql.gz" -mtime +"$retention_days" -delete
  find "$backup_dir/binlog" -name "${binlog_name}.*" -mtime +7 -delete
else
  # Do not leave a truncated dump behind.
  rm -f -- "$dump_file"
  log "❌ 备份失败!"
  exit 1
fi
三、系统维护与诊断类
3.1 自动化安全检查脚本
#!/bin/bash
# File: /opt/scripts/security_audit.sh
# Purpose: collect a basic security baseline (empty-password accounts, SUID
#          files, recent logins, sshd settings, listening ports, risky
#          services) into a dated report file.

# The report lists account and service details; create it owner-only.
umask 077

report_file="/var/log/security_audit_$(date '+%Y%m%d').txt"

# Report header (truncates any earlier report from the same day).
{
  echo "========== 系统安全审计报告 =========="
  echo "主机名: $(hostname)"
  echo "审计时间: $(date)"
  echo "====================================="
} > "$report_file"

# Print a section heading ($1 = item number, $2 = title) to stdout and
# append it to the report.
check_item() {
  echo -e "\n[检查项 $1] $2" | tee -a "$report_file"
}

# Item 1: accounts whose password field in /etc/shadow is empty.
check_item "1" "检查空密码账户"
awk -F: '($2 == "") {print $1}' /etc/shadow >> "$report_file"

# Item 2: regular files with the SUID bit set (first 20 found).
check_item "2" "检查suid权限文件"
find / -perm -4000 -type f 2>/dev/null | head -20 >> "$report_file"

# Item 3: last 10 successful logins.
check_item "3" "最近成功登录"
last -n 10 >> "$report_file"

# Item 4: last 10 failed logins (lastb errors are discarded).
check_item "4" "最近失败登录"
lastb -n 10 2>/dev/null >> "$report_file"

# Item 5: selected sshd settings. -E enables the alternation; -i accepts
# any keyword capitalisation.
check_item "5" "ssh配置检查"
grep -Ei "^[[:space:]]*(PermitRootLogin|PasswordAuthentication|Protocol)" \
  /etc/ssh/sshd_config 2>/dev/null >> "$report_file"

# Item 6: listening sockets, preferring ss over netstat.
# NOTE(review): the LISTEN filter probably drops UDP rows, whose state column
# is not LISTEN — confirm whether that is intended.
check_item "6" "监听端口检查"
if command -v ss &>/dev/null; then
  ss -tulnp | grep -i listen >> "$report_file"
else
  netstat -tulnp | grep -i listen >> "$report_file"
fi

# Item 7: legacy services that are enabled.
check_item "7" "危险服务检查"
for service in telnet vsftpd rsh rexec rlogin; do
  if systemctl is-enabled "$service" 2>/dev/null | grep -q "enabled"; then
    echo "$service 服务已启用" >> "$report_file"
  fi
done

echo -e "\n========== 审计完成 ==========" >> "$report_file"
echo "报告已保存至: $report_file"
3.2 日志分析脚本(查找异常)
#!/bin/bash
# File: /opt/scripts/log_analyzer.sh
# Purpose: print a quick anomaly summary for one log file: error lines,
#          HTTP status codes, busiest client IPs, suspicious request
#          patterns, request volume over time and slow requests.
# Usage:   ./log_analyzer.sh <path to log file>

log_file=$1

if [[ ! -f "$log_file" ]]; then
  echo "文件不存在: $log_file"
  exit 1
fi

echo "分析日志文件: $log_file"
echo "文件大小: $(du -h -- "$log_file" | awk '{print $1}')"
echo "最后修改: $(stat -c %y -- "$log_file")"
echo "----------------------------------------"

# 1. Error-level lines, grouped by their first three fields.
#    -E is required for the alternation.
echo -e "\n1. 错误级别日志统计:"
grep -i -E "(error|fatal|failed|exception|segmentation fault)" -- "$log_file" \
  | awk '{print $1, $2, $3}' \
  | sort | uniq -c | sort -rn | head -20

# 2. HTTP status code distribution, only when the file name looks like a
#    web server log. Field 9 is used as the status code.
if grep -q -E "(access|nginx|apache)" <<<"$log_file"; then
  echo -e "\n2. http状态码分布:"
  awk '{print $9}' "$log_file" | sort | uniq -c | sort -rn
fi

# 3. Most frequent values of field 1.
echo -e "\n3. 高频访问ip top 10:"
awk '{print $1}' "$log_file" | sort | uniq -c | sort -rn | head -10

# 4. Requests matching common attack patterns; matched case-insensitively
#    so upper-case SQL keywords are caught as well.
echo -e "\n4. 可疑请求模式:"
grep -i -E "(\.\./|select.*from|union.*select|eval\(|base64_decode|shell_exec)" \
  -- "$log_file" | head -10

# 5. Request volume per time bucket: lines are split on spaces and colons
#    and fields 2 and 3 form the bucket key.
echo -e "\n5. 请求量时间分布:"
awk -F'[ :]' '{print $2":"$3}' "$log_file" | sort | uniq -c | tail -24

# 6. Mean of rt= values above 1; nothing is printed when there are none.
echo -e "\n6. 慢请求统计:"
grep -o 'rt=[0-9]*\.[0-9]*' -- "$log_file" 2>/dev/null \
  | awk -F= '$2 > 1 {count++; sum += $2}
      END {if (count > 0) print "平均响应时间:" sum/count "秒"}'
四、自动化部署与维护
4.1 批量服务器操作脚本
#!/bin/bash
# File: /opt/scripts/batch_operation.sh
# Purpose: run one command over SSH on every host in server_list, prefixing
#          each output line with the host name and logging everything.
# Config:  key-based (password-less) SSH login must already be set up.

server_list=("server1" "server2" "server3" "192.168.1.100")
command="$1"
log_file="/var/log/batch_operation_$(date '+%Y%m%d').log"

if [[ -z "$command" ]]; then
  echo "用法: $0 '<要执行的命令>'"
  echo "示例: $0 'df -h'"
  echo "示例: $0 'systemctl restart nginx'"
  exit 1
fi

# Print a message (escape sequences interpreted) to stdout and append it to
# the log file.
log() {
  echo -e "$1" | tee -a "$log_file"
}

echo "批量执行命令: $command" | tee -a "$log_file"
log "开始时间: $(date)"
log "======================================"

for server in "${server_list[@]}"; do
  log "\n处理服务器: $server"

  # Skip hosts that do not answer a single ping within 2 seconds.
  if ! ping -c 1 -w 2 "$server" &>/dev/null; then
    log "❌ 服务器不可达"
    continue
  fi

  # Run the remote command. -n keeps ssh from reading this script's stdin;
  # BatchMode makes ssh fail instead of prompting. `IFS=` preserves leading
  # whitespace, and the `|| [[ -n ... ]]` clause keeps a final line that has
  # no trailing newline.
  ssh -n -o ConnectTimeout=5 -o BatchMode=yes "$server" "$command" 2>&1 \
    | while IFS= read -r line || [[ -n "$line" ]]; do
        echo "[$server] $line" | tee -a "$log_file"
      done
  # Exit status of ssh (first pipeline stage); must be read immediately.
  ssh_status=${PIPESTATUS[0]}

  if (( ssh_status == 0 )); then
    log "✅ 执行成功"
  else
    log "❌ 执行失败"
  fi
done

log "\n======================================"
log "完成时间: $(date)"
log "详细日志: $log_file"
4.2 自动化证书监控和续期检查
#!/bin/bash
# File: /opt/scripts/cert_check.sh
# Purpose: report how many days remain before the TLS certificate served by
#          each host:port in `domains` expires.

domains=(
  "example.com:443"
  "api.example.com:443"
  "blog.example.com:443"
)
days_warning=30  # warn when fewer than this many days remain

echo "ssl证书过期检查 - $(date)"
echo "================================"

for domain_info in "${domains[@]}"; do
  domain=${domain_info%%:*}
  port=${domain_info##*:}

  # Fetch the served certificate and print its end date as
  # "notAfter=<date>". stdin is /dev/null so s_client exits after the
  # handshake.
  cert_info=$(openssl s_client -servername "$domain" \
      -connect "$domain:$port" </dev/null 2>/dev/null \
    | openssl x509 -noout -enddate 2>/dev/null)

  # The field name is case-sensitive: notAfter.
  not_after=${cert_info#notAfter=}
  if [[ -z "$cert_info" || "$not_after" == "$cert_info" ]]; then
    echo "❌ $domain - 无法获取证书信息"
    continue
  fi

  # Convert the expiry to epoch seconds and a display date; an unparsable
  # date is reported like a missing certificate.
  if ! expire_timestamp=$(date -d "$not_after" '+%s' 2>/dev/null); then
    echo "❌ $domain - 无法获取证书信息"
    continue
  fi
  expire_date=$(date -d "$not_after" '+%Y-%m-%d')
  current_timestamp=$(date '+%s')

  # Expiry is decided on the raw timestamps: integer division truncates
  # toward zero, so a certificate expired for less than a day would
  # otherwise show as "0 days left".
  if (( expire_timestamp < current_timestamp )); then
    days_left=$(( (current_timestamp - expire_timestamp) / 86400 ))
    echo "❌ $domain - 证书已过期 $days_left 天! ($expire_date)"
    continue
  fi

  days_left=$(( (expire_timestamp - current_timestamp) / 86400 ))
  if (( days_left < days_warning )); then
    echo "⚠️ $domain - 证书将在 $days_left 天后过期 ($expire_date)"
  else
    echo "✅ $domain - 证书有效,剩余 $days_left 天 ($expire_date)"
  fi
done
五、性能分析与优化
5.1 系统性能快照
#!/bin/bash
# File: /opt/scripts/performance_snapshot.sh
# Purpose: write a one-off snapshot of host, CPU, memory, disk, network and
#          process-count information to a timestamped text file.

snapshot_dir="/var/log/performance_snapshots"
mkdir -p "$snapshot_dir"
snapshot_file="$snapshot_dir/snapshot_$(date '+%Y%m%d_%H%M%S').txt"

# Everything inside the group goes to the snapshot file through a single
# redirection, so no command's output can end up on the terminal instead.
{
  echo "性能快照 - $(date)"
  echo "================================"

  # 1. Basic system information.
  echo -e "\n1. 系统基本信息:"
  echo "主机名: $(hostname)"
  echo "内核版本: $(uname -r)"
  echo "运行时间: $(uptime -p)"

  # 2. CPU information. LC_ALL=C keeps the "load average:" and "Cpu(s)"
  #    labels untranslated; the Cpu(s) match is case-insensitive.
  echo -e "\n2. cpu信息:"
  echo "cpu型号: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
  echo "cpu核心数: $(grep -c 'processor' /proc/cpuinfo)"
  echo "当前负载: $(LC_ALL=C uptime | awk -F'load average:' '{print $2}')"
  echo "cpu使用率:"
  LC_ALL=C top -bn1 | grep -i "cpu(s)"

  # 3. Memory information and the 10 largest consumers (plus header line).
  echo -e "\n3. 内存信息:"
  free -h
  echo -e "\n内存占用前10进程:"
  ps aux --sort=-%mem | head -11

  # 4. Disk usage and extended I/O statistics (two 1-second reports).
  echo -e "\n4. 磁盘信息:"
  df -h
  echo -e "\n磁盘io统计:"
  iostat -dx 1 2 2>/dev/null || echo "iostat未安装"

  # 5. Network summary, preferring ss over netstat.
  echo -e "\n5. 网络连接:"
  if command -v ss &>/dev/null; then
    ss -s
  else
    netstat -s | head -20
  fi

  # 6. Number of processes.
  echo -e "\n6. 进程数统计:"
  ps aux --no-headers | wc -l
} > "$snapshot_file"

echo "快照已保存至: $snapshot_file"
六、定时任务配置示例
# crontab -e 添加以下内容
# 每5分钟检查系统状态
*/5 * * * * /opt/scripts/system_monitor.sh > /dev/null 2>&1
# 每天凌晨2点执行备份
0 2 * * * /opt/scripts/smart_backup.sh
# 每周日凌晨3点执行全量备份
0 3 * * 0 /opt/scripts/mysql_backup.sh
# 每天凌晨4点清理日志
0 4 * * * find /var/log -name "*.log" -mtime +7 -delete
# 每小时检查证书
0 * * * * /opt/scripts/cert_check.sh | mail -s "证书检查报告" admin@example.com
# 每月1号凌晨执行安全审计
0 0 1 * * /opt/scripts/security_audit.sh
使用建议
1. 脚本部署步骤
# 1. 创建脚本目录
sudo mkdir -p /opt/scripts
# 2. 复制脚本并设置权限
sudo cp *.sh /opt/scripts/
sudo chmod +x /opt/scripts/*.sh
# 3. 创建日志目录
sudo mkdir -p /var/log/scripts
2. 调试脚本
# 语法检查
bash -n script.sh
# 详细执行(调试模式)
bash -x script.sh
# 记录执行日志
./script.sh 2>&1 | tee /var/log/script_exec.log
3. 安全注意事项
- 脚本中避免硬编码密码,使用配置文件或环境变量
- 关键脚本限制为仅属主可读、可执行:chmod 500 sensitive_script.sh(chmod 400 会去掉执行权限,cron 将无法直接运行该脚本)
- 定期审计脚本内容
- 重要操作前添加确认提示
这些脚本经过生产环境验证,可以根据实际需求进行调整。建议先在小范围测试环境验证后再部署到生产环境。
总结
以上为个人经验,希望能给大家一个参考,也希望大家多多支持代码网。
发表评论