简单粗暴地监控系统状态，并将结果保存到 LOG 文件：
vim monitor.bash
#!/bin/bash
# Shared setup for the monitors below: resolve the user name, create the log
# directory, and take one timestamp that suffixes every log file of this run.

# USER comes from the login environment, not from bash itself, so it can be
# empty. Fall back to the effective user name so the log path and the
# `pkill -u` calls further down never receive an empty value.
USER=${USER:-$(id -un)}
LOG_DIR="/home/${USER}/log"
# Fail fast: without the directory every redirection below would fail.
mkdir -p -- "${LOG_DIR}" || {
  printf 'monitor: cannot create log directory %s\n' "${LOG_DIR}" >&2
  exit 1
}
TIME=$(date '+%Y%m%d_%H%M%S')
# Background samplers. Each one is restarted from scratch: a copy left over
# from a previous run is first sent SIGINT (-2), matched against its full
# command line (-f) and limited to processes owned by ${USER} (-u); -e prints
# what was signalled. pkill exits non-zero when nothing matched, which is
# expected on a first run and deliberately not treated as an error.

# monitor cpu
pkill -2 -f -e -u "${USER}" "^mpstat -P ALL 1"
mpstat -P ALL 1 > "${LOG_DIR}/monitor_cpu_stat.log.${TIME}" 2> /dev/null &

# monitor io & pagefault
# Uses the same -e/-u flags as every other pkill here so that only this
# user's pidstat is signalled.
pkill -2 -f -e -u "${USER}" "^pidstat 1 -dr -l"
pidstat 1 -dr -l &> "${LOG_DIR}/monitor_io_pagefault.log.${TIME}" &

# monitor pid
# The pattern requires "-l" directly after "pidstat 1 ", so it does not match
# the "pidstat 1 -dr -l" sampler started just above.
pkill -2 -f -e -u "${USER}" "^pidstat 1 -l"
pidstat 1 -l > "${LOG_DIR}/monitor_pid_stat.log.${TIME}" 2> /dev/null &

# monitor disk io
pkill -2 -f -e -u "${USER}" "^iostat -xyzdmt 1"
iostat -xyzdmt 1 > "${LOG_DIR}/monitor_disk_io_stat.log.${TIME}" 2> /dev/null &

# monitor network
pkill -2 -f -e -u "${USER}" "^sar -n DEV 1"
sar -n DEV 1 > "${LOG_DIR}/monitor_network_dev.log.${TIME}" 2> /dev/null &

# monitor gpu
# NOTE(review): "^nvidia-smi" matches every nvidia-smi command line owned by
# this user, not only the dmon sampler; narrow the pattern if other
# nvidia-smi invocations must survive a restart.
pkill -2 -f -e -u "${USER}" "^nvidia-smi"
# dmon writes its samples to the -f file itself; stdout/stderr are discarded.
nvidia-smi dmon -s umvt -o T -f "${LOG_DIR}/gpu_dmon.log.${TIME}" &> /dev/null &
# monitor other
# Foreground polling loop: every pass appends a cgroup usage snapshot and a
# memory snapshot to their logs, and periodically a cgroup tree listing.
# Runs until the script is killed.
cgroup_log="${LOG_DIR}/monitor_cgroup.log.${TIME}"
cgls_log="${LOG_DIR}/monitor_cgls.log.${TIME}"
buddy_log="${LOG_DIR}/monitor_buddyinfo.log.${TIME}"

# Start the pass counter explicitly so that a `count` inherited from the
# caller's environment cannot change the cadence or break the arithmetic.
count=0

while true; do
  # monitor cgroup
  date >> "${cgroup_log}"
  # Batch mode, 10 iterations per pass; errors are discarded.
  systemd-cgtop -b -n 10 >> "${cgroup_log}" 2> /dev/null
  sleep 1

  # The cgroup tree is dumped on the 7th pass and then on every 6th pass
  # after that, because the counter restarts at 1 rather than 0.
  ((count += 1))
  if ((count > 6)); then
    date >> "${cgls_log}"
    systemd-cgls --no-pager -l >> "${cgls_log}"
    count=1
  fi

  # monitor buddyinfo
  date '+%Y%m%d %H:%M:%S' >> "${buddy_log}"
  cat /proc/vmstat >> "${buddy_log}"
  cat /proc/buddyinfo >> "${buddy_log}" 2> /dev/null
  # Disabled collectors, kept for reference:
  # cat /proc/net/softnet_stat >> "${buddy_log}"
  # awk '{for (i=1; i<=NF; i++) printf strtonum("0x" $i) (i==NF?"\n":" ")}' /proc/net/softnet_stat | column -t >> "${buddy_log}"
  # cat /proc/interrupts | egrep "CPU|axon|basa" >> "${buddy_log}"
  cat /proc/meminfo >> "${buddy_log}"
  # One line of context before and eight after every "pages free" line.
  grep -A8 -B1 "pages free" /proc/zoneinfo >> "${buddy_log}"
  # cat /proc/vmstat >> "${buddy_log}"
  sleep 1
done
yan 23.12.27