awips2/pythonPackages/qpid/bin/monitor_qpid_host.sh
Steve Harris 2dc4a4f50f 13.2.1-9 baseline
Former-commit-id: d6dc4694f0 [formerly 915d8edd4670b38df0474de72cf19c6d8b9aa1a3]
Former-commit-id: a339cfb704
2013-02-20 14:11:20 -05:00

314 lines
8.5 KiB
Bash

#!/bin/bash
# script to gather information on a qpid host
# 20 July 2012 - Initial script (kpj)
function setupEnv() {
    # Initialise the global settings used by the rest of the script and,
    # on the first run of a new day, reset today's output files and
    # compress yesterday's.
    #
    # Globals written: runTimeDate, nowTimeDate, logDirectory, logName,
    #                  nasHost, nasVolName, lsofCommand, platformName
    local scriptBase yesterdayName myFile

    # Resolve these once so every use below agrees, even if the clock rolls
    # over midnight while the function is running.
    scriptBase=$( basename "$0" .sh )
    yesterdayName=$( date --date='1 day ago' +%A )

    runTimeDate=$( date +"%Y%m%d %H:%M:%S" )
    nowTimeDate=$( date +%A )
    logDirectory=/data/fxa/qpid
    logName=${scriptBase}.${nowTimeDate}.log
    nasHost=nas1
    nasVolName=dataFXA # This is so we can change it for new nas!
    lsofCommand="lsof -Pns -p"
    # Second dash-separated field of the host name.
    platformName=$( hostname | cut -f2 -d'-' )

    # Files are named per weekday, so today's log is "older than" yesterday's
    # only when it is a leftover from a previous week (or, per the bash -ot
    # semantics, when it does not exist yet but yesterday's does).
    if [[ "${logDirectory}/${logName}" -ot "${logDirectory}/${scriptBase}.${yesterdayName}.log" ]]
    then
        for myFile in "${logName}" "${nowTimeDate}-lsof_qpid.out" "${nowTimeDate}-qpid-stat.out" "${nowTimeDate}-netstat.out" "${nowTimeDate}-ipvsadm.out"
        do
            # Start today's file over (it will contain a single newline).
            echo > "${logDirectory}/${myFile}"
        done
        # The glob is deliberately left outside the quotes so it still expands.
        gzip -f "${logDirectory}"/*"${yesterdayName}"*
    fi
}
function color_echo() {
    # Echo a string to stdout wrapped in ANSI color escape sequences.
    #
    # Usage: color_echo $COLOR $BOLD $STRING
    #
    # COLOR: Color string from: black, green, blue, red, yellow, white
    #        (any other value emits no color code)
    # BOLD: 1 = true, 0 = false
    # STRING: String to echo; backslash escapes in it are interpreted
    #
    # Prints a usage error to stdout when an argument is missing or BOLD is
    # not exactly 0 or 1. Always returns 0.
    local sColor=$1
    local sBold=$2
    local sString=$3
    local colorInt

    # BOLD is compared as a string: an arithmetic test would evaluate a
    # non-numeric word as a variable name and silently treat it as 0.
    if [[ -z "${sColor}" || -z "${sBold}" || -z "${sString}" ]] || [[ "${sBold}" != "1" && "${sBold}" != "0" ]]
    then
        # The placeholders are escaped so they are printed literally rather
        # than expanded as (unset) variables.
        echo -e "ERROR IN $FUNCNAME:\t Usage:\t$FUNCNAME \$COLOR \$BOLD \$STRING"
    else
        case "${sColor}" in
            "green" ) colorInt=32 ;;
            "blue" ) colorInt=34 ;;
            "red" ) colorInt=31 ;;
            "yellow" ) colorInt=33 ;;
            "white" ) colorInt=37 ;;
            "black" ) colorInt=30 ;;
            * ) colorInt="" ;;
        esac
        # An empty attribute field is used for "not bold".
        if [[ "${sBold}" == "0" ]]; then sBold="" ; fi
        echo -e "\033[${sBold};${colorInt}m${sString}\033[0m"
    fi
    return 0
}
function echoDate() {
    # Print the "|-- YYYYMMDD HH:MM:SS" timestamp prefix used in the logs,
    # without a trailing newline so the caller can continue the line.
    local stamp
    stamp=$( date +"%Y%m%d %H:%M:%S" )
    printf '%s' "|-- ${stamp}"
}
function echoFail() {
    # Print a timestamped failure line: the echoDate prefix followed by the
    # message ($1), tab-indented, in bold red.
    # Returns echoDate's status if it fails, otherwise color_echo's status.
    local failText="\t$1"
    echoDate || return
    color_echo red 1 "${failText}"
}
function cleanup() {
    # Unmount /data/fxa, but only when the hadToMount flag is set
    # (i.e. this script performed the mount itself).
    [[ -n "${hadToMount}" ]] || return 0
    umount /data/fxa
}
function runlsof() {
    # Append a timestamped header and the open-file listing of the qpidd
    # process(es) to today's lsof output file.
    #
    # Globals read: logDirectory, nowTimeDate, lsofCommand, qpidPid
    # Returns: 0 if the lsof command succeeded, 1 otherwise
    local lsofFile=${logDirectory}/${nowTimeDate}-lsof_qpid.out
    local pidList

    # pidof separates multiple PIDs with spaces, but "lsof -p" expects one
    # comma-separated set; without this a second PID would be taken as a
    # file name argument.
    pidList=${qpidPid// /,}

    {
        echo -ne "\n| START "
        echoDate
        echo -e "----------------------------------------------------------------|\n"
    } >> "${lsofFile}"

    # lsofCommand is intentionally unquoted: it holds the command together
    # with its options and must be word-split.
    if ${lsofCommand} "${pidList}" >> "${lsofFile}" 2>&1
    then
        return 0
    else
        return 1
    fi
}
function captureQpidStat() {
    # Record the qpid connection count and several qpid-stat reports in
    # today's qpid-stat output file, and raise an ITO alert through opcmsg
    # when the connection count approaches the connection limit.
    #
    # Globals read:    logDirectory, nowTimeDate, platformName
    # Globals written: numQpidConnections
    # Returns: the number of qpid-stat report invocations that failed
    local returnCode=0
    local qpidConnLimit=500
    local statFile=${logDirectory}/${nowTimeDate}-qpid-stat.out
    local opcmsgBin=/opt/OV/bin/OpC/opcmsg
    local warnLow warnHigh cmdArg

    case "${platformName}" in
        [a-z][a-z][a-z]n )
            qpidConnLimit=1000
            echo -e "\tNOTE: Setting Max qpidd connection to 1000 due to NCEP site" >> "${statFile}"
            ;;
    esac

    # Alert thresholds are derived from the limit so the messages stay
    # correct when the limit is not the default 500.
    warnLow=$(( qpidConnLimit - 50 ))
    warnHigh=$(( qpidConnLimit - 15 ))

    echo -ne "\n| START " >> "${statFile}"
    echoDate >> "${statFile}"
    echo -e "----------------------------------------------------------------|\n" >> "${statFile}"

    numQpidConnections=$( qpid-stat -c | wc -l )
    # Three lines of the "qpid-stat -c" output are not counted as connections
    # (presumably header lines -- confirm against the installed qpid-stat).
    (( numQpidConnections-=3 ))
    echo -e "Total Number of QPID Connections: ${numQpidConnections}" >> "${statFile}"

    if [[ ${numQpidConnections} -ge ${warnLow} && ${numQpidConnections} -le ${warnHigh} ]] ; then
        echo -e "\tNOTE: Sending Major ITO to NCF because number of connections is between ${warnLow} and ${warnHigh}" >> "${statFile}"
        if [[ -f ${opcmsgBin} ]] ; then
            # Absolute path: the same file that was just tested for.
            "${opcmsgBin}" application=QPIDD object=QPIDD msg_text="Number Of Connections To QPID is between ${warnLow}-${warnHigh}: Please check system health" severity=Major msg_grp=AWIPS
        else
            echo -e "\tERROR - can not find /opt/OV/bin/OpC/opcmsg on $( hostname )" >> "${statFile}"
        fi
    elif [[ ${numQpidConnections} -gt ${warnHigh} ]] ; then
        echo -e "\tNOTE: Sending CRITIAL ITO to NCF because number of connections is > ${warnHigh}" >> "${statFile}"
        if [[ -f ${opcmsgBin} ]] ; then
            "${opcmsgBin}" application=QPIDD object=QPIDD msg_text="Number Of Connections To QPID is > ${warnHigh} -- Take IMMEDIATE action to prevent system failure" severity=Critical msg_grp=AWIPS
        else
            echo -e "\tERROR - can not find /opt/OV/bin/OpC/opcmsg on $( hostname )" >> "${statFile}"
        fi
    fi
    echo >> "${statFile}"

    for cmdArg in "-b" "-c" "-s" "-e" "-q -Smsg"
    do
        # cmdArg is intentionally unquoted so "-q -Smsg" becomes two options.
        if ! qpid-stat ${cmdArg} >> "${statFile}" 2>&1
        then
            (( returnCode+=1 ))
            echoFail "\tqpid-stat ${cmdArg} returned non-zero exit code"
        fi
        echo >> "${statFile}"
    done
    return ${returnCode}
}
function captureNetstat() {
    # Append a timestamped header and every netstat line mentioning port
    # 5672 to today's netstat output file.
    #
    # Globals read: logDirectory, nowTimeDate
    # Returns: 0 when the netstat | grep pipeline succeeds, 1 otherwise
    local netstatFile=${logDirectory}/${nowTimeDate}-netstat.out

    {
        echo -ne "\n| START "
        echoDate
        echo -e "----------------------------------------------------------------|\n"
    } >> "${netstatFile}"

    netstat -tunape | grep :5672 >> "${netstatFile}" 2>&1 || return 1
    return 0
}
function captureIPVS() {
    # Append a timestamped header and three ipvsadm listings (table,
    # statistics, sorted connections) to today's ipvsadm output file.
    #
    # Globals read: logDirectory, nowTimeDate
    # Returns: the number of ipvsadm invocations that failed (0-3)
    local ipvsFile=${logDirectory}/${nowTimeDate}-ipvsadm.out
    local failures=0
    local listingArgs

    {
        echo -ne "\n| START "
        echoDate
        echo -e "----------------------------------------------------------------|\n"
    } >> "${ipvsFile}" 2>&1

    for listingArgs in "--list" "--list --stats" "--list --connection --sort"
    do
        # listingArgs is intentionally unquoted so it splits into options.
        ipvsadm ${listingArgs} >> "${ipvsFile}" 2>&1 || (( failures+=1 ))
        # Blank line after each listing.
        echo >> "${ipvsFile}"
    done
    return ${failures}
}
## main()
setupEnv

function runWithWatchdog() {
    # Run one capture function in the background and kill it if it is still
    # running after a time limit.
    #
    # Usage: runWithWatchdog FUNCTION SECONDS LABEL FAILMSG
    #
    # FUNCTION: name of the shell function to run
    # SECONDS:  whole seconds to allow before killing it
    # LABEL:    name used in the "running for more than ..." message
    # FAILMSG:  message logged when FUNCTION ends with a non-zero status
    #
    # Returns: 0 if FUNCTION succeeded, 1 if it failed or was killed
    local captureFunc=$1
    local maxSeconds=$2
    local timeoutLabel=$3
    local failMessage=$4
    local workerPid
    local elapsed=0

    "${captureFunc}" &
    workerPid=$!
    while ps -p "${workerPid}" > /dev/null
    do
        sleep 1
        elapsed=$(( elapsed + 1 ))
        if [[ ${elapsed} -ge ${maxSeconds} ]]
        then
            # workerPid is only the background subshell; the command that is
            # actually hung is its child. Freeze the subshell so it cannot
            # start anything new, kill its children, then kill it.
            kill -STOP "${workerPid}"
            pkill -9 -P "${workerPid}"
            kill -9 "${workerPid}"
            echoFail "ERROR: ${timeoutLabel} running for more than ${maxSeconds} seconds, killing"
            # Nothing left to poll for; report once and collect the status.
            break
        fi
    done
    if ! wait "${workerPid}"
    then
        echoFail "${failMessage}"
        return 1
    fi
    return 0
}

# NOTE(review): the redirection on the closing brace is opened before any
# command inside the group runs, i.e. before the NFS mount check below --
# confirm the log directory is expected to be reachable at that point.
{
    echo -ne "\n| START " && echoDate && echo -e "----------------------------------------------------------------|\n"

    if ! grep /data/fxa /proc/mounts | grep nfs > /dev/null 2>&1
    then
        # /data/fxa isn't an nfs mount
        if mount ${nasHost}:${nasVolName} /data/fxa
        then
            # NOTE(review): cleanup() tests this flag, but nothing in this
            # file calls cleanup(), so the mount is left in place on exit.
            hadToMount=true
        else
            echoFail "ERROR:\t Couldn't mount /data/fxa and that is where the log goes!"
            exit 1
        fi
    fi

    # now check write permission
    if [[ ! -d ${logDirectory} ]]
    then
        if ! mkdir -p "${logDirectory}" > /dev/null 2>&1
        then
            echoFail "ERROR:\t Couldn't create ${logDirectory}"
            exit 1
        fi
    fi
    if ! touch "${logDirectory}/testfile" > /dev/null 2>&1
    then
        echoFail "ERROR:\tNo write permissions to ${logDirectory}"
        exit 1
    else
        rm "${logDirectory}/testfile"
    fi

    # Nothing to capture unless qpidd is running on this host.
    if ! qpidPid=$( pidof qpidd )
    then
        echoFail "ERROR:\tCan't find qpidd on this host (run: pidof qpidd failed)."
        exit 1
    fi

    # A failed capture is logged by the watchdog and the run simply continues
    # with the next capture.
    runWithWatchdog runlsof 10 "lsof" "ERROR: Grabbing of lsof on qpidd failed"
    runWithWatchdog captureQpidStat 20 "qpid-stat" "ERROR: Grabbing of qpid-stat failed"
    runWithWatchdog captureNetstat 10 "netstat" "ERROR: Grabbing of netstat failed"

    # The ipvsadm capture is attempted only when a "pulse" process is found.
    if ! pidof pulse > /dev/null 2>&1
    then
        echoFail "ERROR: IPVS doesn't appear to be running on this host ($( hostname ))"
    else
        runWithWatchdog captureIPVS 20 "ipvs capture" "ERROR: Grabbing of ipvs failed"
    fi
} >> "${logDirectory}/${logName}" 2>&1
exit 0