awips2/pythonPackages/qpid/bin/monitor_qpid_host.sh
Sean.Webb a3403f9dc9 ASM #550 - Remove ITO call from monitor_qpid_hosts.sh
Change-Id: I685b7ea1f2ed6d1eeb91e933b76eee051a3332f8

Former-commit-id: fe087a93cb [formerly 3fff8c6164] [formerly fe087a93cb [formerly 3fff8c6164] [formerly 42b78d7ec0 [formerly 43c50503e9aadc3dc66cdbf6b50f18a642547f35]]]
Former-commit-id: 42b78d7ec0
Former-commit-id: 5a7c0a1805 [formerly 542f05aa9a]
Former-commit-id: 84bfcb364c
2014-05-30 14:09:16 -04:00

355 lines
8.8 KiB
Bash

#!/bin/bash
# script to gather information on a qpid host
# 20 July 2012 - Initial script (kpj)
#######################################
# Initialise the settings shared by the rest of the script and, when today's
# log is older than yesterday's, reset today's output files and compress
# yesterday's.
#
# Output files are named after the weekday (date +%A), so a name left over
# from last week is detected by being older than yesterday's log and is
# overwritten before new data is appended to it.
#
# Globals written:
#   runTimeDate, nowTimeDate, logDirectory, logName, nasHost, nasVolName,
#   lsofCommand, platformName
# Arguments: none
#######################################
function setupEnv() {
  local scriptName
  local yesterdayName
  local staleFile

  runTimeDate=$( date +"%Y%m%d %H:%M:%S" )
  nowTimeDate=$( date +%A )
  logDirectory=/data/fxa/qpid
  # "--" and the quotes keep a script path with spaces or a leading dash intact.
  scriptName=$( basename -- "$0" .sh )
  logName=${scriptName}.${nowTimeDate}.log
  nasHost=nas1
  nasVolName=dataFXA # This is so we can change it for new nas!
  # Kept as one string; callers expand it unquoted so it splits into words.
  lsofCommand="lsof -Pns -p"
  # Second dash-separated field of the host name.
  platformName=$( hostname | cut -f2 -d'-' )

  # Resolve yesterday's weekday once so the age test and the gzip glob below
  # always refer to the same day.
  yesterdayName=$( date --date='1 day ago' +%A )

  if [[ "${logDirectory}/${logName}" -ot "${logDirectory}/${scriptName}.${yesterdayName}.log" ]]; then
    for staleFile in \
      "${logName}" \
      "${nowTimeDate}-lsof_qpid.out" \
      "${nowTimeDate}-qpid-stat.out" \
      "${nowTimeDate}-netstat.out" \
      "${nowTimeDate}-ipvsadm.out" \
      "${nowTimeDate}-captureQpidHeapInfo.out"
    do
      # Overwrites the file with a single empty line.
      echo > "${logDirectory}/${staleFile}"
    done
    gzip -f "${logDirectory}"/*"${yesterdayName}"*
  fi
}
#######################################
# Echo a string to stdout wrapped in an ANSI colour escape sequence.
#
# Usage: color_echo $COLOR $BOLD $STRING
#
# Arguments:
#   $1 COLOR  - one of: black, green, blue, red, yellow, white
#               (any other value produces an empty colour code)
#   $2 BOLD   - 1 = true, 0 = false
#   $3 STRING - text to print; backslash escapes are interpreted (echo -e)
# Outputs: the coloured string, or a usage error line when an argument is
#          missing or BOLD is neither 0 nor 1
# Returns: always 0
#######################################
function color_echo() {
  local sColor=$1
  local sBold=$2
  local sString=$3
  local colorInt=""

  if [[ -z "${sColor}" || -z "${sBold}" || -z "${sString}" ]] || [[ ${sBold} -ne 1 && ${sBold} -ne 0 ]]
  then
    # The placeholders are escaped so the usage text shows them literally
    # instead of expanding (unset) shell variables to nothing.
    echo -e "ERROR IN ${FUNCNAME[0]}:\t Usage:\t${FUNCNAME[0]} \$COLOR \$BOLD \$STRING"
  else
    case "${sColor}" in
      "green" )  colorInt=32 ;;
      "blue" )   colorInt=34 ;;
      "red" )    colorInt=31 ;;
      "yellow" ) colorInt=33 ;;
      "white" )  colorInt=37 ;;
      "black" )  colorInt=30 ;;
      * )        colorInt="" ;;
    esac
    # Non-bold output leaves the attribute field of the escape sequence empty.
    if [[ ${sBold} -eq 0 ]]; then sBold="" ; fi
    echo -e "\033[${sBold};${colorInt}m${sString}\033[0m"
  fi
  return 0
}
# Print the log-line prefix "|-- YYYYMMDD HH:MM:SS" for the current time.
# No trailing newline is written, so the caller's text continues on the same line.
function echoDate() {
  local stamp
  stamp=$( date +"%Y%m%d %H:%M:%S" )
  printf '%s' "|-- ${stamp}"
}
# Print a failure line: the echoDate prefix followed by the message ($1),
# tab-indented and rendered in bold red by color_echo.
# The message is only printed when echoDate succeeded; otherwise echoDate's
# status is returned.
function echoFail() {
  local failText="\t$1"
  echoDate || return
  color_echo red 1 "${failText}"
}
# Unmount /data/fxa, but only when the hadToMount flag is non-empty.
# Returns 0 when nothing had to be done, otherwise umount's exit status.
function cleanup() {
  [[ -n "${hadToMount}" ]] || return 0
  umount /data/fxa
}
# Append a timestamped header and the lsof listing for the qpidd process
# (${qpidPid}) to the day's lsof output file.
# Globals read: logDirectory, nowTimeDate, lsofCommand, qpidPid
# Returns: 0 when the lsof command succeeded, 1 otherwise.
function runlsof() {
  local lsofFile="${logDirectory}/${nowTimeDate}-lsof_qpid.out"

  {
    echo -ne "\n| START "
    echoDate
    echo -e "----------------------------------------------------------------|\n"
  } >> "${lsofFile}"

  # lsofCommand holds the command plus its options and must split into words.
  ${lsofCommand} ${qpidPid} >> "${lsofFile}" 2>&1 && return 0
  return 1
}
#######################################
# Append a timestamped snapshot of qpid-stat output to the day's
# qpid-stat output file: the total connection count followed by the
# -b, -c, -s, -e and "-q -Smsg" reports.
#
# Globals read: logDirectory, nowTimeDate, platformName
# Outputs: a failure line on stdout (via echoFail) for every qpid-stat
#          report that exits non-zero
# Returns: the number of qpid-stat reports that failed (0 = all succeeded)
#######################################
function captureQpidStat() {
  local returnCode=0
  local numQpidConnections=0
  local cmdArg
  local statFile="${logDirectory}/${nowTimeDate}-qpid-stat.out"

  case "${platformName}" in
    [a-z][a-z][a-z]n )
      # Nothing in this function consumes a connection limit any more; the
      # note is still written so the log content stays the same.
      echo -e "\tNOTE: Setting Max qpidd connection to 1000 due to NCEP site" >> "${statFile}"
      ;;
  esac

  {
    echo -ne "\n| START "
    echoDate
    echo -e "----------------------------------------------------------------|\n"
  } >> "${statFile}"

  # Three lines of the "qpid-stat -c" listing are not connections (presumably
  # header lines - confirm against the installed qpid-stat). Clamp at zero so
  # a failed or empty listing is not logged as a negative count.
  numQpidConnections=$( qpid-stat -c | wc -l )
  numQpidConnections=$(( numQpidConnections - 3 ))
  if (( numQpidConnections < 0 )); then
    numQpidConnections=0
  fi

  {
    echo -e "Total Number of QPID Connections: ${numQpidConnections}"
    echo
  } >> "${statFile}"

  for cmdArg in "-b" "-c" "-s" "-e" "-q -Smsg"
  do
    # cmdArg is expanded unquoted on purpose: "-q -Smsg" must become two options.
    if ! qpid-stat ${cmdArg} >> "${statFile}" 2>&1
    then
      (( returnCode+=1 ))
      echoFail "\tqpid-stat ${cmdArg} returned non-zero exit code"
    fi
    echo >> "${statFile}"
  done
  return ${returnCode}
}
# Append a timestamped header and every "netstat -tunape" line containing
# ":5672" to the day's netstat output file.
# Globals read: logDirectory, nowTimeDate
# Returns: 0 when at least one matching line was found (grep succeeded),
#          1 otherwise.
function captureNetstat() {
  local netstatFile="${logDirectory}/${nowTimeDate}-netstat.out"

  {
    echo -ne "\n| START "
    echoDate
    echo -e "----------------------------------------------------------------|\n"
  } >> "${netstatFile}"

  netstat -tunape | grep :5672 >> "${netstatFile}" 2>&1 || return 1
  return 0
}
# Append a timestamped header and three ipvsadm listings (plain, --stats,
# and --connection --sort) to the day's ipvsadm output file, each followed
# by a blank line.
# Globals read: logDirectory, nowTimeDate
# Returns: the number of ipvsadm invocations that exited non-zero.
function captureIPVS() {
  local failures=0
  local ipvsArgs
  local ipvsFile="${logDirectory}/${nowTimeDate}-ipvsadm.out"

  {
    echo -ne "\n| START "
    echoDate
    echo -e "----------------------------------------------------------------|\n"
  } >> "${ipvsFile}" 2>&1

  for ipvsArgs in "--list" "--list --stats" "--list --connection --sort"
  do
    # ipvsArgs is expanded unquoted so each option becomes its own word.
    ipvsadm ${ipvsArgs} >> "${ipvsFile}" 2>&1 || (( failures+=1 ))
    echo >> "${ipvsFile}"
  done
  return ${failures}
}
# Append a timestamped header, the "jmap -heap" report and 30 one-second
# "jstat -gcutil" samples for the qpidd process to the day's
# captureQpidHeapInfo output file.
# Globals read: logDirectory, nowTimeDate, qpidPid (set by the caller)
# Returns: 1 when no process with PID ${qpidPid} exists, 0 otherwise
#          (the jmap/jstat exit codes are not checked).
function captureQpidHeapInfo() {
  # The file name embeds this function's own name.
  local heapFile="${logDirectory}/${nowTimeDate}-${FUNCNAME[0]}.out"
  local reportLine

  {
    echo -ne "\n| START "
    echoDate
    echo -e "----------------------------------------------------------------|\n"
  } >> "${heapFile}" 2>&1

  if ! ps -p ${qpidPid} > /dev/null
  then
    # Include what jps currently reports so the mismatch can be diagnosed.
    echoFail "\tCan not find returned qpidd pid (${qpidPid}): $( jps -v | grep QPBRKR | awk '{print $1}' )"
    return 1
  fi

  {
    echo -e "\tFound qpidd on PID ${qpidPid}"
    echo -e "\tGetting HEAP usage...........................\n\n"
  } >> "${heapFile}"

  # Each report line is re-indented with two tabs; plain "read" drops the
  # tool's own leading whitespace first.
  jmap -heap ${qpidPid} | while read reportLine
  do
    echo -e "\t\t${reportLine}"
  done >> "${heapFile}"

  echo -e "\n\n\tGetting Garbage Collection Information ..................\n\n" >> "${heapFile}"

  jstat -gcutil ${qpidPid} 1000 30 | while read reportLine
  do
    echo -e "\t\t${reportLine}"
  done >> "${heapFile}"

  return 0
}
## main()

#######################################
# Run one capture function in the background and kill it when it overruns.
# Arguments:
#   $1 - name of the capture function to run
#   $2 - time limit in whole seconds
#   $3 - label used in the "running for more than N seconds" message
# Outputs: a failure line (via echoFail) when the time limit is reached
# Returns: the exit status reported by wait for the background job, which is
#          non-zero when the job failed or was killed
#######################################
function runWithTimeout() {
  local captureFunction=$1
  local timeoutSeconds=$2
  local label=$3
  local functionPID
  local elapsed=0

  "${captureFunction}" &
  functionPID=$!
  while ps -p "${functionPID}" > /dev/null
  do
    sleep 1
    elapsed=$(( elapsed + 1 ))
    if (( elapsed >= timeoutSeconds ))
    then
      kill -9 "${functionPID}"
      echoFail "ERROR: ${label} running for more than ${timeoutSeconds} seconds, killing"
      # Stop polling once the kill has been sent; wait below reaps the job.
      break
    fi
  done
  wait "${functionPID}"
}

setupEnv

# Release the NAS mount on every exit path when this run had to create it
# (cleanup does nothing unless hadToMount is set).
trap cleanup EXIT

# The checks below run BEFORE the log redirection is opened: the shell opens
# the log file before it executes anything inside the redirected group, so a
# missing mount or an unwritable directory could never be handled from inside
# it. Their failures are reported on stderr because the log is not usable yet.
# NOTE(review): setupEnv has already touched ${logDirectory} (log rotation)
# by the time this mount check runs.
if ! grep /data/fxa /proc/mounts | grep -q nfs
then
  # /data/fxa isn't an nfs mount
  if mount "${nasHost}:${nasVolName}" /data/fxa
  then
    hadToMount=true
  else
    echoFail "ERROR:\t Couldn't mount /data/fxa and that is where the log goes!" >&2
    exit 1
  fi
fi

# now check write permission
if [[ ! -d "${logDirectory}" ]]
then
  if ! mkdir -p "${logDirectory}" > /dev/null 2>&1
  then
    echoFail "ERROR:\t Couldn't create ${logDirectory}" >&2
    exit 1
  fi
fi
if ! touch "${logDirectory}/testfile" > /dev/null 2>&1
then
  echoFail "ERROR:\tNo write permissions to ${logDirectory}" >&2
  exit 1
else
  rm -f -- "${logDirectory}/testfile"
fi

# Exit status is collected here and used after the group closes, so the log
# file is no longer held open when the EXIT trap runs.
exitStatus=0
{
  echo -ne "\n| START " && echoDate && echo -e "----------------------------------------------------------------|\n"

  # The pipeline's status is awk's and is 0 even when nothing matched, so the
  # result is tested for emptiness instead of relying on the exit code.
  qpidPid=$( jps -v | grep QPBRKR | awk '{print $1}' )
  if [[ -z "${qpidPid}" ]]
  then
    echoFail "ERROR:\tCan't find qpidd on this host (run: jps -v | grep QPBRKR failed)."
    exitStatus=1
  else
    if ! runWithTimeout runlsof 10 "lsof"
    then
      echoFail "ERROR: Grabbing of lsof on qpidd failed"
    fi

    if ! runWithTimeout captureQpidStat 30 "qpid-stat"
    then
      echoFail "ERROR: Grabbing of qpid-stat failed"
    fi

    if ! runWithTimeout captureQpidHeapInfo 60 "Getting heap infomation"
    then
      echoFail "ERROR: Grabbing of heap utilization failed"
    fi

    if ! runWithTimeout captureNetstat 10 "netstat"
    then
      echoFail "ERROR: Grabbing of netstat failed"
    fi

    # IPVS data is only collected when a "pulse" process is running.
    if ! pidof pulse > /dev/null 2>&1
    then
      echoFail "ERROR: IPVS doesn't appear to be running on this host ($( hostname ))"
    elif ! runWithTimeout captureIPVS 20 "ipvs capture"
    then
      echoFail "ERROR: Grabbing of ipvs failed"
    fi
  fi
} >> "${logDirectory}/${logName}" 2>&1
exit ${exitStatus}