Issue #1508: Improve capture to grab screen shots and multiple jstacks to track thread state over time
Change-Id: I2d41e348d1cd4187887ec47c18f3b6d768ced8fd Former-commit-id:af105bfe0f
[formerly 8c232b02fbdd268f19e65676d68311b47469fcf3] Former-commit-id:a1c58ab36d
This commit is contained in:
parent
9a3be4460e
commit
b83e46a345
1 changed files with 131 additions and 93 deletions
|
@ -12,6 +12,7 @@ REMOTE_SERVERS_TO_CHECK="dx1f dx3 dx4"
|
|||
|
||||
# Flags to control what data capture grabs; to enable a flag it must be YES, anything else will be considered off.
|
||||
RUN_JSTACK="Y"
|
||||
JSTACK_ITERATIONS="15"
|
||||
RUN_JMAP="Y"
|
||||
RUN_QPID_STAT="Y"
|
||||
MOVE_ALL_HS_ERR_PID="Y"
|
||||
|
@ -19,6 +20,7 @@ MOVE_ALL_HS_ERR_PID="Y"
|
|||
GRAB_REMOTE_TOP="Y"
|
||||
GRAB_REMOTE_VMSTAT="Y"
|
||||
GRAB_CAVE_AND_ALERTVIZ_LOGS="Y"
|
||||
GRAB_SCREENSHOT='Y'
|
||||
EDEX_MODE="N"
|
||||
FORCE="N"
|
||||
TGZ_OUTPUT="Y"
|
||||
|
@ -39,6 +41,9 @@ usage() {
|
|||
echo -e "-g {grep string}\tdefault [$grepString]"
|
||||
echo " The grep string used to find the processes"
|
||||
echo
|
||||
echo -e "-screen {y/n}\t\tdefault [$GRAB_SCREENSHOT]"
|
||||
echo " Screen print the current workstation (local user must be running capture)"
|
||||
echo
|
||||
echo -e "-s {y/n}\t\tdefault [$RUN_JSTACK]"
|
||||
echo " Run jstack to grab the thread stack information"
|
||||
echo
|
||||
|
@ -115,6 +120,18 @@ checkYes() {
|
|||
fi
|
||||
}
|
||||
|
||||
# Runs import (ImageMagick) to grab a screen shot of the user's desktop.
# Globals:   GRAB_SCREENSHOT (read) - nothing is done unless this is "y"
#            dataPath (read)        - directory that receives the output files
#            processFile (read)     - capture log, gets one timestamped entry
# Outputs:   ${dataPath}/screenShot_<n>.png plus screenShot_<n>.log (stdout
#            and stderr of import) for screens 0, 1 and 2 of display :0
# Each import is started in the background, so this function returns without
# waiting for the screen shots to finish.
grabScreenShot() {
    if [ "$GRAB_SCREENSHOT" != "y" ]; then
        return 0
    fi

    local t1 screen
    echo "Capturing screen shot of desktop"
    t1=$(date "+%Y%m%d %H:%M:%S")
    echo "${t1}: Capturing screen shot of desktop" >> "$processFile"

    # one import per X screen; paths are quoted so a dataPath containing
    # whitespace does not split into several arguments
    for screen in 0 1 2; do
        import -window root -display ":0.${screen}" \
            "${dataPath}/screenShot_${screen}.png" \
            > "${dataPath}/screenShot_${screen}.log" 2>&1 &
    done
}
|
||||
|
||||
# runs ssh command to grab top on a remote server, requires auto login to be setup
|
||||
grabRemoteTop() {
|
||||
if [ "$GRAB_REMOTE_TOP" == "y" ]; then
|
||||
|
@ -143,7 +160,6 @@ grabRemoteVmstat() {
|
|||
fi
|
||||
}
|
||||
|
||||
|
||||
checkForProcsAsOtherUsers() {
|
||||
if [ ! -z "$procs" ]; then
|
||||
numMyProcs=`echo "$myProcs" | wc -l`
|
||||
|
@ -213,39 +229,104 @@ reasonForCapture() {
|
|||
echo $reason >> ${dataPath}/capture_reason.log
|
||||
}
|
||||
|
||||
# For a specified pid, runs jstack a specified number of times in a row so the
# thread state can be tracked over time.
# Arguments: $1 - pid of the process to inspect
#            $2 - number of consecutive jstack runs (defaults to 1 if omitted)
# Globals:   FORCE (read)       - "y" adds -F to every run
#            dataPath (read)    - directory receiving pid_<pid>_jstack_<n>.log
#            processFile (read) - capture log that records each command
#            JSTACK_BIN (read)  - optional override of the jstack executable,
#                                 defaults to /awips2/java/bin/jstack
# Every run is executed in the foreground so its exit status can be checked;
# a failed run is repeated once with -F unless FORCE was already "y".
# Callers that want this to happen concurrently background the whole function.
runJstack() {
    local pid="$1"
    local numIterations="${2:-1}"
    local cmd="${JSTACK_BIN:-/awips2/java/bin/jstack}"
    local prePath="${dataPath}/pid_${pid}_"
    local -a options=(-l)
    local count=1
    local log=""
    local t1 rc

    if [ "$FORCE" == "y" ]; then
        options+=(-F)
    fi

    while [ "$count" -le "$numIterations" ]; do
        t1=$(date "+%Y%m%d %H:%M:%S")
        log="${prePath}jstack_${count}.log"

        echo "${t1}: Running command: ${cmd} ${options[*]} ${pid} >> ${log} 2>&1" >> "$processFile"
        echo "Running for $t1" >> "$log"
        "$cmd" "${options[@]}" "$pid" >> "$log" 2>&1
        rc=$?

        if [[ "$rc" != "0" && "$FORCE" != "y" ]]; then
            t1=$(date "+%Y%m%d %H:%M:%S")
            echo "${t1}: jstack for $pid failed to connect, rerunning with -F" >> "$processFile"
            "$cmd" "${options[@]}" -F "$pid" >> "$log" 2>&1
        fi
        count=$((count + 1))
    done
}
|
||||
|
||||
# Launches a background runJstack for each captured pid.
# Globals:   RUN_JSTACK (read)        - nothing is done unless this is "y"
#            cavePid (read)           - only selects which message is printed
#            pids, numProcs (read)    - pid array and number of entries to use
#            JSTACK_ITERATIONS (read) - passed to runJstack as its run count
# Outputs:   one status line on stdout
# Does not wait for the background jobs it starts.
launchJstacks() {
    if [ "${RUN_JSTACK}" != "y" ]; then
        return 0
    fi

    if [ -n "${cavePid}" ]; then
        echo "Capturing thread stack for pid $cavePid"
    else
        echo "Capturing all process thread stacks"
    fi

    # an unset numProcs is treated as zero instead of tripping an
    # "integer expression expected" error in the loop test
    local count=0
    while [ "$count" -lt "${numProcs:-0}" ]; do
        runJstack "${pids[$count]}" "${JSTACK_ITERATIONS}" &
        count=$((count + 1))
    done
}
|
||||
|
||||
# Runs jmap to write a heap dump for one pid; if it fails, runs it again
# with -F.
# Arguments: $1 - pid of the process to dump
# Globals:   FORCE (read)       - "y" adds -F to the first run (no retry then)
#            ACCUM (read)       - "y" puts a date/time stamp in the hprof name
#            dataPath (read)    - directory receiving pid_<pid>_dump*.hprof and
#                                 pid_<pid>_dump.log
#            processFile (read) - capture log that records each command
#            JMAP_BIN (read)    - optional override of the jmap executable,
#                                 defaults to /awips2/java/bin/jmap
# jmap is run in the foreground here: the exit status of a command started
# with "&" is always 0, so testing $? after a backgrounded jmap could never
# trigger the -F retry. launchJmaps already backgrounds this whole function,
# so captures for different pids still proceed in parallel.
runJmap() {
    local pid="$1"
    local cmd="${JMAP_BIN:-/awips2/java/bin/jmap}"
    local prePath="${dataPath}/pid_${pid}_"
    local log="${prePath}dump.log"
    local dumpPath="${prePath}dump"
    local -a options=()
    local t1 t2 rc dumpOpt

    if [ "$FORCE" == "y" ]; then
        options+=(-F)
    fi

    t1=$(date "+%Y%m%d %H:%M:%S")

    if [ "$ACCUM" = "y" ]; then
        # accum needs to change hprof by date
        t2=$(date "+%Y%m%d_%H%M%S")
        dumpPath="${dumpPath}_${t2}.hprof"
    else
        dumpPath="${dumpPath}.hprof"
    fi

    dumpOpt="-dump:format=b,file=${dumpPath}"
    echo "${t1}: Running command: $cmd $dumpOpt ${options[*]} $pid >> $log 2>&1" >> "$processFile"
    "$cmd" "$dumpOpt" "${options[@]}" "$pid" >> "$log" 2>&1
    rc=$?

    if [[ "$rc" != "0" && "$FORCE" != "y" ]]; then
        t1=$(date "+%Y%m%d %H:%M:%S")
        echo "${t1}: jmap for $pid failed to connect, rerunning with -F" >> "$processFile"
        "$cmd" "$dumpOpt" "${options[@]}" -F "$pid" >> "$log" 2>&1
    fi
}
|
||||
|
||||
# Launches a background runJmap for each captured pid.
# Globals:   RUN_JMAP (read)       - nothing is done unless this is "y"
#            cavePid (read)        - only selects which message is printed
#            pids, numProcs (read) - pid array and number of entries to use
# Outputs:   one status line on stdout
# Does not wait for the background jobs it starts.
launchJmaps() {
    if [ "$RUN_JMAP" != "y" ]; then
        return 0
    fi

    if [ -n "${cavePid}" ]; then
        echo "Capturing process heap dump for pid $cavePid"
    else
        echo "Capturing all Heap Dumps"
    fi

    # an unset numProcs is treated as zero instead of tripping an
    # "integer expression expected" error in the loop test
    local count=0
    while [ "$count" -lt "${numProcs:-0}" ]; do
        runJmap "${pids[$count]}" &
        count=$((count + 1))
    done
}
|
||||
|
||||
# runs qpid-stat
|
||||
|
@ -253,14 +334,25 @@ runQpidStat() {
|
|||
local qpidHost=cp1f
|
||||
local prePath="${dataPath}/"
|
||||
local t1=`date "+%Y%m%d %H:%M:%S"`
|
||||
local cmd="/awips2/python/bin/qpid-stat -q -Smsg -L100 ${qpidHost}"
|
||||
echo "${t1}: Running command: $cmd >> ${prepath}qpid-stat.log 2>&1 &" >> $processFile
|
||||
local cmd="/awips2/python/bin/qpid-stat -q -Smsg -L500 ${qpidHost}"
|
||||
# prePath (capital P) is the local set from dataPath above; the lower-case
# spelling is never assigned, which left the log in the current directory
local log="${prePath}qpid-stat-queues.log"
|
||||
echo "${t1}: Running command: $cmd >> $log 2>&1 &" >> $processFile
|
||||
if [ "$ACCUM" = "y" ]; then
|
||||
echo >> ${prePath}qpid-stat.log
|
||||
echo >> ${prePath}qpid-stat.log
|
||||
echo "Running for $t1" >> ${prePath}qpid-stat.log
|
||||
echo >> $log
|
||||
echo >> $log
|
||||
echo "Running for $t1" >> $log
|
||||
fi
|
||||
$cmd >> ${prePath}qpid-stat.log 2>&1 &
|
||||
$cmd >> $log 2>&1 &
|
||||
|
||||
# prePath (capital P) is the local set from dataPath above; the lower-case
# spelling is never assigned, which left the log in the current directory
log="${prePath}qpid-stat-sessions.log"
|
||||
cmd="/awips2/python/bin/qpid-stat -s -Smsg -L500 ${qpidHost}"
|
||||
echo "${t1}: Running command: $cmd >> $log 2>&1 &" >> $processFile
|
||||
if [ "$ACCUM" = "y" ]; then
|
||||
echo >> $log
|
||||
echo >> $log
|
||||
echo "Running for $t1" >> $log
|
||||
fi
|
||||
$cmd >> $log 2>&1 &
|
||||
}
|
||||
|
||||
# runs versions.sh to grab version info
|
||||
|
@ -291,6 +383,7 @@ while [ ! -z "$1" ]; do
|
|||
-e) EDEX_MODE="Y"; edexProcs[$edexProcCount]="$1"; shift 1; let "edexProcCount+=1";;
|
||||
-a) ACCUM="$1"; shift 1;;
|
||||
-v) GRAB_REMOTE_VMSTAT="$1"; shift 1;;
|
||||
-screen) GRAB_SCREENSHOT="$1"; shift 1;;
|
||||
-h|*) usage;;
|
||||
esac
|
||||
done
|
||||
|
@ -308,6 +401,7 @@ checkYes EDEX_MODE $EDEX_MODE
|
|||
checkYes TGZ_OUTPUT $TGZ_OUTPUT
|
||||
# ACCUM is the variable set by the -a option and tested by the capture
# functions; the former triple-C spelling named a variable nothing else uses
checkYes ACCUM $ACCUM
|
||||
checkYes RUN_VERSIONS $RUN_VERSIONS
|
||||
checkYes GRAB_SCREENSHOT $GRAB_SCREENSHOT
|
||||
|
||||
# if PID mode don't grab other hs_err_pids
|
||||
if [ ! -z $cavePid ]; then
|
||||
|
@ -383,7 +477,7 @@ if [ "$EDEX_MODE" == "y" ]; then
|
|||
grepString="${grepString}) "
|
||||
fi
|
||||
|
||||
procs=`ps -ef | grep -E "$grepString" | grep -v "grep"`
|
||||
procs=`ps -ef | grep -E "$grepString" | grep -v "grep" | grep -v "cave.sh"`
|
||||
|
||||
if [ ! -z "$cavePid" ]; then
|
||||
# limit cave procs to the requested PID
|
||||
|
@ -399,11 +493,6 @@ echo "" >> $processFile
|
|||
|
||||
checkForProcsAsOtherUsers
|
||||
|
||||
# get reason for running capture
|
||||
if [ "$reason" != "n" ]; then
|
||||
reasonForCapture &
|
||||
fi
|
||||
|
||||
if [ ! -z "${myProcs}" ]; then
|
||||
t1=`date "+%Y%m%d %H:%M:%S"`
|
||||
echo "Processes found for user $user, capturing data to $dataPath"
|
||||
|
@ -424,38 +513,9 @@ if [ ! -z "${myProcs}" ]; then
|
|||
done
|
||||
IFS=$PREV_IFS
|
||||
|
||||
# doing each item in its own loop so we can grab all data for a given type at once
|
||||
launchJstacks
|
||||
|
||||
# grab all jstacks
|
||||
if [ "${RUN_JSTACK}" == "y" ]; then
|
||||
if [ ! -z ${cavePid} ]; then
|
||||
echo "Capturing thread stack for pid $cavePid"
|
||||
else
|
||||
echo "Capturing all process thread stacks"
|
||||
fi
|
||||
|
||||
count=0
|
||||
while [ "$count" -lt "$numProcs" ]; do
|
||||
if [ "$FORCE" == "y" ]; then
|
||||
runJstack ${pids[$count]} -l -F
|
||||
else
|
||||
runJstack ${pids[$count]} -l
|
||||
fi
|
||||
bPids[$count]=$!
|
||||
let "count+=1"
|
||||
done
|
||||
|
||||
count=0
|
||||
while [ "$count" -lt "$numProcs" ]; do
|
||||
wait ${bPids[$count]}
|
||||
if [ "$?" != "0" ]; then
|
||||
t1=`date "+%Y%m%d %H:%M:%S"`
|
||||
echo "${t1}: jstack for ${pids[$count]} failed to connect, rerunning with -F" >> $processFile
|
||||
runJstack ${pids[$count]} -l -F
|
||||
fi
|
||||
let "count+=1"
|
||||
done
|
||||
fi
|
||||
launchJmaps
|
||||
|
||||
runQpidStat
|
||||
|
||||
|
@ -463,36 +523,6 @@ if [ ! -z "${myProcs}" ]; then
|
|||
|
||||
grabRemoteVmstat
|
||||
|
||||
# grab all jmaps
|
||||
if [ "$RUN_JMAP" == "y" ]; then
|
||||
if [ ! -z ${cavePid} ]; then
|
||||
echo "Capturing process heap dump for pid $cavePid"
|
||||
else
|
||||
echo "Capturing all Heap Dumps"
|
||||
fi
|
||||
|
||||
count=0
|
||||
while [ "$count" -lt "$numProcs" ]; do
|
||||
if [ "$FORCE" == "y" ]; then
|
||||
runJmap ${pids[$count]} -F
|
||||
else
|
||||
runJmap ${pids[$count]}
|
||||
fi
|
||||
bPids[$count]=$!
|
||||
let "count+=1"
|
||||
done
|
||||
|
||||
count=0
|
||||
while [ "$count" -lt "$numProcs" ]; do
|
||||
wait ${bPids[$count]}
|
||||
if [ "$?" != "0" ]; then
|
||||
t1=`date "+%Y%m%d %H:%M:%S"`
|
||||
echo "${t1}: jmap for ${pids[$count]} failed to connect, rerunning with -F" >> $processFile
|
||||
runJmap ${pids[$count]} -F
|
||||
fi
|
||||
let "count+=1"
|
||||
done
|
||||
fi
|
||||
else
|
||||
t1=`date "+%Y%m%d %H:%M:%S"`
|
||||
echo "*** NO processes found for user $user, capturing limited data to $dataPath"
|
||||
|
@ -504,6 +534,14 @@ else
|
|||
grabRemoteVmstat
|
||||
fi
|
||||
|
||||
# grab screen shot, spawns background process for each screen
|
||||
grabScreenShot
|
||||
|
||||
# get reason for running capture
|
||||
if [ "$reason" != "n" ]; then
|
||||
reasonForCapture &
|
||||
fi
|
||||
|
||||
# move all hs_err_pid from user's home directory to capture directory
|
||||
if [ "${MOVE_ALL_HS_ERR_PID}" == "y" ]; then
|
||||
numErrFiles=`ls ${HOME}/hs_err_pid* 2> /dev/null | wc -l`
|
||||
|
|
Loading…
Add table
Reference in a new issue