#!/bin/ksh
#
# License: Please refer to the license file (license.txt) for license and support information
#
# Purpose: Dump process information from both kernel (kdb) and user space (proctools/dbx).
# Usage:   See print_usage().
# Last update: See PDUMPVERSION
# Contact: Tao Chen (chenttao@cn.ibm.com)
#
# NOTE: this is a ksh script. It relies on the ksh 'print' builtin and on ksh
#       running the last stage of a pipeline in the current shell; do not run
#       it with sh or bash.

export PDUMPVERSION="Version Jan.18.2015"
export LASTPDUMPVERSION="Version Jan.08.2013"
export LANG=C
export LC_ALL=C
export PATH=/sbin:/usr/sbin:/usr/bin

#
# print_usage: write the command synopsis to stdout.
#   Globals: none (reads $0 for the script name)
#
print_usage()
{
    # ${0##*/} strips the directory for absolute AND relative invocations.
    print "\nUsage: ${0##*/} [ -k ] [ -p ] [ -d ] [ -l ] [ -h ] <PID>"
    print "       -k: skip kdb"
    print "       -p: call proctools"
    print "       -d: call dbx (implies -p)"
    print "       -l: long mode with more output"
    print "       -h: print this help\n"
}

#
# clean_up: signal handler (installed with 'trap' for TERM and INT).
#   Globals: OFILE (read) - name of the partially written output file
#   Removes the scratch files used by the kdb section, tells the user where
#   the partial output is, and exits with a non-zero status so that callers
#   can tell an interrupted run from a complete one.
#
clean_up()
{
    rm -f pdump.tslot pdump.lock 2>/dev/null
    print -u2 "\nScript is stopped. Partial output is saved in $OFILE.\n"
    exit 1
}

#################################################################################################### general
#
# This section should only use commands that are least likely to hang.
#
# get_general_config: append general environment data to the output file.
#   Globals: PID (read), OFILE (read, appended to), PDUMPVERSION (read),
#            BIT (written, only when running as root: kernel bitness)
#   Outputs: progress message on stdout; all collected data goes to $OFILE
#
get_general_config()
{
    typeset cur_user

    print "\nGetting general environment data ..."

    cur_user=$(whoami)

    {
        print "Date: $(date +%d%h%Y-%H.%M.%S)"
        print "Machine: $(hostname) - $(uname -M)"
        print "pdump.sh: $PDUMPVERSION"
        print "User: $cur_user"

        ## if not root, BIT is not a must, since we won't run kdb
        if [[ $cur_user = "root" ]]; then
            BIT=$((`bootinfo -K`))
            print "Kernel: $BIT-bit"
        fi
        print ""

        ## level and name of the filesets most relevant to process debugging
        lslpp -Lc \
            bos.up \
            bos.mp \
            bos.mp64 \
            bos.rte.libc \
            bos.rte.libpthreads \
            bos.rte.filesystem \
            bos.rte.lvm \
            bos.adt.debug \
            bos.sysmgt.serv_aid 2>&1 |
        awk '
        /^bos/ {
            gsub(/:/, " ")
            printf("%-8s - %-s\n", $3, $2)
        }'

        print "\nemgr -l:"
        emgr -l 2>&1 | grep -p ID

        ## 'C' and 'TIME' give hint on CPU usage (looping or hanging)
        print "\n# ps -fp $PID"
        ps -fp "$PID"

        if [[ $cur_user = "root" ]]; then
            print "\n# svmon -P $PID"
            svmon -P "$PID" 2>/dev/null | egrep -v "\-\-"

            print "\n# svmon -G"
            svmon -G 2>/dev/null | egrep -v "\-\-"
        fi

        if [[ -f /usr/bin/proctree ]]; then
            print "\n# proctree $PID"
            proctree "$PID" 2>&1
        fi
    } >> "$OFILE"
}
## END of get_general_config()
#################################################################################################### kdb
#
# run_kdb: dump kernel-side data about the target process through kdb.
#   Globals: PSLT (read)     - process slot number of the target process
#            OFILE (read)    - output file, appended to
#            LONGMODE (read) - when set, add library and IPC dumps
#            BK, COMMS, FIRSTT, TSLT, filecnt (written) - work variables
#   Files:   pdump.tslot, pdump.lock - scratch files in the current directory
#
# The function first asks kdb for the thread list of the process, then builds
# ONE long command list (COMMS) and feeds it to a single kdb session.
#
# NOTE: several 'cmd | while read ...' loops below append to COMMS. This works
#       because ksh runs the last stage of a pipeline in the current shell.
#
run_kdb()
{
    print "Dumping process information from kdb ...\n"
    print "\n# kdb \n" >> "$OFILE"

    print "\tdumping process slot $PSLT ..."

    ## separator placed between kdb subcommands (a run of empty input lines)
    BK="\n\n\n\n\n"

    ## 08/08/2006: tpid reports wrong list of threads, use th -p pslot instead
    printf " th -p %d $BK proc %d $BK lle -v -p %d\n" $PSLT $PSLT $PSLT \
        | kdb | sed -n -e '/^(.*)>/,$p' >> "$OFILE"

    ## collect the slot number of every non-zombie thread seen in the output
    awk '
    /^pvthread|^thread/ {
        sub(/!/, " ");
        sub(/>/, " ");
        if ($4 != "ZOMB")
            print $2
    }' "$OFILE" > pdump.tslot

    if [[ ! -s pdump.tslot ]]; then
        print -u2 "Error getting thread list. Skip other kdb commands."
        rm -f pdump.tslot
        return
    fi

    # USER is a proc structure shared by all threads,
    # in kdb though, 'user' command actually prints u-block of each threads.
    # ( USER = u-block - uthread )
    # so only the first thread's full user output is saved here.

    # build a command list
    print "\tbuilding kdb commands ..."
    COMMS=""
    FIRSTT="true"
    for TSLT in $(< pdump.tslot)
    do
        print "\tthread slot $TSLT ..."
        if [[ $FIRSTT = "true" ]]; then
            COMMS=$COMMS"$BK th $TSLT $BK user $TSLT $BK f $TSLT $BK f -v $TSLT $BK sw $TSLT $BK mst $BK dr iar $BK sr64 $BK segst64 $BK u -ad"

            ########## 2008/08/21: add file output ##########
            ## one 'file' command per distinct file pointer, capped at 20
            filecnt=0
            print "u -f $TSLT" | kdb |
                awk '/fd .* fp/ {gsub(/fp\.\./, "", $3); print $3}' |
                sort -u |
            while read fp
            do
                filecnt=$((filecnt + 1))
                if [[ $filecnt -gt 20 ]]; then
                    print "\n... warning: open file count greater than 20, skip ...\n" >> "$OFILE"
                    break
                fi
                COMMS=$COMMS"$BK file $fp"
            done

            FIRSTT="false"
        else
            COMMS=$COMMS"$BK th $TSLT $BK user -ut $TSLT $BK f $TSLT $BK f -v $TSLT $BK sw $TSLT $BK mst"
        fi
    done
    rm -f pdump.tslot

    ############################################## lock in kdb ##############################################
    #                                                                                                       #
    # for each thread in this section, f and f -v output are collected.
    # f is for easy copy/paste into pmr/email
    # f -v is more comprehensive than "set 10; set 18".

    # Get lock info if any thread is waiting on lock
    # Sample:
    # pvthread+01F300  499 harmad  SLEEP 1F3001 03C  9  0 F10000E33E1D8000 slist_table+000800
    # pvthread+020900  521 lspv    SLEEP 209061 03C  2  0 082522C0 slist_table+000E20
    # select threads based on unique name and lock address,
    # because if two threads have the same name and lock address,
    # chances are they are in the same stack.
    # "uniq -f1" means ignore one field (tslot) when determining uniqueness.
    COMMS=$COMMS"$BK lq"
    COMMS=$COMMS"$BK th -w WLOCK"
    COMMS=$COMMS"$BK th -w WSLOCK"

    print "th -w WLOCK \n th -w WSLOCK" | kdb | grep '^pvthread' | sort -k9 -k3 > pdump.lock
    awk '{ print $2, $3, $9 }' pdump.lock | uniq -f1 |
    while read tslot tname lkaddr
    do
        ## clk command covers slk just fine.
        COMMS=$COMMS"$BK clk $lkaddr $BK f $tslot $BK f -v $tslot"
    done
    rm -f pdump.lock

    # Get stack for those holding a lock (in earlier version we only check SLEEP,
    # but now for all but "kdb_64").
    # Sample:
    # pvthread+018500  389 scopeux SLEEP 185015 03C  8  0 F10000E33E1D9538
    # pvthread+01F300  499 harmad  SLEEP 1F3001 03C  9  0 F10000E33E1D8000 slist_table+000800
    # pvthread+02EB00  747 sas     SLEEP 2EB04F 064 13  0 vmmswpft+77793160
    # pvthread+026D00  621!kdb_64  RUN   26D0067 061 24 0
    COMMS=$COMMS"$BK th -lk"
    print "th -lk" | kdb |
        awk '/^pvthread|^thread/ { sub(/!/, " "); sub(/>/, " "); if ($3 != "kdb_64") print $2}' |
    while read tslot
    do
        COMMS=$COMMS"$BK f $tslot $BK f -v $tslot"
    done

    ## 01/25/2006: dla may generate endless "Out of lock descriptors" error
    # COMMS=$COMMS"$BK dla"
    #                                                                                                       #
    ############################################## end of lock ##############################################

    # I/O
    # mounted filesystem
    COMMS=$COMMS"$BK vfs"
    COMMS=$COMMS"$BK pdt *"
    COMMS=$COMMS"$BK th -w WPGIN"

    # runqueue
    COMMS=$COMMS"$BK rq"
    COMMS=$COMMS"$BK rqi"
    COMMS=$COMMS"$BK th -r"

    # kernel extension:
    COMMS=$COMMS"$BK lke"

    if [[ -n $LONGMODE ]]
    then
        # library
        COMMS=$COMMS"$BK lle -l32"
        COMMS=$COMMS"$BK lle -l64"

        # IPC :
        COMMS=$COMMS"$BK ipc 1 1"
        COMMS=$COMMS"$BK ipc 2 1"
        COMMS=$COMMS"$BK ipc 3 1"
    fi

    # send commands to kdb
    # "$COMMS" MUST be quoted: unquoted, the '*' of "pdt *" is expanded by the
    # shell into the file names of the current directory before kdb sees it.
    # 'print' still turns the \n sequences of BK into real newlines.
    print "\texecuting kdb commands ..."
    print "$COMMS" | kdb | sed -n -e '/^(.*)>/,$p' >> "$OFILE"
}
## END of run_kdb()

#################################################################################################### proc
#
# run_proctools: append the output of the /proc based tools to the output file.
#   Globals: PID (read), OFILE (read, appended to)
#
run_proctools()
{
    typeset tool

    print "\nDumping process information with proc tools ...\n"

    ## proctree is not in this list: it is now collected in the general section
    for tool in proccred procfiles procflags procldd procmap procsig procstack procwdx
    do
        print "\n# $tool $PID" >> "$OFILE"
        $tool "$PID" >> "$OFILE" 2>&1
    done
}

#################################################################################################### dbx
#
# run_dbx: attach dbx to the process and dump user-space state.
#   Globals: PID (read), OFILE (read, appended to),
#            DBXCMD, BRK, SECT, COMMS (written) - work variables
#
# First session : stack, registers, map, pthread objects and thread list of
#                 the current thread.
# Second session: only when the thread list of the first session shows kernel
#                 thread ids ("k-tid"); dumps registers/stack per thread.
# Finally the object files named in the dbx output are listed with 'ls -l',
# following symbolic links.
#
run_dbx()
{
    typeset obj realobj hops

    print "\nDumping process information from dbx ...\n"

    DBXCMD="/usr/bin/dbx -a $PID"      ## intentionally expanded unquoted below

    print "\n# dbx -a $PID \n" >> "$OFILE"

    ## Get the current thread's output first.
    ## Each "p '(dbx) xxx'" echoes a fake prompt so the output is readable.
    print "\n p '(dbx) where'              \n where \
           \n p '(dbx) x'                  \n x \
           \n p '(dbx) (\$stkp)/200'       \n (\$stkp)/200 \
           \n p '(dbx) map'                \n map \
           \n p '(dbx) p __n_pthreads'     \n p __n_pthreads \
           \n p '(dbx) p __multi_threaded' \n p __multi_threaded \
           \n p '(dbx) mutex'              \n mutex \
           \n p '(dbx) condition'          \n condition \
           \n p '(dbx) rwlock'             \n rwlock \
           \n p '(dbx) dump .'             \n dump . \
           \n p '(dbx) th'                 \n th \
           \n p '(dbx) detach'             \n detach" | $DBXCMD >> "$OFILE" 2>&1

    if [[ $? -ne 0 ]]; then
        print -u2 "Can not dbx attach to the process. Skip dbx."
        return
    fi

    ## multi-threaded
    if sed -n -e '/(dbx) th/,$p' "$OFILE" | grep "k-tid" >/dev/null 2>&1
    then
        ## build a command list
        BRK='p "."'
        SECT=">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
        COMMS="$BRK\n"
        COMMS=$COMMS"p '$SECT thread info'\n$BRK\n thread info\n$BRK\n"

        ## Since thread number is not necessarily continuous,
        ## we have to get the exact thread number.
        sed -n -e '/(dbx) th/,$p' "$OFILE" | awk '/^.\$t[0-9]/ {print $1}' |
        while read TPT
        do
            PT=${TPT#*t}
            print "\tdumping tid $PT ..."
            COMMS=$COMMS"p '$SECT thread current $PT' \n$BRK\n thread current $PT\n"
            COMMS=$COMMS"p '$SECT x $PT' \n$BRK\n x \n$BRK\n"
            COMMS=$COMMS"p '$SECT where $PT' \n$BRK\n where \n$BRK\n"
            COMMS=$COMMS"p '$SECT (\$stkp)/100 $PT' \n$BRK\n (\$stkp)/100 \n$BRK\n"
        done
        COMMS=$COMMS"p '(dbx) detach' \n detach"

        print "\n# dbx -a $PID \n" >> "$OFILE"
        print "$COMMS" | $DBXCMD 2>/dev/null >> "$OFILE"
    fi

    print "\tlisting object files ..."
    print "\n + List of object files: \n" >> "$OFILE"

    ## sort emits nothing before it has read all of its input, so awk is done
    ## reading $OFILE before the loop starts appending to it.
    awk '/Object name: / { print $NF }' "$OFILE" | sort -u |
    while read obj
    do
        ## the main program may not locate in the current directory
        ## so ls -l may return error > /dev/null
        realobj=$obj
        hops=0
        while :
        do
            /bin/ls -l "$realobj" >> "$OFILE" 2>/dev/null

            ## stop unless it is a symbolic link; also stop on a link cycle
            /bin/ls -l "$realobj" 2>/dev/null | grep '^lrwx' >/dev/null 2>&1 || break
            hops=$((hops + 1))
            [[ $hops -gt 32 ]] && break

            realobj=$(/bin/ls -l "$realobj" 2>/dev/null | awk '{print $NF}')
        done
    done
}
## END of run_dbx

#################################################################################################### main

while getopts :kdplh flag; do
    case $flag in
        k)  NOKDB=1 ;;
        d)  USEDBX=1 ;;
        p)  PROCTOOL=1 ;;
        l)  LONGMODE=1 ;;
        h)  print_usage
            exit 0 ;;
        \?) print -u2 "\nInvalid parameter"
            print_usage
            exit 1 ;;
    esac
done
shift $((OPTIND - 1))

## check tools
{
    ## check permission to run kdb
    if [[ -z $NOKDB && $(whoami) != root ]]; then
        print -u2 "\n'root' authority is required for kdb (use '-k' to skip kdb).\n"
        exit 1
    fi

    ## check dbx tool
    if [[ -n $USEDBX && (! -f /usr/bin/dbx) ]]; then
        print -u2 "/usr/bin/dbx doesn't exist. Install bos.adt.debug or omit '-d' to skip dbx."
        exit 1
    fi
}

## validate PID
{
    ## need one parameter
    if [[ $# -ne 1 ]]; then
        print_usage
        exit 1
    fi

    ## numeric? (non-empty, and nothing left after stripping a run of digits)
    if [[ -z $1 || -n ${1##+([0-9])} ]]; then
        print -u2 "\n$1 is not a PID"
        print_usage
        exit 1
    fi

    ## existing PID?
    /bin/ps -p "$1" > /dev/null 2>&1
    if [[ $? -ne 0 ]]; then
        print -u2 "\nPID $1 doesn't exist.\n"
        exit 1
    fi

    PID=$1
}

## create output file
{
    OFILE="pdump.$(/bin/ps -p "$PID" -ocomm=).$PID.$(date +%d%h%Y-%H.%M.%S).out"
    > "$OFILE"
    if [[ $? -ne 0 ]]; then
        print -u2 "\nCannot create output file in the current directory. Please check permission.\n"
        exit 1
    fi
}

trap clean_up TERM INT

## collect output
{
    get_general_config

    if [[ -z $NOKDB ]]; then
        ## from PID to PSLT
        ## (BIT, the kernel bitness, is set by get_general_config when root)
        osl=$(oslevel | cut -c1-3)
        if [[ $BIT -eq 32 ]]; then
            PSLT=$(($PID / 256))
        elif [[ $osl = "5.1" || $osl = "4.3" ]]; then
            PSLT=$(($PID / 8192))
        else
            ## PSLT calculation changed from 6.1.5.0 due to feature 716192
            TL=$(lslpp -l bos.mp64 | awk '/mp64/ {print $2; exit}' | cut -d'.' -f3)
            TL=${TL:-0}
            if [[ $osl = "5.3" || ($osl = "6.1" && $TL -lt 5) ]]; then
                PSLT=$(echo "$PID/256%16*16384+$PID/4096" | bc)
            else
                PSLT=$(echo "$PID/256%256*1024+$PID/65536" | bc)
            fi
        fi

        # init is an exception (1/$DIVISOR = 0)
        if [[ $PID -eq 1 ]]; then
            PSLT=1
        fi

        run_kdb
    fi

    ## -d implies -p
    if [[ -f /usr/bin/procstack ]]; then
        if [[ -n $PROCTOOL || -n $USEDBX ]]; then
            run_proctools       ## new in 5.2+
        fi
    fi

    if [[ -n $USEDBX ]]; then
        run_dbx
    fi

    # extra data
    {
        print "\n# ps -mp $PID -o THREAD\n"
        ps -mp "$PID" -o THREAD

        print "\n# ps auxeww $PID\n"
        ps auxeww "$PID"

        print "\n# ps -efk \n"
        ps -efk

        if [[ -n $LONGMODE ]]; then
            print "\n# ipcs -a \n"
            ipcs -a

            if [[ $(oslevel) = "5.3.0.0" ]]; then
                print "\n# genld -ld | egrep -p \"^Proc_pid:[[:blank:]]+$PID[[:blank:]]\"\n"
                genld -ld | egrep -p "^Proc_pid:[[:blank:]]+$PID[[:blank:]]"

                print "\n# genkld -d \n"
                genkld -d
            else
                print "\n# genld -l | egrep -p \"^Proc_pid:[[:blank:]]+$PID[[:blank:]]\"\n"
                genld -l | egrep -p "^Proc_pid:[[:blank:]]+$PID[[:blank:]]"
            fi

            ## network stuff
            netstat -Aan
        fi
    } >> "$OFILE"
}

print "\nDone.\nOutput file is $OFILE\n"
/bin/ls -l "$OFILE"
print

## The End

## Change Log:
## 07.31.2007 - move some long output to $LONGMODE only, such as ipc, sys loader, netstat, etc., to reduce default output file size
##            - add dr iar & segment register commands
## 08.21.2008 - add file output for each ufd entry
## 02.02.2009 - add svmon -G output
## 02.25.2011 - Srinivasa Rao: Effective from 61TL05 the calculations for PROCSLOT has changed due to enhanced affinity feature 716192.
## 03.02.2011 - Jun Kuwahara: $klvl >= "6.1.5.0" does not work, -ge does,
##            - change code to be more strict: '=' for oslevel string, '-lt' for TL arithmetic.
## 03.29.2011 - a mistake in PID -> pslot was caught by Shang Li
## 01.08.2013 - Yuta Chiba: kdb doesn't always run on cpu 0, change prompt match from ^(0) to ^(.*)>
## 01.18.2015 - add 'WSLOCK' since WLOCK no longer covers WSLOCK; add stacks for non-SLEEP lock owners