#!/bin/sh

 # Copyright (C) 2007 Dejan Muhamedagic <dmuhamedagic@suse.de>
 # 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 # 
 # This software is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # General Public License for more details.
 # 
 # You should have received a copy of the GNU General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 #

. /etc/ha.d/shellfuncs
. $HA_NOARCHBIN/utillib.sh

PROG=$(basename $0)
# FIXME: once this is part of the package!
# If we were started through a relative path, use one of the
# well-known directories which holds a program of our name;
# PROGDIR is the path used to start hb_report on the other nodes
PROGDIR=$(dirname $0)
if ! echo "$PROGDIR" | grep -qs '^/'; then
	if test -f /usr/local/sbin/$PROG; then
		PROGDIR=/usr/local/sbin
	fi
	if test -f $HA_NOARCHBIN/$PROG; then
		PROGDIR=$HA_NOARCHBIN
	fi
fi

# whatever findlogdcf reports is exported to child processes
LOGD_CF=$(findlogdcf /etc $HA_DIR)
export LOGD_CF

# SSH_OPTS may be preset in the environment (an empty value counts too)
[ "${SSH_OPTS+set}" ] || SSH_OPTS="-T"
LOG_PATTERNS="CRIT: ERROR:"
PEINPUTS_PATT="peng.*PEngine Input stored"

#
# the instance where user runs hb_report is the master
# the others are slaves
#
case "$1" in
__slave) SLAVE=1;;
esac

#
# if this is the master, allow ha.cf and logd.cf in the current dir
# (because often the master is the log host)
#
if [ -z "$SLAVE" ]; then
	if [ -f ha.cf ]; then
		HA_CF=ha.cf
	fi
	if [ -f logd.cf ]; then
		LOGD_CF=logd.cf
	fi
fi

#
# Print the usage message and exit.
#   $1 - "short": print only the synopsis and exit with status 1;
#        anything else: also print the notes and examples, then
#        exit with the status of the last cat (normally 0)
#
usage() {
	cat <<EOF
usage: hb_report -f {time|"cts:"testnum} [-t time] [-u user] [-l file] [-p patt]
       [-L patt] [-e prog] [-SDC] dest

	-f time: time to start from or a CTS test number
	-t time: time to finish at (dflt: now)
	-u user: ssh user to access other nodes (dflt: empty, hacluster, root)
	-l file: log file
	-p patt: regular expression to match variables to be removed;
	         this option is additive (dflt: "passw.*")
	-L patt: regular expression to match in log files for analysis;
	         this option is additive (dflt: $LOG_PATTERNS)
	-e prog: your favourite editor
	-D     : don't invoke editor to write description
	-C     : remove the destination directory
	-S     : single node operation; don't try to start report
	         collectors on other nodes
	dest   : destination directory
EOF

	# the short form stops here
	if [ "$1" = short ]; then
		exit 1
	fi

	cat <<EOF

	. the multifile output is first stored in a directory {dest}
	  of which a tarball {dest}.tar.gz is created
	. the time specification is as in either Date::Parse or
	  Date::Manip, whatever you have installed; Date::Parse is
	  preferred
	. we try to figure where is the logfile; if we can't, please
	  clue us in

	Examples

	  hb_report -f 2pm /tmp/report_1
	  hb_report -f "2007/9/5 12:30" -t "2007/9/5 14:00" /tmp/report_2
	  hb_report -f 1:00 -t 3:00 -l /var/log/cluster/ha-debug /tmp/report_3
	  hb_report -f "09sep07 2:00" -u hbadmin /tmp/report_4
	  hb_report -f 18:00 -p "usern.*" -p "admin.*" /tmp/report_5
	  hb_report -f cts:133 /tmp/ctstest_133

	. WARNING . WARNING . WARNING . WARNING . WARNING . WARNING .
	  We try to sanitize the CIB and the peinputs files. If you
	  have more sensitive information, please supply additional
	  patterns yourself. The logs and the crm_mon, ccm_tool, and
	  crm_verify output are *not* sanitized.
	  IT IS YOUR RESPONSIBILITY TO PROTECT THE DATA FROM EXPOSURE!
EOF
	exit
}
#
# these are "global" variables
#
setvarsanddefaults() {
	# Reset every option-controlled variable to its default value.
	# Takes no arguments; "now" (seconds since the epoch, from perl)
	# is embedded in the marker message UNIQUE_MSG.

	# used only by the master
	NO_SSH="" SSH_USER="" SLAVEPIDS="" NO_DESCRIPTION=""
	TRY_SSH="hacluster root"

	# used by all
	DESTDIR="" FROM_TIME="" CTS="" HA_LOG="" REMOVE_DEST=""
	TO_TIME=0
	SANITIZE="passw.*"
	now=$(perl -e 'print time()')
	UNIQUE_MSG="Mark:HB_REPORT:$now"
}
#
# Validate the name of the destination directory.
#   $1 - the directory name; the caller passes it unquoted, so a
#        name containing blanks arrives as several arguments and
#        is rejected
# Prints the short usage (and exits) if no name was given; calls
# fatal if more than one word was given, if the name is not an
# absolute path, or if it is "/".
# Returns 0 for an acceptable name.
#
chkdirname() {
	[ "$1" ] || usage short
	[ $# -ne 1 ] && fatal "bad directory name: $1"
	# a case pattern match neither globs nor word-splits the name,
	# and an accepted name leaves the function with status 0
	case "$1" in
	/)
		fatal "no root here, thank you";;
	/*)
		;;
	*)
		fatal "destination directory must be an absolute path";;
	esac
}
#
# Make sure a time specification could be converted.
#   $1 - the converted time (empty if the conversion failed)
#   $2 - the specification as typed by the user, for the message
#
chktime() {
	if [ -z "$1" ]; then
		fatal "bad time specification: $2"
	fi
}
# give up: the destination directory ($DESTDIR) is already there
msgcleanup() {
	fatal \
		"destination directory ${DESTDIR} exists, please cleanup"
}
# give up: the destination directory ($DESTDIR) is not there
nodistdirectory() {
	fatal \
		"could not create the destination directory ${DESTDIR}"
}
time2str() {
	perl -e "use POSIX; print strftime('%x %X',localtime($1));"
}

#
# find log files
#
logmarks() {
	# Send a marker message to syslog on all nodes.
	#   $1 - syslog severity
	#   $2 - the message
	# On this host the logger command is run directly; for the
	# other nodes the command line is piped to ssh, provided
	# that ssh was found to work ($ssh_good).
	sev=$1
	msg=$2
	c="logger -p $HA_LOGFACILITY.$sev $msg"

	for n in $(getnodes); do
		if [ "$(uname -n)" != "$n" ]; then
			[ "$ssh_good" ] &&
				echo $c | ssh $ssh_opts $n
		else
			$c
		fi
	done
}
#
# Print the name of the log file to use.
# Without a syslog facility that is the debug file or, if that is
# not set, the log file; otherwise it is the first word of what
# findmsg prints for our marker message.
#
findlog() {
	if [ -z "$HA_LOGFACILITY" ]; then
		echo ${HA_DEBUGFILE:-$HA_LOGFILE}
		return
	fi
	findmsg $UNIQUE_MSG | awk '{print $1}'
}

#
# find log slices
#
findlogseg() {
	# Find the first and the last line of a period in a log file.
	#   $1 - log file
	#   $2 - start time
	#   $3 - end time; 0 stands for "up to the end"
	# The results are left in FROM_LINE and TO_LINE (the latter
	# is empty if the end time is 0). A warning is issued if a
	# line cannot be found.
	logf=$1 from_time=$2 to_time=$3

	FROM_LINE=$(findln_by_time $logf $from_time)
	[ "$FROM_LINE" ] || {
		warning "couldn't find line for time $from_time; corrupt log file?"
		return
	}

	TO_LINE=""
	[ "$to_time" = 0 ] && return

	TO_LINE=$(findln_by_time $logf $to_time)
	[ "$TO_LINE" ] && return
	warning "couldn't find line for time $to_time; corrupt log file?"
}

#
# Find the log segment which belongs to a CTS test.
#   $1 - log file
#   $2 - CTS test number
# The segment starts at the "Running test" line of the test and
# ends at that of the following test or, if there is none, at the
# last line of the file. Sets FROM_LINE, FROM_TIME, TO_LINE, and
# TO_TIME; issues a warning if the test cannot be found.
#
cts_findlogseg() {
	logf=$1 testnum=$2

	FROM_LINE=$(grep -n "Running test.*\[$testnum\]" $logf | cut -d: -f1)
	if [ -z "$FROM_LINE" ]; then
		warning "couldn't find line for CTS test $testnum; corrupt log file?"
		return
	fi
	FROM_TIME=$(linetime $logf $FROM_LINE)

	TO_LINE=$(grep -n "Running test.*\[$((testnum+1))\]" $logf | cut -d: -f1)
	if [ -z "$TO_LINE" ]; then
		TO_LINE=$(wc -l < $logf)
	fi
	TO_TIME=$(linetime $logf $TO_LINE)
}

#
# this is how we pass environment to other hosts
#
dumpenv() {
	# Print, as shell assignments, the settings which the other
	# nodes get in their .env file (see send_config).
	printf '%s\n' \
		"FROM_TIME=$FROM_TIME" \
		"CTS=$CTS" \
		"TO_TIME=$TO_TIME" \
		"HA_LOG=$HA_LOG" \
		"MASTER_IS_HOSTLOG=$MASTER_IS_HOSTLOG" \
		"DESTDIR=$DESTDIR" \
		"UNIQUE_MSG=$UNIQUE_MSG" \
		"SANITIZE=\"$SANITIZE\"" \
		"REMOVE_DEST=\"$REMOVE_DEST\""
}
#
# Create the destination directory on all other nodes and store
# the output of dumpenv there as .env.
#
send_config() {
	for node in $(getnodes); do
		if [ "$node" != "$WE" ]; then
			dumpenv |
			ssh $ssh_opts $node "mkdir -p $DESTDIR; cat > $DESTDIR/.env"
		fi
	done
}
#
# Start hb_report in slave mode on all other nodes, in the
# background. What a slave writes to stdout is unpacked with tar
# in the local destination directory. The pid of every background
# job is appended to SLAVEPIDS.
#
start_remote_collectors() {
	for node in $(getnodes); do
		if [ "$node" != "$WE" ]; then
			ssh $ssh_opts $node "$PROGDIR/hb_report __slave $DESTDIR" |
				(cd $DESTDIR && tar xf -) &
			SLAVEPIDS="$SLAVEPIDS $!"
		fi
	done
}

#
# does ssh work?
#
testsshuser() {
	# Try to run "true" on a host using ssh in batch mode.
	#   $1 - host
	#   $2 - user (optional; if empty, ssh picks the user)
	# The exit status is that of ssh; its error messages are
	# discarded.
	ssh -T -o Batchmode=yes ${2:+$2@}$1 true 2>/dev/null
}
#
# Find a user which can ssh to all the other nodes.
# The candidates are tried in turn: first no explicit user at
# all, then the users listed in TRY_SSH.
# Prints the user found (an empty line if no explicit user is
# necessary) and returns 0; returns 1 if no candidate works for
# all nodes.
#
findsshuser() {
	for u in "" $TRY_SSH; do
		rc=0
		for n in `getnodes`; do
			# ssh to ourselves is not needed, so don't let
			# it decide the outcome
			[ "$n" = "$WE" ] && continue
			testsshuser $n $u || {
				rc=1
				break
			}
		done
		if [ $rc -eq 0 ]; then
			echo $u
			return 0
		fi
	done
	return 1
}

#
# the usual stuff
#
getbacktraces() {
	# Write the output of getbt for the files which find_files
	# reports in $HA_VARLIB/cores to a file.
	#   $1, $2 - passed on to find_files after the directory
	#            (the caller supplies the start and end time)
	#   $3     - output file; not touched if no files are found
	# Returns 1 if there are no files.
	flist=$(find_files $HA_VARLIB/cores $1 $2)
	[ -n "$flist" ] || return 1
	getbt $flist > $3
}
#
# Copy the policy engine input files into the report.
#   $1, $2 - passed on to find_files (the caller supplies the
#            start and end time)
#   $3     - node directory of the report
# The file names are taken from the log in the node directory
# or, if that is missing, from the log one level up (central
# log): it is the last word of the lines which match
# PEINPUTS_PATT. With a syslog facility the match is narrowed by
# the name of the node directory. If there is no log, find_files
# is asked. The files are copied with tar, with their path
# relative to $HA_VARLIB.
# Returns 1 if there is nothing to copy.
#
getpeinputs() {
	patt="$PEINPUTS_PATT"
	if [ "$HA_LOGFACILITY" ]; then
		n=$(basename $3)
		patt=" $n $patt"
	fi

	if [ -f $3/ha-log ]; then
		pe_log=$3/ha-log
	elif [ -f $3/../ha-log ]; then # central log
		pe_log=$3/../ha-log
	else
		pe_log=""
	fi

	flist=$(
		if [ "$pe_log" ]; then
			grep "$patt" $pe_log | awk '{print $NF}'
		else
			find_files $HA_VARLIB/pengine $1 $2
		fi | sed "s,$HA_VARLIB/,,g"
	)
	[ -n "$flist" ] || return 1
	(cd $HA_VARLIB && tar cf - $flist) | (cd $3 && tar xf -)
}
#
# Create the (empty) file DC in directory $1 if the last word of
# the "crmadmin -D" output is the name of this node ($WE).
#
touch_DC_if_dc() {
	dc=$(crmadmin -D 2>/dev/null | awk '{print $NF}')
	case "$dc" in
	"$WE")
		touch $1/DC;;
	esac
}

#
# some basic system info and stats
#
sys_info() {
	# Print version and platform information, one item per line;
	# the distribution is printed only if uname says Linux.
	si_platform=$(uname)
	echo "Heartbeat version: $(hb_ver)"
	crm_info
	echo "Platform: $si_platform"
	echo "Kernel release: $(uname -r)"
	echo "Architecture: $(arch)"
	[ $si_platform = Linux ] &&
		echo "Distribution: $(distro)"
}
sys_stats() {
	# Run a fixed set of commands which show the system load, the
	# processes, and the network interfaces and arp table.
	# Tracing is switched on for the duration, so the shell prints
	# every command line on stderr ahead of its output; the caller
	# redirects both stdout and stderr to the same file.
	set -x
	uptime
	ps axf
	ps auxw
	top -b -n 1
	netstat -i
	arp -an
	set +x
}

#
# replace sensitive info with '****'
#
sanitize() {
	# Run sanitize_one on ha.cf, cib.xml, and on everything in
	# the pengine subdirectory of directory $1; names which are
	# not regular files are skipped.
	for f in \
		$1/ha.cf \
		$1/cib.xml \
		$1/pengine/*
	do
		test -f "$f" && sanitize_one $f
	done
}

#
# remove duplicates if files are same, make links instead
#
consolidate() {
	# Keep just one copy of a file which was found to be the
	# same on all nodes.
	#   $1 - report directory
	#   $2 - file name
	# The copy of the first node is moved to the report
	# directory, the copies of the others are removed; every
	# node directory gets a symbolic link to ../$2 instead.
	for n in $(getnodes); do
		if [ ! -f $1/$2 ]; then
			mv $1/$n/$2 $1
		else
			rm $1/$n/$2
		fi
		ln -s ../$2 $1/$n
	done
}

#
# some basic analysis of the report
#
checkcrmvfy() {
	# For every node with a non-empty crm_verify.txt in its
	# directory under $1, print a warning followed by the file.
	for n in $(getnodes); do
		[ -s $1/$n/crm_verify.txt ] || continue
		echo "WARN: crm_verify reported warnings at $n:"
		cat $1/$n/crm_verify.txt
	done
}
#
# For every node with a non-empty backtraces.txt in its directory
# under $1, print a warning and, indented by a tab, the lines of
# the file which tell which program dumped core and how it was
# terminated.
#
checkbacktraces() {
	for n in `getnodes`; do
		[ -s $1/$n/backtraces.txt ] && {
			echo "WARN: coredumps found at $n:"
			# grep -E and not egrep: the latter is obsolescent
			# and recent GNU grep warns about it on stderr
			grep -E 'Core was generated|Program terminated' \
					$1/$n/backtraces.txt |
				sed 's/^/	/'
		}
	done
}
#
# Print the lines of the collected logs which match one of the
# patterns in LOG_PATTERNS.
#   $1 - report directory; all files named ha-log below it are
#        searched
# Prints nothing and returns 1 if there are no logs.
#
checklogs() {
	logs=`find $1 -name ha-log`
	[ "$logs" ] || return
	pattfile=`maketempfile` ||
		fatal "cannot create temporary files"
	# one pattern per line, as grep -f wants it
	for p in $LOG_PATTERNS; do
		echo "$p"
	done > $pattfile
	echo ""
	echo "Log patterns:"
	# $logs already holds the logs of all nodes, so a single pass
	# does it; repeating it per node would only repeat the
	# matches (cat keeps grep from prefixing the file names)
	cat $logs | grep -f $pattfile
	rm -f $pattfile
}

#
# check if files have same content in the cluster
#
cibdiff() {
	# Compare two CIBs ($1, $2) using crm_diff.
	# That is done only if the directories of both files hold a
	# file named RUNNING or both hold a file named STOPPED;
	# otherwise a message is printed. The exit status is that of
	# crm_diff if it was run.
	d1=$(dirname $1)
	d2=$(dirname $2)
	for cib_state in RUNNING STOPPED; do
		if [ -f $d1/$cib_state ] && [ -f $d2/$cib_state ]; then
			crm_diff -c -n $1 -o $2
			return
		fi
	done
	echo "can't compare cibs from running and stopped systems"
}
# compare two text files ($1, $2) with diff; the status is diff's
txtdiff() {
	diff $1 $2
	return $?
}
#
# Compare two files ($1, $2) in the way which suits the file.
# The kind of comparison is chosen by the base name of $1:
# cib.xml goes to cibdiff, ccm_tool.txt, ha.cf, crm_mon.txt, and
# sysinfo.txt go to txtdiff; other files are not compared.
# Returns 1, with a message, if one of the files does not exist;
# otherwise the status of the comparison.
#
diffcheck() {
	for dc_file in "$1" "$2"; do
		[ -f "$dc_file" ] || {
			echo "$dc_file does not exist"
			return 1
		}
	done
	case $(basename $1) in
	cib.xml)
		cibdiff $1 $2;;
	# ccm_tool.txt: worddiff?  ha.cf: confdiff?
	ccm_tool.txt|ha.cf|crm_mon.txt|sysinfo.txt)
		txtdiff $1 $2;;
	esac
}
#
# Compare a file of the first node with the same file of each of
# the other nodes.
#   $1 - report directory
#   $2 - file name
# Returns the sum of the diffcheck exit codes, i.e. 0 if no
# comparison found a difference.
#
analyze_one() {
	rc=0 node0=""
	for n in $(getnodes); do
		if [ -z "$node0" ]; then
			# the first node is the one to compare with
			node0=$n
			continue
		fi
		diffcheck $1/$node0/$2 $1/$n/$2
		rc=$((rc+$?))
	done
	return $rc
}
#
# Analyze the report in directory $1 and print the findings.
# A fixed list of files is compared between the nodes: for each
# one which at least one node has, "Diff <file>... " is printed,
# followed by OK or varies. Files which do not vary are
# consolidated (see consolidate), with the exception of cib.xml.
# Then the crm_verify output, the backtraces, and the logs are
# checked.
#
analyze() {
	flist="ccm_tool.txt cib.xml crm_mon.txt ha.cf logd.cf sysinfo.txt"
	for f in $flist; do
		# look for the file first: a file which no node has
		# must not leave an unfinished "Diff ..." line behind
		ls $1/*/$f >/dev/null 2>&1 || continue
		printf 'Diff %s... ' "$f"
		if analyze_one $1 $f; then
			echo "OK"
			[ "$f" != cib.xml ] &&
				consolidate $1 $f
		else
			echo "varies"
		fi
	done
	checkcrmvfy $1
	checkbacktraces $1
	checklogs $1
}

#
# description template, editing, and other notes
#
mktemplate() {
	# Print the problem description template for the user to
	# fill in. The system information follows the template: the
	# common one in $DESTDIR if there is one, otherwise that of
	# every node which has it (indented by a tab).
	cat <<EOF
Please edit this template and describe the issue/problem you
encountered. Then, post to
	Linux-HA@lists.linux-ha.org
or file a bug at
	http://old.linux-foundation.org/developer_bugzilla/

See http://linux-ha.org/ReportingProblems for detailed
description on how to report problems.

Thank you.

Date: $(date)
By: $PROG $userargs
Subject: [short problem description]
Severity: [choose one] enhancement minor normal major critical blocking
Component: [choose one] CRM LRM CCM RA fencing heartbeat comm GUI tools other

Detailed description:
---
[...]
---

EOF

	if [ ! -f $DESTDIR/sysinfo.txt ]; then
		for n in $(getnodes); do
			[ -f $DESTDIR/$n/sysinfo.txt ] || continue
			echo "System info $n:"
			sed 's/^/	/' $DESTDIR/$n/sysinfo.txt
		done
	else
		echo "Common system info found:"
		cat $DESTDIR/sysinfo.txt
	fi
}
#
# Start an editor on file $1.
# The editor is what pickfirst prints for $EDITOR, vim, vi,
# emacs, and nano; if pickfirst fails, a warning is issued and
# the file is left alone.
#
edittemplate() {
	ec=$(pickfirst $EDITOR vim vi emacs nano) || {
		warning "could not find a text editor"
		return
	}
	$ec $1
}
# tell the user where the tarball is, and say thanks
finalword() {
	printf '%s\n' \
		"The report is saved in $DESTDIR.tar.gz." \
		"" \
		"Thank you for taking time to create this report."
}
#
# Print a note if the size of the tarball, as the first column
# of "ls -s" has it, is 100 or more.
#
checksize() {
	if ls -s $DESTDIR.tar.gz | awk '$1>=100{exit 1}'; then
		return
	fi
	cat <<EOF

NB: size of the tarball exceeds 100kb and if posted to the
mailing list will have to be first approved by the moderator.
Try reducing the period (use the -f and -t options).
EOF
}

# no arguments at all: print the complete usage
if [ $# -eq 0 ]; then
	usage
fi

# check for the major prereq for a) parameter parsing and b)
# parsing logs
#
# (a slave carries on without str2time, it just cannot slice logs)
NO_str2time=""
t=$(str2time "12:00")
if [ -z "$t" ]; then
	NO_str2time=1
	if [ -z "$SLAVE" ]; then
		fatal "please install the perl Date::Parse module"
	fi
fi

WE=$(uname -n)  # who am i?
THIS_IS_NODE=""
if getnodes | grep -wqs $WE; then # are we a node?
	THIS_IS_NODE=1
fi
getlogvars

#
# part 1: get and check options; and the destination
#
# (master only; a slave gets its settings from the .env file)
if [ "$SLAVE" = "" ]; then
	setvarsanddefaults
	# the command line as given; it is put into the description
	# template (see mktemplate)
	userargs="$@"
	while getopts f:t:l:u:p:L:e:SDCh o; do
		case "$o" in
			h) usage;;
			f)
				# either "cts:<test number>" or a time specification
				if echo "$OPTARG" | grep -qs '^cts:'; then
					FROM_TIME=0 # to be calculated later
					CTS=`echo "$OPTARG" | sed 's/cts://'`
				else
					FROM_TIME=`str2time "$OPTARG"`
					chktime "$FROM_TIME" "$OPTARG"
				fi
			;;
			t) TO_TIME=`str2time "$OPTARG"`
			    chktime "$TO_TIME" "$OPTARG"
			;;
			u) SSH_USER="$OPTARG";;
			l) HA_LOG="$OPTARG";;
			e) EDITOR="$OPTARG";;
			# -p and -L are additive: the pattern is appended to the list
			p) SANITIZE="$SANITIZE $OPTARG";;
			L) LOG_PATTERNS="$LOG_PATTERNS $OPTARG";;
			S) NO_SSH=1;;
			D) NO_DESCRIPTION=1;;
			C) REMOVE_DEST=1;;
			# getopts sets o to ? on a bad option
			[?]) usage short;;
		esac
	done
	shift $((OPTIND-1))
	# exactly one argument must be left: the destination
	[ $# -ne 1 ] && usage short
	# strip trailing slashes
	DESTDIR=`echo $1 | sed 's,/*$,,'`
	chkdirname $DESTDIR
	# -f is mandatory
	[ "$FROM_TIME" ] || usage short
fi

# this only on master
if [ "$SLAVE" = "" ]; then
#
# part 2: ssh business
#
	# find out if ssh works
	ssh_good=""
	if [ -z "$NO_SSH" ]; then
		[ "$SSH_USER" ] ||
			SSH_USER=`findsshuser`
		# $? is 0 if the user was given with -u (the test above
		# succeeded; that user is not tried out here) or if
		# findsshuser found one
		if [ $? -eq 0 ]; then
			ssh_good=1
			# an empty SSH_USER leaves the choice of user to ssh
			if [ "$SSH_USER" ]; then
				ssh_opts="-l $SSH_USER $SSH_OPTS"
			else
				ssh_opts="$SSH_OPTS"
			fi
		fi
	fi
# final check: don't run if the destination directory exists
	[ -d $DESTDIR ] && msgcleanup
	# the same check on all other nodes, if we can reach them
	[ "$ssh_good" ] &&
		for node in `getnodes`; do
			[ "$node" = "$WE" ] && continue
			ssh $ssh_opts $node "test -d $DESTDIR" &&
				msgcleanup
		done
fi

# the master creates the destination directory; on a slave it
# must exist already (the master's send_config creates it and
# puts the settings there in .env)
if [ -z "$SLAVE" ]; then
	mkdir -p $DESTDIR
	[ -d $DESTDIR ] || nodistdirectory
else
	DESTDIR=$2
	[ -d $DESTDIR ] || nodistdirectory
	. $DESTDIR/.env
fi

#
# part 3: log marks to be searched for later
#         important to do this now on _all_ nodes
#         (master only, and only if syslog is in use)
#
if [ -z "$SLAVE" ] && [ "$HA_LOGFACILITY" ]; then
	sev="info"
	cfdebug=$(getcfvar debug) # prefer debuglog if set
	isnumber $cfdebug || cfdebug=0
	if [ "$cfdebug" -a $cfdebug -gt 0 ]; then
		sev="debug"
	fi
	logmarks $sev $UNIQUE_MSG
fi

# only cluster nodes need their own directories
if [ "$THIS_IS_NODE" ]; then
	mkdir -p $DESTDIR/$WE
fi

#
# part 4: find the logs and cut out the segment for the period
#
if [ "$HA_LOG" ]; then  # log provided by the user?
	[ -f "$HA_LOG" ] || {  # not present
		[ "$SLAVE" ] ||  # warning if not on slave
			warning "$HA_LOG not found; we will try to find log ourselves"
		HA_LOG=""
	}
fi
if [ "$HA_LOG" = "" ]; then
	HA_LOG=`findlog`
	# count the marker messages in the log we found
	# (grep -F and not fgrep: the latter is obsolescent and
	# recent GNU grep warns about it on stderr)
	[ "$HA_LOG" ] &&
		cnt=`grep -F -c $UNIQUE_MSG < $HA_LOG`
fi
# a log which holds as many marker messages as there are nodes
# is taken to be the central log
nodecnt=`getnodes | wc -w`
if [ "$cnt" ] && [ $cnt -eq $nodecnt ]; then
	MASTER_IS_HOSTLOG=1
	info "found the central log!"
fi

if [ -f "$HA_LOG" ]; then
	if [ "$NO_str2time" ]; then
		warning "a log found; but we cannot slice it"
		warning "please install the perl Date::Parse module"
	else
		# both functions leave the segment in FROM_LINE and TO_LINE
		if [ "$CTS" ]; then
			cts_findlogseg $HA_LOG $CTS
		else
			findlogseg $HA_LOG $FROM_TIME $TO_TIME
		fi
		dumplog $HA_LOG $FROM_LINE $TO_LINE |
		if [ "$THIS_IS_NODE" ]; then
			cat > $DESTDIR/$WE/ha-log
		else
			cat > $DESTDIR/ha-log # we are log server, probably
		fi
	fi
else
	# no need to complain if the master has the log of all nodes
	[ "$MASTER_IS_HOSTLOG" ] ||
		warning "could not find the log file on $WE"
fi

#
# part 5: start this program on other nodes
#
# (master only) without working ssh the other nodes are left
# out; that is worth a warning unless it was asked for (-S) or
# there are no other nodes
if [ -z "$SLAVE" ]; then
	if [ -z "$ssh_good" ]; then
		[ -z "$NO_SSH" -a $(getnodes | wc -w) -gt 1 ] &&
			warning "ssh does not work to all nodes"
	else
		send_config
		start_remote_collectors
	fi
fi

#
# part 6: get all other info (config, stats, etc)
#
# everything goes to the directory of this node; sanitize runs
# once the configuration and the pengine files are in place
if [ -n "$THIS_IS_NODE" ]; then
	hbr_nodedir=$DESTDIR/$WE
	getconfig $hbr_nodedir
	getpeinputs $FROM_TIME $TO_TIME $hbr_nodedir
	getbacktraces $FROM_TIME $TO_TIME $hbr_nodedir/backtraces.txt
	touch_DC_if_dc $hbr_nodedir
	sanitize $hbr_nodedir
	sys_info > $hbr_nodedir/sysinfo.txt
	sys_stats > $hbr_nodedir/sysstats.txt 2>&1
fi

#
# part 7: endgame:
#         slaves tar their results to stdout, the master waits
#         for them, analyses results, asks the user to edit the
#         problem description template, and prints final notes
#
if [ -z "$SLAVE" ]; then
	# master: wait for the slaves to deliver, then finish the
	# report
	wait $SLAVEPIDS
	analyze $DESTDIR > $DESTDIR/analysis.txt
	mktemplate > $DESTDIR/description.txt
	if [ -z "$NO_DESCRIPTION" ]; then
		echo press enter to edit the problem description...
		read junk
		edittemplate $DESTDIR/description.txt
	fi
	# pack from the parent directory, so that the names in the
	# tarball start with the name of the destination directory
	cd $DESTDIR/..
	tar czf $DESTDIR.tar.gz $(basename $DESTDIR)
	finalword
	checksize
else
	# slave: our directory goes to stdout for the master to unpack
	(cd $DESTDIR && tar cf - $WE)
fi

# -C: remove the destination directory now that we are done.
# This is an if and not a "[ ... ] &&" list because, as the last
# command of the script, its status is the exit status of
# hb_report, which must not be 1 just because -C was not given.
if [ "$REMOVE_DEST" ]; then
	rm -r $DESTDIR
fi
