#!/bin/bash
#
# cs_show_cluster_patterns
#
# (c) 2011-2017 SUSE Linux GmbH, Germany.
# (c) 2018-2022 SUSE LLC
# Author: L.Pinne.
# GNU General Public License v2. No warranty.
# http://www.gnu.org/licenses/gpl.html
#
# Version: 2022-02-11
#
# shellcheck disable=SC1090,SC2140,SC2125
# SC2140 and SC2125 are needed because of the ongoing pattern declaration
# discussion.
#

EXE="$0"

# Optional site configuration. It is sourced, so it may preset CLUSTER_LOG and
# the pattern lists that are only filled in below when still empty.
CFG="/etc/ClusterTools2/cs_show_cluster_patterns"
test -s "$CFG" && source "$CFG"

# Path of the temporary journal dump; stays empty unless the journal is used.
JCTL=""

# Without a configured log file, try the logfile named in corosync.conf.
# Only the first matching, non-comment line is used; blanks are stripped and
# everything after the first colon is taken as path.
if [ -z "${CLUSTER_LOG}" ] && [ -s "/etc/corosync/corosync.conf" ]; then
	CLUSTER_LOG=$(grep "^[^#]*logfile:.*/" /etc/corosync/corosync.conf |
		head -n 1 | tr -d '[:blank:]' | cut -d":" -f2-)
	# A logfile that is configured but missing or empty must not hide the
	# default below.
	test -s "${CLUSTER_LOG}" || CLUSTER_LOG=""
fi
test -z "${CLUSTER_LOG}" &&
	CLUSTER_LOG="/var/log/messages"
ZIPPED_LOG="${CLUSTER_LOG}*bz2"

# No usable log file: fall back to a dump of the journal, if journalctl is
# installed. The dump itself is written later, by the option handling.
if [ ! -s "${CLUSTER_LOG}" ]; then
	if command -v journalctl >/dev/null 2>&1; then
		# mktemp creates the file atomically with an unpredictable name.
		if JCTL=$(mktemp "${TMPDIR:-/tmp}/journalctl.XXXXXX"); then
			CLUSTER_LOG=${JCTL}
			# Do not leave the dump behind, whatever way the script ends.
			trap 'rm -f -- "${JCTL}"' EXIT
		else
			JCTL=""
		fi
	fi
fi

# TODO: what error msgs. make sense?
# TODO: xfs_repair
# TODO function to check for unwanted blanks: grep "^\".*\ .*\"$" $EXE
#
# Please use dot "." instead of blanks " " in patterns!
# Built-in operating system error patterns: one regular expression per line.
# The list is only assigned while OSERR_PATTERN is still empty at this point.
if [ -z "${OSERR_PATTERN}" ]; then
	OSERR_PATTERN='
NMI.received
NMI.*error
dlm.*link.*down
ocfs2.*has.evicted
ocfs2.*ERR
ocfs2.*not.unmounted.cleanly
OCFS2.*kernel.interface.loaded
o2dlm_eviction_cb
fsck.*recommended
EXT.-fs.error
XFS.*Shutting.down
multipath.*path.removed
shutdown.*shutting.down
syslog-ng.shutting.down
syslog-ng.starting.up
kernel.*bootconsole.*enabled
systemd.*Stopping.Restore.*shutdown
oft[Dd]og.*not
'
fi

# TODO: verify cluster log msgs. maybe more generic possible?
# TODO: unmanaged FAILED
# TODO: event start, event end
# TODO: update config file from built-in pattern list
# Built-in cluster stack patterns (corosync, pacemaker, stonith, sbd): one
# regular expression per line. The list is only assigned while CLUSTR_PATTERN
# is still empty at this point.
# The parentheses in the "Leave" patterns are meant literally; the patterns are
# handed to grep without -E, where an unescaped parenthesis matches itself.
if [ -z "${CLUSTR_PATTERN}" ]; then
	CLUSTR_PATTERN='
CLM.*Members.Left:
TOTEM.*failed.*receive
TOTEM.*processor.failed
TOTEM.*Incrementing.problem
TOTEM.*Retransmit.List
pcmk.*Invalid.destination
pcmk.*Assertion.failure
pcmk.*Child.process.*exited
pcmk.*Sending.message.*failed
stonith.*Device.*not.found
stonith.*Found.[01].matching.devices
stonith.*failed
stonith.*timed.out
error.executing.ipmitool
pengine.*fenced.because.*.un-expectedly.down
pengine.*Node.*unclean
pengine.*fenced.*resource.failure
pengine.*Forcing.*away.*failures
pengine.*LogActions:.Start
pengine.*LogActions:.Stop
pengine.*LogActions:.Recover
pengine.*LogActions:.Restart
pengine.*LogActions:.Monitor
pengine.*LogActions:.Move
pengine.*LogActions:.Leave.*(Started)
pengine.*LogActions:.Leave.*(Stopped)
pengine.*LogActions:.Leave.*(Started.unmanaged)
pengine.*LogActions:.Leave.*(Stopped.unmanaged)
cib.*failed
cib.*wanted.to.shut.down
High.CIB.load
crm_resource.*Error
crmd.*quorum.lost
crmd.*our.DC.is.dead
crmd.*Resource.*active.at.shutdown
crmd.*Action.*not.supported
crmd.*State.transition.S_IDLE
crmd.*State.transition.S_POLICY_ENGINE
crmd.*State.transition.S_TRANSITION_ENGINE
crmd.*State.transition.S_STARTING
crmd.*State.transition.S_PENDING
crmd.*State.transition.S_ELECTION
crmd.*State.transition.S_INTEGRATION
crmd.*State.transition.S_FINALIZE_JOIN
crmd.*State.transition.S_NOT_DC
crmd.*ERROR:
crmd.*Updating.failcount.*stop
crmd.*Updating.failcount.*start
crmd.*cluster.nodes.*eligible
crmd.*High.CPU.load
lrmd.*probe
lrmd.*RA.output.*failed
lrmd.*RA.output.*error
sbd.*Initializing
sbd.*Monitoring
sbd.*Using.watchdog
sbd.*Set.watchdog.timeout
sbd.*Writing.*to
sbd.*successfully.delivered
sbd.*Received.command
sbd.*mbox.read.failed
sbd.*Latency
sbd.*terminated
sbd.*Rebooting.system
stonith-ng.*Failed
'
fi

# TODO: SAPInstance, SAPDatabase patterns, See /etc/ClusterTools2/cs_show_cluster_patterns
# Built-in resource agent patterns: one regular expression per line. The list
# is only assigned while RESRC_PATTERN is still empty at this point.
if [ -z "${RESRC_PATTERN}" ]; then
	RESRC_PATTERN='
Volume.group.*error
LVM.*not.*correctly
CTDB.*not.start
Timeout.*CTDB
not.in.Secondary.mode
demote.*still.primary
Clone.options.misconfigured
ERS_InstanceName.parameter.is.mandatory
Cannot.find.sapstartsrv
sapstartsrv.*for.instance
Expected.*instance.*profile
Expected.DIR_PROFILE.parameter
SAP.*Unauthorized
SAPHana.*Secure.store.users
SAPHana.*RA.*rc=[1-7,9]
SAPHana.*ERR:
SAPHana.*systemd.service.*not.active
HANA_CALL.timed.out
SAPHanaTopology.*failed
eDirectory.*failed.*stop
eDirectory.*not.*configured
eDir.configuration.error
ndsd.*no.*socket
eDirectory.isn.*running
eDirectory.configuration.not
Couldn.*find.device
Couldn.*fsck
Couldn.*mount
DANGER.*NOT.cluster-aware
'
fi

# Names of the variables that writecfg() prints, in output order
# (newline-separated, with a leading and a trailing newline).
CFGVAR=$'\nCLUSTER_LOG\nOSERR_PATTERN\nCLUSTR_PATTERN\nRESRC_PATTERN\n'


# Print the usage summary to stdout: the base name of the script (taken from
# EXE) followed by the list of supported long options.
function help(){
	printf '\t%s OPTION\n' "$(basename "$EXE")"
	printf '\n'
	printf ' --help\t\tshow help.\n'
	printf ' --version\tshow version.\n'
	printf ' --writecfg\tshow some internal variables.\n'
	printf ' --show\t\tshow all patterns.\n'
	printf ' --fencing\tshow fencing patterns.\n'
	printf ' --migration\tshow migration patterns.\n'
	printf '\n'
	printf ' --count\tcount total for each pattern.\n'
	printf ' --zip\t\tcount from compressed logs, too.\n'
}


# Print the current settings in config file syntax to stdout.
# Globals: CFG (read, named in the header line), CFGVAR (read, list of
#          variable names); every variable named in CFGVAR is read.
# Output:  a three line header, then for each variable
#            NAME="
#            one whitespace-separated word of its value per line
#            "
#            #
function writecfg(){
	local name
	echo -e "# $CFG \n# For SLES12 and SLES15.\n#"
	for name in $CFGVAR; do
		printf '%s="\n' "${name}"
		# The value is split into words on purpose, but the words are regular
		# expressions and must never be expanded as file name patterns.
		# "set -f" is confined to a subshell to keep the caller unaffected.
		# shellcheck disable=SC2086
		( set -f; printf '%s\n' ${!name} )
		printf '"\n#\n'
	done
}


# Print every log line that matches one of the patterns, sorted by the first
# three fields (month-aware sort).
# Globals: LOG (read, blank-separated list of log files),
#          OSERR_PATTERN, CLUSTR_PATTERN, RESRC_PATTERN (read).
# Output:  matching lines on stdout, the list of logs on stderr. A line that
#          matches several patterns is printed once per pattern.
function run_grep_show(){
	local pattern logfile
	local -a patterns=()
	echo "logs: $LOG" >/dev/stderr

	# Split the pattern lists on whitespace WITHOUT pathname expansion: the
	# patterns contain "*" and "[...]" and must reach grep unchanged, whatever
	# files happen to exist in the current directory.
	# read returns non-zero at end of input; the array is filled nevertheless.
	read -r -d '' -a patterns \
		<<<"${OSERR_PATTERN} ${CLUSTR_PATTERN} ${RESRC_PATTERN}" || true

	# TODO: more efficient loop
	for pattern in "${patterns[@]}"; do
		# shellcheck disable=SC2086
		for logfile in ${LOG}; do
			test -r "$logfile" && cat "$logfile"
		done | grep -i -- "$pattern"
	done | sort -M -k1,3
	# TODO: sort by timestamp OK ? "Sep 27 17:04:11 "
	# TODO: sort error msgs.?
}


# Print, for every pattern, the number of matching log lines.
# Globals: LOG (read, blank-separated list of log files),
#          OSERR_PATTERN, CLUSTR_PATTERN, RESRC_PATTERN (read).
# Output:  one line "PATTERN = COUNT" per pattern on stdout, the list of logs
#          on stderr.
# NOTE(review): the files are concatenated and piped into zgrep, so compressed
# logs are only counted if zgrep recognizes their format on stdin - confirm
# for the bz2 files collected by --zip.
function run_grep_count(){
	local pattern logfile
	local -a patterns=()
	echo "logs: $LOG" >/dev/stderr

	# Split the pattern lists on whitespace WITHOUT pathname expansion: the
	# patterns contain "*" and "[...]" and must reach zgrep unchanged, whatever
	# files happen to exist in the current directory.
	# read returns non-zero at end of input; the array is filled nevertheless.
	read -r -d '' -a patterns \
		<<<"${OSERR_PATTERN} ${CLUSTR_PATTERN} ${RESRC_PATTERN}" || true

	# TODO: more efficient loop
	for pattern in "${patterns[@]}"; do
		printf '%s = ' "$pattern"
		# shellcheck disable=SC2086
		for logfile in ${LOG}; do
			test -r "$logfile" && cat "$logfile"
		done | zgrep -ic "$pattern"
	done
}


# TODO: resources move success, fail
#Sep 26 16:57:22 ext2300s pengine: [1079]: notice: LogActions: Move    rsc_fs_NFS_host     (Started ext2301s -> ext2300s)
# Print the resource move actions ("pengine ... LogActions: Move") found in the
# readable files listed in LOG, after sorting all lines by their first three
# fields (month-aware sort). The list of logs goes to stderr.
function awk_migration(){
	local logfile
	echo "logs: $LOG" >/dev/stderr
	{
		for logfile in ${LOG}; do
			if test -r "$logfile"; then
				cat "$logfile"
			fi
		done
	} | sort -M -k1,3 | awk '/pengine.*LogActions:.Move/'
}


# TODO: nodes fence, leave, join
#Sep 27 16:07:14 ext2300s pengine: [1079]: WARN: pe_fence_node: Node ext2301s will be fenced because it is un-expectedly down
#Sep 27 16:08:15 ext2300s stonith: [18720]: CRIT: external_run_cmd: Calling '/usr/lib64/stonith/plugins/external/sbd reset ext2301s' returned 
# TODO: possibly also match "stonith-ng:.*succe"
# Print the fencing related lines (pengine announcing that a node gets fenced,
# sbd reset messages) found in the readable files listed in LOG, after sorting
# all lines by their first three fields (month-aware sort). The list of logs
# goes to stderr.
function awk_fencing(){
	local logfile
	echo "logs: $LOG" >/dev/stderr
	{
		for logfile in ${LOG}; do
			if test -r "$logfile"; then
				cat "$logfile"
			fi
		done
	} | sort -M -k1,3 | grep -e "pengine:.*Node.*fenced" -e "sbd:.*reset"
}

# TODO: reading from pipe

# main()
# Dispatch on the first command line argument. Every branch ends the script;
# anything unknown (or no argument at all) prints the help text.

# Write the whole journal to the file named by JCTL and make sure that file is
# removed again when the script exits. Does nothing while JCTL is empty, i.e.
# when a regular log file is used.
function dump_journal(){
	test -n "${JCTL}" || return 0
	trap 'rm -f -- "${JCTL}"' EXIT
	journalctl --no-pager >"${JCTL}"
}

# Append CLUSTER_LOG to the blank-separated list in LOG, provided the file
# exists and is not empty.
function add_cluster_log(){
	test -n "${CLUSTER_LOG}" && test -s "${CLUSTER_LOG}" &&
		LOG="${LOG} ${CLUSTER_LOG}"
}

# NOTE(review): only --zip resets LOG. The other modes append to whatever LOG
# already holds (e.g. from the environment or the config file) - confirm that
# this is intended.
case "$1" in
	-v|--version)
		echo -n "$(basename "$EXE") "
		# The version line sits in the header comment of this script.
		head -n 11 "$EXE" | grep "^# Version: "
		exit
	;;
	-w|--writecfg)
		writecfg
		exit
	;;
	-z|--zip)
		dump_journal
		LOG=""
		# ZIPPED_LOG holds a glob; it is expanded here on purpose.
		for z in ${ZIPPED_LOG}; do
			test -s "$z" && LOG="${LOG} ${z}"
		done
		# unzipped log has to be last in loop :-/
		add_cluster_log
		run_grep_count
		exit
	;;
	-c|--count)
		dump_journal
		add_cluster_log
		run_grep_count
		exit
	;;
	-f|--fencing)
		dump_journal
		add_cluster_log
		awk_fencing
		exit
	;;
	-m|--migration)
		dump_journal
		add_cluster_log
		awk_migration
		exit
	;;
	-s|--show)
		dump_journal
		add_cluster_log
		run_grep_show
		exit
	;;
	*)
		help
		exit
	;;
esac
#
