#!/bin/bash
#
# cs_show_error_patterns
#
# (c) 2011-2017 SUSE Linux GmbH, Germany.
# (c) 2018-2022 SUSE LLC
# Author: L.Pinne.
# GNU General Public License v2. No warranty.
# http://www.gnu.org/licenses/gpl.html
#
# Version: SLES12 0.2 2022-07-12
#
# shellcheck disable=SC1090,SC2140,SC2125,SC2037,SC2211
# SC2140,SC2125,SC2037 and SC2211 are needed because of the ongoing
# pattern declaration discussion.
#

EXE="$0"

# Optional site configuration. It is sourced first, so any variable it
# sets (see CFGVAR) overrides the built-in defaults further down.
CFG="/etc/ClusterTools2/cs_show_error_patterns"
test -s "$CFG" && source "$CFG"

# Scratch files for the dmesg and journalctl dumps written by run_grep.
# They are created with mktemp so that the names are not predictable;
# the legacy /tmp/<name>.$RANDOM form is only a fallback if mktemp fails.
RDM=$RANDOM
DMSG=$(mktemp /tmp/dmesg.XXXXXX 2>/dev/null) || DMSG="/tmp/dmesg.$RDM"
JCTL=""

# Names of the variables dumped by --writecfg.
CFGVAR="
ERROR_PATTERN
SVCERR_PATTERN
CLUERR_PATTERN
APPERR_PATTERN
CLUSTER_LOG
SYSTEM_LOG
ZIPPED_LOG
"

# Default list of system logs; files that are not readable are skipped
# later by run_grep.
# NOTE: a former journalctl-specific default was unreachable (SYSTEM_LOG is
# always set at that point) and only listed a subset of the files below,
# so it has been dropped.
test -z "${SYSTEM_LOG}" &&\
	 SYSTEM_LOG="/var/log/messages /var/log/boot.msg /var/log/boot.omsg /var/spool/mail/root /proc/mdstat /proc/net/bonding/bond0 /var/log/mcelog"

# TODO SLES12 bond ...?
# Only reserve a journal dump file when journalctl exists. The lookup must
# stay silent: its stdout would otherwise end up in --writecfg output.
if command -v journalctl >/dev/null 2>&1; then
	JCTL=$(mktemp /tmp/journalctl.XXXXXX 2>/dev/null) || JCTL="/tmp/journalctl.$RDM"
fi

# Remove the scratch files on every exit path (--help, --version, errors).
trap 'rm -f -- "$DMSG" ${JCTL:+"$JCTL"}' EXIT

SYSTEM_LOG="${SYSTEM_LOG} ${DMSG} ${JCTL}"

# Print the log file path(s) configured with "logfile:" in a corosync.conf.
# Arguments: $1 - path of the corosync configuration file.
# Outputs:   the text after "logfile:" of every non-commented line that
#            contains a path, with blanks and tabs removed, one per line.
function cs_corosync_logfile() {
	grep "^[^#]*logfile:.*/" "$1" |\
		tr -d '[:blank:]' | cut -d":" -f2
}

# Derive the cluster log from corosync.conf unless the configuration file
# already set CLUSTER_LOG. The previous check used 'test -z' on the literal
# path, which is never true, so this detection never ran.
if test -z "${CLUSTER_LOG}" && test -r /etc/corosync/corosync.conf; then
	CLUSTER_LOG=$(cs_corosync_logfile /etc/corosync/corosync.conf)
fi

# Glob(s) for rotated, bzip2-compressed logs; expanded when --zip is used.
test -z "${ZIPPED_LOG}" &&\
	ZIPPED_LOG="/var/log/messages*bz2"
test -n "${CLUSTER_LOG}" &&\
	ZIPPED_LOG="${ZIPPED_LOG} $(echo "${CLUSTER_LOG}"|tr -d \\n)*bz2"

# TODO add more HBA errors: hpsa cciss
# TODO function to check for unwanted blanks: grep "^\".*\ .*\"$" $EXE
#
# Please use dot "." instead of blanks " " in patterns!
# Default kernel, hardware, storage and network error patterns.
# One basic regular expression per line; run_grep counts the
# case-insensitive matches of each one.
#
# The list is a single quoted string so that every character reaches grep
# unchanged. The former alternating-quote layout left the patterns
# unquoted, which turned the md-RAID pattern U_\|_U into the literal
# U_|_U, so degraded arrays were never reported. Restoring the backslash
# alone would match "cpu_", "rcu_" and the like under grep -i, so the
# pattern is now anchored on the /proc/mdstat status brackets, e.g. [U_]
# or [_U].
if [ -z "${ERROR_PATTERN}" ]; then
	ERROR_PATTERN='
taint.flag
signature.*missing
no.device.*found
be2net.*Error
bnx2x.*failed
mlx4_en.*error
mlx5_core.*failed
bfa.*err
lpfc.*err
lpfc.*failed
lpfc.*Not.supported
ibmvfc.*failed
ixgbe.*rror
qla.*err
qla.*failed
qla.Unable
qlge.*error
hpilo.*could.not
Ramping.down.queue
hfcldd.*Err
unsupported.SFP.*module
Unable.*SFP
IO_PAGE_FAULT
Power.key.pressed
NMI.received
NMI.*error
irq.*nobody.cared
kthread.starved.for.*jiffies
urandom.warning
Signaling.PME.*interrupt
Error.DRAM
nvme.*error
Uncorrectable.*rror
error.detected.during.POST
mce:.CPU.supports.0.MCE.banks
CPU.bug.present
Processor.*temperature
SMP.disabled
No.hypervisor.support.for.SR-IOV
Firmware.Bug
BIOS.bug
Broken.BIOS
BMC.*not
kernel.*task.*abort
kernel.*ACPI.Error
kernel.*Hardware.Error
kernel.*page.allocation.failure
kernel.*failed.to.assign
kernel.*medium.error
kernel.*mmu.*suppressed
kernel.*supported.only
kernel.*pci.*failed
kernel.*lost.*interrupt
kernel.*not.have.kernel.memory.protection
kernel.*clocksource:.*witched
kernel.*NUMA.turned.off
kernel.Lockdown*
kernel.*Duplicate.cookie
kernel.*NFS.*failed
BUG:.*workqueue.lockup
Section.*circular.dependency
double.fault:
target.failure
reservation.conflict
SCSI.*reset
LUN.larger
I/O.error
Broken.pipe
Dropped.frame
disabling.*barrier
doesn.*support.*DPO.*FUA
BBU.disabled
Warning.*LUN.assignments.*changed
[Ff]ail.*/dev/md
\[[U_]*_[U_]*]
found.0.matching.devices
/dev/md.*failed
duplicate.VG
duplicate.PV
not.found
mount.over.is.not.empty
fsck.*recommended
EXT.-fs.error
BTRFS.error
Starting.XFS.recovery
xfs_log_force.*error
XFS.*allocation.deadlock
XFS.*head.behind.tail
sdrServ:.*err
server.*not.responding
iSCSI.connection.*error
Sense.*error
bond.*without.any.active.interface
bond.*link.*down
bond.*duplicate.address
bond.*not.detect.*failures
bond.*permanent.HWaddr.*use
bond.*No.*response
link.*not.ready
ink..ailure
ERROR.*egress
not.*offload
RPC.*multiple.fragments
martian.source.*dev
unexpectedly.shrunk.window
TCP:.Possible.SYN.flooding.on.port
net_ratelimit:
Host.*is.not.allowed.to.talk.to.us
COMPLAIN.ACCEPT.*MAC
POSSIBLE.BREAK-IN.ATTEMPT
error:.PAM:.User.not.known
conversation.failed
refused.mount.request
nfs..server.*not.responding
Failed.to.read
exits.with.status.*[1-9]
cgroup:.*rejected
shutdown.*shutting.down
systemd.*Reached.target.Shutdown
dbus-daemon.*ailed
kernel.*segfault.at
Out.of.memory
invoked.oom-killer
Call.Trace
soft.lockup
kernel.*deprecated
annot.open.*file
nable.to.open.*file
nable.to.load
Potential.spurious.wakeup
annot.create..ocket
end.*failed
ead.*failed
pagecache.limit.set
'
fi

# Default patterns for failing system services (systemd, multipathd,
# time synchronisation, mail, ...). One basic regular expression per
# line; only applied when the configuration file did not set the variable.
if [ -z "${SVCERR_PATTERN}" ]; then
	SVCERR_PATTERN='
systemd.*Received.SIGINT
systemd.*start-limit
systemd.*nknown
systemd.*Proceeding.anyway
systemd.*No.such.file
systemd.*ailed
systemd.*dumped.core
wickedd.*ouldn
SMART.*failure
multipathd.*timed.out
multipathd.*mpath.*unable
multipathd.*reinstated
multipathd.*too.much
multi.*path.*down
lvmetad.*failed
fstrim:.*FITRIM.ioctl.failed:
wicked.*failed
ntpd.*failed
ntpd.*time.reset
ntpd.TIME_ERROR
chronyd.*clock.wrong
chronyd.*time.jump
chronyd.*no.selectable.sources
time.sync.status.changed
Frequency.format.error
failed.to.bind.to.LDAP
logrotate.*error
Failed.services.*runlevel
syslog-ng.shutting.down
postfix.*fatal
cgroup:.fork.rejected
sfcbd.*Fatal
libvirt.error.code
'
fi

# TODO raid.set.*active.with.2.out.of.2.mirrors ?
# TODO check for un-matched userfriendly names (mpath[a-z])
# TODO pattern for failed module (could not load)
# TODO async IO from grep kio /proc/slabinfo ?
# TODO direct IO from grep ...,DIR, /proc/slabinfo ?
# TODO verify cluster log msgs. maybe more generic possible ?

# Default patterns for cluster stack errors (corosync/TOTEM, pacemaker,
# stonith, sbd, dlm, ocfs2). One basic regular expression per line; only
# applied when the configuration file did not set the variable.
#
# Two entries were misspelled ("Falied", "not.fond") and therefore could
# not match; they now read "Failed" and "not.found".
if [ -z "${CLUERR_PATTERN}" ]; then
	CLUERR_PATTERN='
TOTEM.*Failed.*receive
TOTEM.*processor.failed
TOTEM.*Incrementing.problem
TOTEM.*Retransmit.List
corosync.*not.scheduled
quorum.lost
fenced.because.*.un-expectedly.down
stonith.*warning
stonith.*Device.*not.found
Node.*unclean
pcmk.*Child.process.*exited
pcmk.*message.*local.cib.failed
pengine.*fenced.*resource.failure
pengine.*Forcing.*away.*failures
High.CIB.load
crm_resource.*Error
crmd.*ERROR:
crmd.*Updating.failcount.*stop
crmd.*Updating.failcount.*start
crmd.*High.CPU.load
sbd.*mbox.read.failed
sbd.*Latency
sbd.*Header.*changed
oft.og.*not
dlm.*link.*down
ocfs2.*has.evicted
ocfs2.*ERR
ocfs2.*not.unmounted.cleanly
OCFS2.*kernel.interface.loaded
o2dlm_eviction_cb
'
fi

# Default patterns for application and third-party product errors.
# One basic regular expression per line; only applied when the
# configuration file did not set the variable.
if [ -z "${APPERR_PATTERN}" ]; then
	APPERR_PATTERN='
Serviceguard.daemon.*exit
GAB.*failure
Symantec.Technical.Support
error.*VRT
VCS.CRITICAL
vxvm.*volume.*not
mfeaack.*loading.module
Hangcheck.*clock
lin_tape.*failed
lin-tape.*device.reset
vasd.*hutting.down
vasd.will.not.synchronize.time
vascache.*error
sapstartsrv.*pam_systemd.*Failed
Printing:.*Problem
Basis.System:.*error
Basis.System:.*canceled
Basis.System:.*SYSFAIL
Basis.System:.*lost
bluestore.*FAILED
ceph.*Failed
ceph.*_fail
ceph.*ERROR
tclink.*ERROR
tclink.*error
salt.*ERROR
discagent.*Error
cmclconfd.*does.not
'
fi

# Print the usage summary to stdout.
# Globals: EXE (read) - path of this script; only its basename is shown.
function help() {
	local prog
	prog=$(basename "$EXE")

	printf 'usage:\t%s\n' "$prog"
	printf '\t%s [OPTION]\n' "$prog"
	printf '\n'
	printf ' --help\t\tshow help.\n'
	printf ' --version\tshow version.\n'
	printf ' --writecfg\tshow some internal variables.\n'
	printf ' --zip\t\tsearch compressed logs, too.\n'
}


# Print the current settings in the format of the configuration file.
# Globals: CFG (read)    - configuration file path, shown in the header.
#          CFGVAR (read) - blank-separated names of the variables to dump.
# Outputs: for every variable NAME a block
#              NAME="
#              <one word of the value per line>
#              "
#              #
function writecfg(){
	local c
	local -a words

	printf '# %s \n# For SLES12 and SLES15.\n#\n' "$CFG"
	for c in $CFGVAR; do
		printf '%s="\n' "$c"
		# Split on whitespace WITHOUT pathname expansion: values such as
		# /var/log/messages*bz2 or kernel.Lockdown* must be written as
		# they are, not replaced by whatever files happen to match.
		read -r -d '' -a words <<<"${!c}"
		# An empty value still yields one empty line.
		printf '%s\n' "${words[@]}"
		printf '"\n#\n'
	done
}


# Write one log file to stdout, decompressing it if its name says so.
# Arguments: $1 - path of a readable log file.
function cs_cat_log() {
	case "$1" in
		*bz2)	bzip2 -cd -- "$1" ;;
		*.gz)	gzip -cd -- "$1" ;;
		*.xz)	xz -cd -- "$1" ;;
		*)	cat -- "$1" ;;
	esac
}

# Count the matches of every configured pattern in all selected logs.
# Globals: LOG (read)        - blank-separated list of files, globs allowed.
#          DMSG, JCTL (read) - scratch files for the dmesg and journalctl
#                              dumps; written here and removed at the end.
#          ERROR_PATTERN, SVCERR_PATTERN, CLUERR_PATTERN, APPERR_PATTERN
#                            - whitespace-separated regular expressions.
# Outputs: "logs: <LOG>" on stderr, then one "<pattern> = <count>" line per
#          pattern on stdout (case-insensitive count of matching lines).
function run_grep() {
	local e f
	local -a patterns

	printf 'logs: %s\n\n' "$LOG" >&2

	dmesg >"$DMSG"
	# Keep the lookup silent; its stdout would otherwise mix with the report.
	if [ -n "$JCTL" ] && command -v journalctl >/dev/null 2>&1; then
		journalctl --no-pager >"$JCTL"
	fi

	# Split the pattern lists into words without pathname expansion, so
	# that a pattern like kernel.Lockdown* is never replaced by the name
	# of a file in the current directory.
	read -r -d '' -a patterns \
		<<<"${ERROR_PATTERN} ${SVCERR_PATTERN} ${CLUERR_PATTERN} ${APPERR_PATTERN}"

	# TODO more efficient loop
	for e in "${patterns[@]}"; do
		printf '%s = ' "$e"
		# LOG is deliberately unquoted: it is a list and may hold globs.
		# Compressed logs are unpacked per file; feeding them raw into one
		# stream left bzip2 data undecoded.
		# shellcheck disable=SC2086
		for f in ${LOG}; do
			test -r "$f" && cs_cat_log "$f"
		done | grep -ic -e "$e"
	done

	rm -f -- "$DMSG" ${JCTL:+"$JCTL"}
}


# main()
# Informational options are answered first and end the script.
case "$1" in
	-h|--help)
		help
		exit
	;;
	-v|--version)
		printf '%s ' "$(basename "$EXE")"
		head -n 11 "$EXE" | grep "^# Version: "
		exit
	;;
	-w|--writecfg)
		writecfg
		exit
	;;
esac

# Any other invocation scans the logs: the system logs always, the
# cluster log when it is configured and not empty.
LOG="${SYSTEM_LOG}"
if [ -n "${CLUSTER_LOG}" ] && [ -s "${CLUSTER_LOG}" ]; then
	LOG="${LOG} ${CLUSTER_LOG}"
fi

# With -z/--zip the non-empty files matching the ZIPPED_LOG globs are
# added; the unquoted expansion is what expands those globs.
if [[ $1 == -z || $1 == --zip ]]; then
	for z in ${ZIPPED_LOG}; do
		if [ -s "$z" ]; then
			LOG="${LOG} ${z}"
		fi
	done
fi

run_grep
exit
#
