#!/bin/bash
#
# cs_clusterstate 
#
# (c) 2011-2017 SUSE Linux GmbH, Germany.
# (c) 2018-2020 SUSE LLC
# Author: L.Pinne.
# GNU General Public License v2. No warranty.
# http://www.gnu.org/licenses/gpl.html
#
# Version: 2020-07-28 SLES12
#
# shellcheck disable=SC1090,SC2126

EXE="$0"

CFG="/etc/ClusterTools2/cs_clusterstate"
# Read site-specific settings when the config file exists and is not empty.
if [ -s "$CFG" ]; then
	source "$CFG"
fi

# Everything below is a fallback only: a non-empty value that is already
# set (environment or $CFG) is left untouched.

# interfaces looked at by get_network
if [ -z "${ETH}" ]; then
	ETH="bond0 eth3"
fi

# sysconfig file the SBD device list is read from
if [ -z "${SCF}" ]; then
	SCF="/etc/sysconfig/sbd"
fi

# SBD devices: value of SBD_DEVICE in $SCF with the double quotes removed
# and every ";" turned into a newline
if [ -z "${SBD}" ] && [ -s "$SCF" ]; then
	SBD=$(awk -F"=" '$1=="SBD_DEVICE" {print $2}' "$SCF" | tr -d "\"" | tr ";" "\n")
fi

# process names counted by get_basics
# for SLE 11, 12 see $CFG
# TODO double-check SLE 15
if [ -z "${CLPROC}" ]; then
	CLPROC="
pacemaker-based
pacemaker-fenced
pacemaker-execd
pacemaker-attrd
pacemaker-schedulerd
pacemaker-controld
pacemakerd
corosync
sbd
"
fi

# host name and timestamp used in the report banner
HST=$(hostname -f)
DAT=$(date +"%Y-%m-%d %T")


# Print the usage text (program name taken from $0) and the option list.
function help() {
	local me
	me=$(basename "$0")
	# the format is re-used for each argument: one usage line per form
	printf 'usage:    %s\n' "$me" "$me OPTION"
	printf '\nOPTION:\n'
	# %b expands the \t escapes, so the columns line up as tab stops
	printf ' --%b\n' \
		'help\t\tshow help' \
		'all\t\tshow all info' \
		'basics\tshow basic info' \
		'idle\t\tshow idle info' \
		'network\tshow network info' \
		'storage\tshow storage info' \
		'replication\tshow SAPHanaSR info' \
		'version\tshow version info'
}

# Run "crmadmin --quiet" with the given arguments and print its answer.
# Arguments: passed on to crmadmin unchanged
# Outputs:   the --quiet result on stdout
# Returns:   crmadmin's exit status if a call fails, otherwise 0
function crmadmin_quiet() {
	local answer rc

	# Note: crmadmin in Pacemaker versions before 2.1.0 (released on
	# 8 Jun 2021) printed the --quiet result on stderr, so the first
	# attempt keeps stderr and throws stdout away.
	answer=$(crmadmin --quiet "$@" 2>&1 >/dev/null)
	rc=$?
	if [ "$rc" -ne 0 ]; then
		return "$rc"
	fi

	# Nothing on stderr: ask again, this time keeping stdout.
	if [ -z "$answer" ]; then
		answer=$(crmadmin --quiet "$@" 2>/dev/null)
		rc=$?
		if [ "$rc" -ne 0 ]; then
			return "$rc"
		fi
	fi

	echo "$answer"
}

# Ask crmadmin for the designated controller (DC) node, then for that
# node's status, and print it as "Cluster state: <status>".
# Prints "Cluster state: unknown" when either query fails.
function get_idle() {
	local dc_node dc_state
	local shown="unknown"

	if dc_node=$(crmadmin_quiet --dc_lookup); then
		if dc_state=$(crmadmin_quiet "--status=$dc_node"); then
			shown=$dc_state
		fi
	fi
	echo "Cluster state: $shown"
}


# Print the basic cluster overview: DC/online/offline/unclean lines from
# crm_mon, the DC state (get_idle), resource counters, the number of
# left-over "cli-" constraints, the number of HA processes found, the
# corosync ring status and the slot list of every SBD device.
# Globals: CLPROC (read) - process names to look for
#          SBD (read)    - SBD devices to list
function get_basics() {
	local mon procs proc dev
	local running=0

	# One crm_mon snapshot feeds all three views below, so the node list
	# and the resource counters always describe the same cluster state.
	mon=$(crm_mon -r1)

	awk '$1=="Current"||$1=="Online:"||$1=="OFFLINE:"||$3=="UNCLEAN"{print $0}' <<<"$mon"
	# TODO show [standby] as well
	# TODO efficient awk
	get_idle
	echo -n "Stopped/FAILED resources: "
	awk '$3=="Stopped"||$0~/FAILED/{n++} END{print n+0}' <<<"$mon"
	# TODO count clone and multistate as well
	echo -n "Started primitive resources: "
	awk '$3=="Started"{n++} END{print n+0}' <<<"$mon"

	echo -n "Left-over constraints: "
	crm configure show | grep -c "cli-"
	echo

	# 11th column of "ps aux" is the command; count how many of the
	# names in CLPROC occur in it (each name counts once, substring match)
	procs=$(ps aux | awk '{print $11}')
	for proc in $CLPROC; do
		if grep -q -- "$proc" <<<"$procs"; then
			running=$((running + 1))
		fi
	done
	echo "HA processes: $running"
	echo
	echo "TOTEM: "
	# keep each "status = ..." line plus the line before it, drop the
	# "--" group separators grep -B inserts
	corosync-cfgtool -s | grep -B1 "status.*=" | grep -v "^--$"
	echo
	echo "SBDs:"
	# TODO crm_mon -r -1 | grep "stonith:external/sbd.:.Started"
	for dev in $SBD; do
		echo "${dev}"
		sbd -d "${dev}" list 2>/dev/null
	done
	echo
}


# Print link information for the interfaces named in ETH (plus the MII
# status lines for bonds) and the number and list of IPv4 addresses.
# Globals: ETH (read) - whitespace separated interface names
function get_network() {
	local -a ifaces=()
	local ifc addrs

	echo
	# limit to 16 ETHs
	# NOTE(review): this comment used to say 64 while the code cut the
	# list at 16 entries; the code's limit is kept - confirm the intent.
	# Splitting into an array applies the limit no matter whether the
	# names are separated by blanks, tabs or newlines.
	read -r -d '' -a ifaces <<<"$ETH"
	for ifc in "${ifaces[@]:0:16}"; do
		# NOTE(review): expects a line containing "state" in the output
		# of "ip -o a s" - confirm for the iproute2 version in use
		ip -o a s "$ifc" | awk '$0~/state/{print $2,$3}'
		case $ifc in
		bond?)
			# each "MII Status" line with the line before it,
			# without the "--" separators grep -B inserts
			grep -B1 "MII.Status" /proc/net/bonding/"$ifc" | grep -v "^--$"
			# TODO ethtool
		;;
		eth?)
			# TODO ethtool
		;;
		esac
	done
	echo
	# TODO network
	#ip a s | grep "^[0-9].*:.\<.*\>"
	echo -n "IPs: "
	# one "ip a s" call, so the count and the list cannot disagree
	addrs=$(ip a s)
	grep -c "inet.*[0-9]\..*\..*\..*[0-9]/" <<<"$addrs"
	awk '$0~/inet.*[0-9]\..*\..*\..*[0-9]\// {print "    ",$2,$NF}' <<<"$addrs"
	echo
}


# Print storage counters: multipath maps by WWID prefix, MD RAID state
# from /proc/mdstat, the "vgs" listing and the number of /dev file systems
# mounted read-write.
function get_storage() {
	local mp_out

	# TODO check for WWIDs 3600...=SAN/SCSI, 1494...=iSCSI
	# TODO configurable pattern for SAN LUNs (f.e "HP,OPEN-V", "HP,HSV"), local LUNs (f.e. "HP,LOGICAL.VOLUME"), iSCSI LUNs (f.e. "IET,VIRTUAL-DISK")
	# echo -n "total LUNs: "
	#	 multipath -l | grep -c "^[0-9].*dm-[0-9]"
	# TODO check for un-matched userfriendly names (mpath[a-z])

	echo
	# one multipath call serves both counters
	mp_out=$(multipath -l)
	# The WWID is either the first thing on the map line or, when the map
	# has an alias, follows an opening parenthesis. "(^|\()" covers both;
	# a bracket expression such as "[\^(]" would only match the literal
	# characters backslash, caret and "(" and miss maps without alias.
	echo -n "SCSI LUNs: "
	grep -Ec "(^|\()3600.*dm-" <<<"$mp_out"
	echo -n "iSCSI LUNs: "
	grep -Ec "(^|\()1494.*dm-" <<<"$mp_out"
	echo

	# TODO no grep-grep
	#grep -v "Personalities.:" /proc/mdstat | grep -v "bitmap.*chunk$" | tr -s "\n"
	echo -n "MDs total: "
	grep -c "^md[0-9].*:" /proc/mdstat
	# complete = status field at end of line holds only "U" flags, for
	# any number of member devices ([U], [UU], [UUU], ...)
	echo -n "MDs complete: "
	grep -Ec "\[U+\]$" /proc/mdstat
	# TODO MDs syncing / in-sync
	#echo -n "MDs syncing: "
	#	grep -c <TODO> /proc/mdstat
	# split = a "_" flag next to a "U" flag
	echo -n "MDs split: "
	grep -c "U_\|_U" /proc/mdstat
	echo

	vgs
	echo
	echo -n "Filesystems rw: "
	grep -c "^/dev/.*rw" /proc/mounts
	echo
}


# Print SAPHanaSR information: number of "Masters:" and "Slaves:" lines in
# the crm_mon output, the node reported as Master for the promotable
# resource, and the SAPHanaSR-showAttr listing if that tool is installed.
function get_saphanasr() {
	local hana p_hana
	local master=""
	local showattr="/usr/sbin/SAPHanaSR-showAttr"

	# TODO error handling
	# TODO better filter
	# TODO call crm_mon once and re-use output
	# TODO awk instead of multiple grep
	# The filtered lines are kept as they are (newline separated); no
	# placeholder character is needed to carry them in a variable.
	hana=$(crm_mon -1 |
		grep -e "Clone.Set:.*_SAPHana" -e "Masters:.\[" -e "Slaves:.\[" -e "Started:.\[")
	echo -n "SAPHanaSR started masters: "
	grep -c "Masters:.\[" <<<"$hana"
	echo -n "SAPHanaSR started slaves: "
	grep -c "Slaves:.\[" <<<"$hana"

	p_hana=$(crm_resource list | awk '$5=="(promotable)" {print $4}' | tr -d "[]")
	# only ask for the master when a promotable resource was found
	if [ -n "$p_hana" ]; then
		master=$(crm_resource -W --master -r "$p_hana" | awk '$7=="Master" {print $6}')
	fi
	# the line is always terminated, also when no master is known, so the
	# next heading starts on a line of its own
	echo "SAPHanaSR master node: $master"
	# TODO started Topology clones
	#echo "SAPHanaSR resources:"
	#echo "${hana}"
	echo "SAPHanaSR attributes:"
	# run exactly the binary whose presence was checked
	if [ -x "$showattr" ]; then
		"$showattr" | tr -s "\n"
	fi
}


# main()

# Options that print no report banner are handled first.
case "$1" in
	-v|--version)
		# program name followed by the "# Version:" line of the file header
		echo -n "$(basename "$EXE") "
		head -11 "$EXE" | grep "^# Version: "
		exit
	;;
	-h|--help)
		help
		exit
	;;
esac

# Every report starts with the same banner: host name and time stamp.
echo "### $HST - $DAT ###"

# Select the report; no option or an unknown one gives the basic report.
case "$1" in
	-s|--storage)
		get_storage
	;;
	-n|--network)
		get_network
	;;
	-a|--all)
	# TODO get_processes (number, ram, load), get_groups ?
		get_basics
		get_network
		get_storage
		get_saphanasr
	;;
	-i|--idle)
		get_idle
	;;
	-r|--replication)
		get_saphanasr
		get_idle
	;;
	-b|--basics|*)
		get_basics
	;;
esac
# exit status is that of the last report function that ran
exit
#
