Sometime you have applications that can only run in one instance and you do not have clustering software or can run this in a Microservices environment with orchestration support with kubernetes.
Requirement definition;
- Application can only have one instant running at any point in time
- We must be able to handle a linux host failure
- We must be able to handle a data center failure
- We must be able to set the system in maintenance mode
Solution design
- We use a heartbeat software to store heartbeats of a running instance of the application
- We store heartbeats on a storage accessible by all hosts
- Heartbeat software start application if it shall run on this host
- If application is running on another host it will stop any local instances
- We have a way to define maintenance by storing a maintenance state on the same storage as we store heartbeats which is available to all hosts
Solution technical design with a shared disk
- We have two linux servers one in each data center (we have 2 data centers)
- We have a heartbeat process installed as a service (daemon) on two linux servers
- We have the application installed on both linux servers
- We use a shared disk for heartbeats and maintenance information
This is how it could look like before a failover
If the linux server Active goes down or the Data Center 1 goes down we want the process to be started in Data Center 2
Below is a HACMP shell process code that you install on both the active and the passive linux host
You configure script to understand how to start and stop your application
You point out where heartbeat directory is located, if you do not use a shared filesystem you need to update the code to reflect the change, but the simplest is to use NFS or a shared SAN disk
You can download the code from
https://github.com/maxbback/hacmp
#!/bin/bash
#D Instructions
#D make this program a deamon service
#D Add following to /etc/systemd/system/mydaemon.service
#D [Unit]
#D Description=Monitor application
#D
#D [Service]
#D ExecStart=/xxxx/heartbeat.sh
#D Restart=on-failure
#D
#D [Install]
#D WantedBy=multi-user.target
#
#
#D variable pgrepProgramName holds search string for the program to monitor
pgrepProgramName='sh.*exmapleApplication.sh.*'
#D variable startCmd contains information of how to start the program
startCmd='./exmapleApplication.sh'
startCmdLogFile='./startCmdLogFile.log'
#D variable maintenanceFile is the location of a maintenance file, which will make this program stop monitoring
maintenanceFile='./maintenance'
#D variable hostName contains information of the local hostname
hostName=`hostname`
#D variable heartBeatDir defines where HB files are stored, a HB file has the name of the host running the program
heartBeatDir='heartbeat'
#D variable faileOverTime defines how long time we wait for a heartbeat before we consider it dead
faileOverTime=10
#D variable loopTime defines how long each monitor sleep time should be, prefereable 1/3 of failOverTime
loopTime=3
#D function sendIncident notifies that we was soposed to be up but has died
function sendIncident
{
echo "####"
echo "#### ERROR, we sent last heartbeat but is not alive"
echo "####"
}
#D function isAlive test if monitored program is running
function isAlive
{
#D Check if at least one process is running
pgrep -f "$pgrepProgramName"
return $?
}
# function hasHeartBeat takes a hostname and checks if that host is alive av run the monitored program
function hasHeartBeat
{
#First argument is the host to monitor
local hbHostName="$1"
if [ -z "$hbHostName" ]
then
#echo hasHeartBeat argument missing
return 1
fi
if [ ! -f $heartBeatDir/$hbHostName ]
then
#echo hasHeartBeat HB file not found $heartBeatDir/$hbHostName
return 1
fi
# stat checks the status of a specific file
# eval sets variables for each result from stat, st_mtime contains last modified time
eval $(stat -s $heartBeatDir/$hbHostName)
st_mtime=`stat -c %Y $heartBeatDir/$hbHostName`
curentTime=`date +%s`
echo $curentTime
echo $st_mtime
echo $(($curentTime - $st_mtime))
echo $(($curentTime - $st_mtime - $faileOverTime))
if [ $(($curentTime - $st_mtime - $faileOverTime)) -lt 0 ]
then
echo "Instance is alive on node $1"
return 0
else
echo "Instance down"
return 1
fi
}
function killInstance
{
echo "Killing server Instance"
#D find all running instanes and stop them nicly
pgrep -f "$pgrepProgramName" | while read pid
do
kill $pid
done
#D wait 30 seconds for processes to die before we kill them hard
sleep 30
#D find all running instanes and kill them
pgrep -f "$pgrepProgramName" | while read pid
do
kill -9 $pid
done
}
function startInstance
{
echo "in Starting server"
echo "Sleep 5 second to give room for avoiding conflicts"
sleep 5
echo "Check if we are still master"
latestHB=`ls -1tr $heartBeatDir|tail -1`
if [ "$latestHB" == "$hostName" ]
then
echo "We are master so start server"
nohup $startCmd > $startCmdLogFile 2>&1 &
fi
}
#Main
echo "starting monitoring"
while [ 1 ]
do
echo "In monitoring loop sleep $loopTime"
sleep $loopTime
if [ -f $maintenanceFile ]
then
echo "We are in maintenance mode, so we do nothing"
continue
fi
latestHB=`ls -1tr $heartBeatDir|tail -1`
#echo "last HB sent By :$latestHB:"
if [ "$latestHB" == "" ]
then
#echo "No master, lets be master"
touch $heartBeatDir/$hostName
startInstance
continue
else
#are we master?
if [ "$latestHB" == "$hostName" ]
then
# We sent last heart beat, check if we are alive
isAlive
if [ $? -eq 0 ]
then
# echo "we sent last HB and is alive"
touch $heartBeatDir/$hostName
continue
fi
# echo "we have died, sent incident notification"
sendIncident
# echo "We will sleep 2 * failoverTime to allow someone else take over before we try to start up"
sleep $faileOverTime
sleep $faileOverTime
else
echo "We are not master, so kill any running instances"
killInstance
fi
fi
# Collect new status of last heartbeat
latestHB=`ls -1tr $heartBeatDir|tail -1`
echo $latestHB
echo "checking last heartbeat if we shll tak eover"
hasHeartBeat $latestHB
hostAlive=$?
if [ $hostAlive -eq 1 ]
then
echo "no master taking over"
echo "trying to be master"
touch $heartBeatDir/$hostName
startInstance
fi
done