High Availability Linux/Unix

Creating a cluster. Pacemaker

csync2.

Csync2 is the configuration manager of the cluster.
The service maintains the same configuration for all the files involved in a cluster deployment.

Software installation.

# Refresh the package index first so every install below resolves against
# current package lists (each package is installed once).
apt-get update
apt-get install openssh-server
apt-get install lynx
apt-get install gcc
apt-get install make
apt-get install csync2
apt-get install pacemaker
apt-get install xinetd

Configuration.

file: /etc/corosync/corosync.conf

# Please read the corosync.conf.5 manual page

totem {
version: 2

# How long before declaring a token lost (ms)
token: 3000

# How many token retransmits before forming a new configuration
token_retransmits_before_loss_const: 10

# How long to wait for join messages in the membership protocol (ms)
join: 60

# How long to wait for consensus to be achieved before starting a new round of membership configuration (ms)
consensus: 5000

# Turn off the virtual synchrony filter
vsftype: none

# Number of messages that may be sent by one processor on receipt of the token
max_messages: 20

# Limit generated nodeids to 31-bits (positive signed integers)
clear_node_high_bit: yes

# Disable encryption
secauth: off

# How many threads to use for encryption/decryption
threads: 0

# Optionally assign a fixed node id (integer)
# nodeid: 1234

# This specifies the mode of redundant ring, which may be none, active, or passive.
rrp_mode: none

interface {
# The following values need to be set based on your environment
# change these parameters to fit your machine network configuration.
ringnumber: 0
bindnetaddr: 192.168.0.101
mcastaddr: 224.0.0.1
mcastport: 5605
}
}

amf {
mode: disabled
}

service {
# Load the Pacemaker Cluster Resource Manager
ver:       0
name:      pacemaker
}

aisexec {
user:   root
group:  root
}

logging {
fileline: off
to_stderr: yes
to_logfile: no
to_syslog: yes
syslog_facility: daemon
debug: on
timestamp: on
logger_subsys {
subsys: AMF
debug: off
tags: enter|leave|trace1|trace2|trace3|trace4|trace6
}
}

/etc/csync2.cfg

group ha
{
       host ubuntu ubuntu2;
       key /etc/csync2.key_mygroup;

   # Configuration files to keep synchronized.
       include /etc/csync2.cfg;

   # Executables (scripts) to keep synchronized.
   # include /usr/sbin/LinkCheck;

   # Directories to keep synchronized.
   #    include /shared_stuff/;
   #    include %homedir%/weibullguy;

   # Excluded items.
   #    exclude *~ .*;
   #    exclude /shared_stuff/;

   # What action to take when the csync2.cfg file is synced.
   # Execute the mailer script to send me an e-mail.
   # Log this action in the csync2 log file.
       action
       {
               pattern /etc/csync2.cfg;
               #exec "/etc/csync2/mailer";
               logfile "/var/log/csync2_action.log";
               do-local;
       }

   # Create backups.
#       backup-directory /var/backups/csync2;
#       backup-generations 3;

       auto none;
}

# The homedir variable on frodo and legolas (hosts 2 and 3)
# is different than it is on all other hosts (only aragorn in this case).
#prefix homedir
#{
#       on host[23]: /home/users;
#       on *:        /home;
#}

HealthCheck

/etc/services
servermon       9999/tcp                        # cluster health check (checkstatus via xinetd)



/etc/xinetd.d/servermon
service servermon
{
       socket_type    = stream
       wait           = no
       user           = root
       server         = /usr/sbin/checkstatus
       disable        = no
}

/usr/sbin/checkstatus
#!/bin/bash
# Health-check responder: prints this host's name, a timestamp and whether
# the reference address answers a single ping, as key=value lines.
tss=$(date +%Y%m%dT%H%M%S)
host_conn="192.168.0.101"
name=$(cat /etc/hostname)
export tss host_conn name

echo "server_name=$name"
echo "timestamp=$tss"

# One echo request with a one-second deadline; only the exit status matters.
ping -c 1 -w 1 $host_conn >/dev/null
network_stat=$?
export network_stat

case $network_stat in
  0) echo "network_stat=up" ;;
  *) echo "network_stat=down" ;;
esac



/etc/ha/nodes.conf
nodes=ubuntu2 ubuntu3



/usr/sbin/nodes_stat
#!/bin/bash
#######################################
# Print the value stored under a key in a key=value configuration file.
# The key is compared case-insensitively against the whole text before the
# first "=" (surrounding blanks ignored), so a key never matches another key
# that merely contains it, nor text that appears inside a value.
# Everything after the first "=" is printed unchanged, so values may
# themselves contain "=".
# Arguments: $1 - key name
#            $2 - path of the configuration file
# Outputs:   the value of each matching line on stdout (nothing if absent)
#######################################
function getParam(){
  local param=$1
  local file=$2
  awk -v key="$param" '
    BEGIN { key = tolower(key) }
    {
      eq = index($0, "=")
      if (eq == 0) next
      name = tolower(substr($0, 1, eq - 1))
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      if (name == key) print substr($0, eq + 1)
    }
  ' "$file"
}
# Probe every node listed under "nodes" in /etc/ha/nodes.conf on the
# health-check port 9999 and print "<node>=up" or "<node>=down".
# Exit status: 0 when every node answered, 2 at the first node that did not
# (remaining nodes are then not probed).
host_list=$(getParam nodes /etc/ha/nodes.conf)
for host_n in $host_list; do
  # -w bounds the probe so an unreachable or silent node cannot block the
  # check indefinitely; 5 seconds leaves room for the remote side's own
  # one-second ping before it finishes its answer.
  nc -w 5 "${host_n}" 9999 > /dev/null
  exit_stat=$?
  if [ "$exit_stat" -eq 0 ]; then
    echo "${host_n}=up"
  else
    echo "${host_n}=down"
    exit 2
  fi
done
exit 0

Monitoring Watchdog

/etc/ha/service.conf
[monitor definition]
wdgd_log_dir=/var/log/
process_monitor=ssh
crash_notification=echo
monitor_services_ssh=/etc/init.d/ssh
monitor_services_option_ssh= restart
monitor_script_ssh=/usr/sbin/port_mon localhost 22
monitor_postaction_ssh=sleep 5

/usr/sbin/show_wdgd
#!/bin/bash
# Live view of one watchdog status file: redraws /var/log/wdgd_<service>
# once per second until interrupted.
# Usage: show_wdgd <service>
proc=$1
if [ -z "$proc" ]; then
  # Without a service name the loop would only print cat errors forever.
  echo "usage: ${0##*/} <service>" >&2
  exit 2
fi

while true; do
  clear
  cat "/var/log/wdgd_${proc}"
  sleep 1
done


/usr/sbin/port_mon
#!/bin/bash
# TCP port probe, usable as a watchdog monitor script.
# Usage: port_mon <host> <port>
# Exit status: 0 when nc reports success, 1 otherwise (including missing
# arguments), so callers only ever see 0 or 1.
host=$1
port=$2
if [ -z "$host" ] || [ -z "$port" ]; then
  echo "usage: ${0##*/} <host> <port>" >&2
  exit 1
fi

# Options are placed before the operands and the operands are quoted;
# -w 1 keeps the one-second timeout.
if nc -w 1 "$host" "$port"; then
  exit 0
fi
exit 1

/usr/sbin/wdgd_srv
#!/bin/bash
# Watchdog supervisor loop: keeps running `watchdog` for as long as the lock
# file exists and exits with status 1 once the lock file is gone.
export lock_file="/tmp/lock_wdgd.lock"
while [ -f "$lock_file" ]; do
  watchdog
  # Pause between passes so a watchdog run that returns immediately does
  # not turn this loop into a CPU-bound spin.
  sleep 1
done
exit 1

/etc/init.d/wdgd

#!/bin/bash

#######################################
# Print the value stored under a key in a key=value configuration file.
# The key is compared case-insensitively against the whole text before the
# first "=" (surrounding blanks ignored), so a key never matches another key
# that merely contains it, nor text that appears inside a value.
# Everything after the first "=" is printed unchanged, so values may
# themselves contain "=".
# Arguments: $1 - key name
#            $2 - path of the configuration file
# Outputs:   the value of each matching line on stdout (nothing if absent)
#######################################
function getParam(){
  local param=$1
  local file=$2
  awk -v key="$param" '
    BEGIN { key = tolower(key) }
    {
      eq = index($0, "=")
      if (eq == 0) next
      name = tolower(substr($0, 1, eq - 1))
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      if (name == key) print substr($0, eq + 1)
    }
  ' "$file"
}

# Init script for the watchdog supervisor.
# Actions: start | debug | stop | status. The lock file
# /tmp/lock_wdgd.lock is the run flag: start/debug create it, stop removes
# it, status reports on its presence.
# Exit status: 0 for a known action, 2 for anything else.
export logDir=$(getParam wdgd_log_dir /etc/ha/service.conf)
export tdate=$(date +%Y%m%dT%H%M%S)

export wdg_home=/usr/sbin/
case "$1" in
start)
  export logFile=/dev/null
  touch /tmp/lock_wdgd.lock
  # One descriptor for both streams so stdout and stderr cannot overwrite
  # each other in the target file.
  "${wdg_home}wdgd_srv" >>"${logFile}" 2>&1 &
  exit 0;;

debug)
  # Same as start, but output goes to a timestamped log file in logDir.
  export logFile="${logDir}/wdgd_${tdate}.log"
  touch /tmp/lock_wdgd.lock
  # NOTE(review): this used to launch ${wdg_home}wdgd, which nothing in this
  # write-up installs; the supervisor loop is wdgd_srv - confirm.
  "${wdg_home}wdgd_srv" >>"${logFile}" 2>&1 &
  exit 0;;

stop)
  # -f: stopping an already stopped watchdog is not an error.
  rm -f /tmp/lock_wdgd.lock
  exit 0;;

status)
  if [ -f "/tmp/lock_wdgd.lock" ]
  then
    echo "running"
  else
    echo "stopped"
  fi
  exit 0;;

*)
  echo "please type one of the following options: start, debug, stop, status "
  exit 2;;

esac

/usr/sbin/watchdog
#!/bin/bash

#######################################
# Print the value stored under a key in a key=value configuration file.
# The key is compared case-insensitively against the whole text before the
# first "=" (surrounding blanks ignored), so a key never matches another key
# that merely contains it, nor text that appears inside a value.
# Everything after the first "=" is printed unchanged, so values may
# themselves contain "=".
# Arguments: $1 - key name
#            $2 - path of the configuration file
# Outputs:   the value of each matching line on stdout (nothing if absent)
#######################################
function getParam(){
  local param=$1
  local file=$2
  awk -v key="$param" '
    BEGIN { key = tolower(key) }
    {
      eq = index($0, "=")
      if (eq == 0) next
      name = tolower(substr($0, 1, eq - 1))
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      if (name == key) print substr($0, eq + 1)
    }
  ' "$file"
}
###
#######################################
# Look up the monitor command configured for a service and check that its
# program (the first word of the command line) is an executable file.
# Arguments: $1 - service name; the key read from /etc/ha/service.conf is
#                 monitor_script_<service>
# Outputs:   the full monitor command line on stdout, or the literal
#            NOT_FOUND when the entry is empty or not executable
#######################################
function testMonScript()
{
  local nombre=$1
  local script_mon script_cut
  script_mon=$(getParam "monitor_script_${nombre}" /etc/ha/service.conf)
  # Take the first word in the current shell; assigning through a pipeline
  # would leave the variable unset here.
  read -r script_cut _ <<<"$script_mon"
  if [ -n "$script_cut" ] && [ -x "$script_cut" ]
  then
    echo "$script_mon"
  else
    echo "NOT_FOUND"
  fi
}
###
#######################################
# Record the latest status of a service in its watchdog status file,
# <wdgd_log_dir>/wdgd_<service>, where wdgd_log_dir is read from
# /etc/ha/service.conf. The file is overwritten on every call, so it only
# ever holds the most recent status line.
# All working variables are local, so a caller's own variables (for
# example one named "message") are left untouched.
# Arguments: $1 - service name
#            $2 - status text to record
#######################################
function logService()
{
  local service_name=$1
  local message=$2
  local base_dir_log file_output tdate
  base_dir_log=$(getParam wdgd_log_dir /etc/ha/service.conf)
  file_output="${base_dir_log}/wdgd_${service_name}"
  tdate=$(date)
  echo "${tdate}:> ${message}" > "$file_output"
}
## One watchdog pass. For every service listed in process_monitor
## (/etc/ha/service.conf): run its monitor command; when the monitor fails,
## run the service's control script with its configured options, run the
## post-action, re-run the monitor and report the outcome.
## Exits with status 5 when a service has no usable monitor command.
export srvlist=$(getParam process_monitor /etc/ha/service.conf)
echo "================================================================"

for service in $srvlist
do

  ## Control script used to (re)start the service and the options passed to it.
  export process=$(getParam "monitor_services_${service}" /etc/ha/service.conf)
  export pars_p=$(getParam "monitor_services_option_${service}" /etc/ha/service.conf)

  echo "(*) launching monitor for:  ${service}"
  ## Quoted so that a missing entry (empty value) fails the test instead of
  ## passing it.
  if [ -x "$process" ]
  then
    echo "file exist $process"

    export test_monitor_scr=$(testMonScript "$service")
    echo "monitor script  $test_monitor_scr"

    ## Without a usable monitor command nothing can be checked: abort.
    if [ "$test_monitor_scr" == "NOT_FOUND" ]
    then
      echo "monitor_script_$service value in service.conf is not executable or does not exist"
      exit 5
    else

      echo "ready to lauch monitor: $test_monitor_scr"

      ## Unquoted on purpose: the value is a command line (program + arguments).
      $test_monitor_scr
      export status=$?
      if [ "$status" == "0" ]
      then

        ## Monitor returned 0: the service is considered healthy.
        echo "monitor for $service ok"
        echo "----------------------->"
        logService "$service" "${service}=ok"
      else

        ## Monitor returned non-zero: the service is considered down.
        logService "$service" "${service}=down"
        echo "*******************************************************************"

        echo "WARNING !!!! service $service is dead "
        echo "launching $process with the following params: $pars_p"
        export rescue_cmd="$process $pars_p"
        $rescue_cmd
        ## Capture the restart status right away, before any other command
        ## replaces $?.
        export stat_cm=$?
        logService "$service" "${service}=starting"
        echo "Command status: $stat_cm"
        export command_post_start=$(getParam "monitor_postaction_${service}" /etc/ha/service.conf)
        echo "launching post-startup command $command_post_start"

        $command_post_start
        if [ "$stat_cm" == "0" ]
        then
          ## The restart command succeeded; confirm with the monitor.
          $test_monitor_scr
          export status_r=$?
          if [ "$status_r" == "0" ]
          then
            logService "$service" "${service}=started"
            echo "success"
          else
            export ldate=$(date)
            ## Kept in its own variable so that helper functions cannot
            ## overwrite the text before it is sent.
            export crash_message="CRITICAL: service ${service} was not successfully started after automatic restart. time ( $ldate )"
            echo "$crash_message"
            logService "$service" "${service}=ko"

            export crash_cmd=$(getParam crash_notification /etc/ha/service.conf)
            ## Message left unquoted so the notifier receives it as separate words.
            $crash_cmd $crash_message
          fi
        else
          ## The restart command itself failed.
          export ldate=$(date)
          export crash_message="CRITICAL: service ${service} was not successfully started after crash. time ( $ldate )"
          echo "$crash_message"
          export crash_cmd=$(getParam crash_notification /etc/ha/service.conf)
          $crash_cmd $crash_message
          export command_post_start=$(getParam "monitor_postaction_${service}" /etc/ha/service.conf)
          echo "launching post-startup command on fault $command_post_start"
          $command_post_start
        fi
        echo "*******************************************************************"
      fi
    fi
  else
    echo "file does not exist"
  fi
done

eDirectory Command Wrapper

 #!/bin/bash


############# Vars
# Log level names and the level that is printed by default.
DEBUG="debug"
INFO="info"
DEFAULT_LOG_LEVEL="debug"

# Content of the eDirectory instances file, passed to ndsmanage through
# --config-file in the stop/start command lines below.
DIR_NDS=$(cat /etc/opt/novell/eDirectory/conf/.edir/instances.0)
cmd_stop="/opt/novell/eDirectory/bin/ndsmanage stop --config-file ${DIR_NDS}"
cmd_start="/opt/novell/eDirectory/bin/ndsmanage start --config-file ${DIR_NDS}"

export DEBUG INFO DEFAULT_LOG_LEVEL DIR_NDS cmd_stop cmd_start

#export cmd_stop="/root/test.sh stop ${DIR_NDS}"
#export cmd_start="/root/test.sh start ${DIR_NDS}"

#### exit 0

############Funciones

# Print how many lines of `ps -ef` mention "ndsd" (any letter case),
# leaving out lines that contain "grep".
countProcesses() {
  proc=$(ps -ef | grep -i ndsd | grep -v -c "grep")
  export proc
  echo $proc
}


#######################################
# Print a log line "<epoch seconds> [<level>]: <message> " on stdout.
# A line is printed when the level equals DEFAULT_LOG_LEVEL or is "info";
# any other level, or an empty level, prints nothing.
# Globals:   DEFAULT_LOG_LEVEL (read)
# Arguments: $1 - level of this message
#            $2 - message text
#######################################
function log {
  local tlevel=$1
  local mesg=$2
  local fdat
  fdat=$(date +%s)
  # Quoted comparisons: an empty level is simply "no match" rather than a
  # test syntax error.
  if [ -n "$tlevel" ] && { [ "$tlevel" == "$DEFAULT_LOG_LEVEL" ] || [ "$tlevel" == "info" ]; }
  then
    echo "${fdat} [${tlevel}]: ${mesg} "
  fi
}

# Mail a one-line message to root.
# Arguments: $1 - message text; it is quoted so it reaches mail verbatim
#                 (no word splitting, no filename expansion).
function mailroot {
  printf '%s\n' "$1" | mail root
}
#####################Logica general

# Command dispatcher: start | stop | rep_t | rep_s | status.
# "Running" means countProcesses reports at least one ndsd process, the
# same rule in start, stop and status.
# Exit status: start/stop 0 on success, 1 on failure; status 0 when
# running, 3 when stopped; 2 for an unknown action.
case "$1" in
start)
  $cmd_start
  stat_code=$?
  if [ "$stat_code" -eq 0 ]; then
    log "$INFO" "ejecucion script arranque correcta"
  else
    log "$INFO" "ejecucion script arranque incorrecta"
    mailroot "El script ndsmanage ha mostrado un error."
    exit 1
  fi
  log "$INFO" "Esperando 5 segundos para comprobar los procesos de eDirectory"
  sleep 5
  num=$(countProcesses)
  log "$DEBUG" "contando procesos: $num"
  # Any positive count means the service is up (more than one ndsd process
  # is still a successful start).
  if [ "$num" -ge 1 ]; then
    log "$INFO" "el servicio ha arrancado correctamente"
    exit 0
  else
    mailroot "El servicio no se ha arrancado correctamente."
    exit 1
  fi
  ## END OF CASE START
  ;;
stop)
  # Signal 2 (interrupt) during the stop sequence: report and exit 126.
  trap 'echo "salida anormal";exit 126' 2
  log "$INFO" "se va a parar el servicio del directorio"
  $cmd_stop
  stat_code=$?
  if [ "$stat_code" -eq 0 ]; then
    log "$INFO" "ejecucion script parada correcta"
  else
    log "$INFO" "ejecucion script parada incorrecta"

    mailroot "El script ndsmanage de eDirectory ha mostrado un error mientras se lanzaba la parada del directorio. Se va a abortar la copia"
    exit 1
  fi
  # The message states the wait that is actually performed.
  log "$INFO" "Esperando 10 segundos para dar tiempo a la parada ordenada de eDirectory"
  sleep 10
  log "$INFO" "Lanzando comprobacion"
  num=$(countProcesses)
  log "$DEBUG" "contando procesos: $num"
  # Still running while any ndsd process is left, not only exactly one.
  if [ "$num" -gt 0 ]; then
    log "$INFO" "el servicio ndsd no se ha parado todavia"

    # Up to five more checks, five seconds apart.
    for try in $(seq 5); do
      sleep 5
      log "$INFO" "comprobacion numero: $try "

      inum=$(countProcesses)
      if [ "$inum" -eq 0 ]; then
        echo "el directorio ha parado correctamente"
        exit 0
      else
        echo "el directorio todavia no ha parado"
      fi
    done
    mailroot "El directorio no se ha parado. Abortamos copia de seguridad"
    exit 1
  else
    ## Clean exit: nothing left running.
    log "$INFO" "el servicio de directorio se ha parado. Devolvemos codigo de status correcto"
    exit 0
  fi
  ### END OF CASE STOP
  ;;
rep_t)
  ndsrepair -T
  ;;
rep_s)
  ndsrepair -E
  ;;
status)

  inum=$(countProcesses)
  if [ "$inum" -eq 0 ]; then
    echo "el directorio esta parado"
    exit 3
  else
    echo "el servicio de directorio esta arrancado"
    exit 0
  fi
  ### END OF STATUS
  ;;
*)
  # Unknown or missing action: list every supported action and fail.
  echo "se debe pasar un argumento de tipo start/stop/rep_t/rep_s/status"
  exit 2
  ;;
esac

Unix Recipes

domingo, 19 de febrero de 2012

HA linux