High Availability Linux/Unix
Creating a cluster. Pacemaker
csync2.
Csync2 is the configuration manager of the cluster.
The service maintains the same configuration for all the files involved in a cluster deployment.
Software installation.
# Refresh the package index first so every install below resolves against
# current package lists.
apt-get update
# Remote access and a text-mode browser.
apt-get install openssh-server
apt-get install lynx
# Build tools.
apt-get install gcc
apt-get install make
# Cluster components: configuration sync, resource manager, and the
# super-server that exposes the health-check service.
apt-get install csync2
apt-get install pacemaker
apt-get install xinetd
Configuration.
file: /etc/corosync/corosync.conf
# Please read the openais.conf.5 manual page
totem {
    version: 2
    # How long before declaring a token lost (ms)
    token: 3000
    # How many token retransmits before forming a new configuration
    token_retransmits_before_loss_const: 10
    # How long to wait for join messages in the membership protocol (ms)
    join: 60
    # How long to wait for consensus to be achieved before starting a new round of membership configuration (ms)
    consensus: 5000
    # Turn off the virtual synchrony filter
    vsftype: none
    # Number of messages that may be sent by one processor on receipt of the token
    max_messages: 20
    # Limit generated nodeids to 31-bits (positive signed integers)
    clear_node_high_bit: yes
    # Disable encryption
    secauth: off
    # How many threads to use for encryption/decryption
    threads: 0
    # Optionally assign a fixed node id (integer)
    # nodeid: 1234
    # This specifies the mode of redundant ring, which may be none, active, or passive.
    rrp_mode: none
    interface {
        # The following values need to be set based on your environment
        # change these parameters to fit your machine network configuration.
        ringnumber: 0
        bindnetaddr: 192.168.0.101
        mcastaddr: 224.0.0.1
        mcastport: 5605
    }
}
amf {
    mode: disabled
}
service {
    # Load the Pacemaker Cluster Resource Manager
    ver: 0
    name: pacemaker
}
aisexec {
    user: root
    group: root
}
logging {
    fileline: off
    to_stderr: yes
    to_logfile: no
    to_syslog: yes
    syslog_facility: daemon
    debug: on
    timestamp: on
    logger_subsys {
        subsys: AMF
        debug: off
        tags: enter|leave|trace1|trace2|trace3|trace4|trace6
    }
}
/etc/csync2.cfg
group ha
{
    host ubuntu ubuntu2;
    key /etc/csync2.key_mygroup;

    # Configuration files to keep synchronized.
    include /etc/csync2.cfg;

    # Executables (scripts) to keep synchronized.
    # include /usr/sbin/LinkCheck;

    # Directories to keep synchronized.
    # include /shared_stuff/;
    # include %homedir%/weibullguy;

    # Excluded items.
    # exclude *~ .*;
    # exclude /shared_stuff/;

    # What action to take when the csync2.cfg file is synced.
    # Execute the mailer script to send me an e-mail.
    # Log this action in the csync2 log file.
    action
    {
        pattern /etc/csync2.cfg;
        #exec "/etc/csync2/mailer";
        logfile "/var/log/csync2_action.log";
        do-local;
    }

    # Create backups.
    # backup-directory /var/backups/csync2;
    # backup-generations 3;

    auto none;
}

# The homedir variable on frodo and legolas (hosts 2 and 3)
# is different than it is on all other hosts (only aragorn in this case).
#prefix homedir
#{
#    on host[23]: /home/users;
#    on *: /home;
#}
HealthCheck
/etc/services
servermon       9999/tcp                        # fidonet EMSI over TCP

/etc/xinetd.d/servermon
service servermon
{
        socket_type     = stream
        wait            = no
        user            = root
        server          = /usr/sbin/checkstatus
        disable         = no
}

/usr/sbin/checkstatus
#!/bin/bash
# Health-check responder started by xinetd for the "servermon" service.
# Prints three key=value lines on stdout: this node's name (read from
# /etc/hostname), a timestamp, and whether host_conn answers a single ping.

tss=$(date +%Y%m%dT%H%M%S)
host_conn="192.168.0.101"
name=$(cat /etc/hostname)

echo "server_name=${name}"
echo "timestamp=${tss}"

# One echo request with a one-second deadline; only the exit status matters.
if ping -c 1 -w 1 "${host_conn}" >/dev/null; then
  echo "network_stat=up"
else
  echo "network_stat=down"
fi

/etc/ha/nodes.conf
nodes=ubuntu2 ubuntu3

/usr/sbin/nodes_stat
#!/bin/bash
# Connects to port 9999 (servermon) on every host listed in the "nodes"
# entry of /etc/ha/nodes.conf.
# Output: "<host>=up" or "<host>=down", one line per probed host.
# Exit status: 2 at the first host that cannot be reached, 0 when every
# host answers.

#######################################
# Print the value of a "name=value" entry from a configuration file.
# Arguments: $1 - entry name (matched case-insensitively)
#            $2 - configuration file
# Outputs:   everything after the first "=" of each matching line
#######################################
getParam() {
  local param=$1
  local file=$2
  # Anchor on the entry name so that a line which merely contains the name
  # is not picked up; "-f 2-" keeps values that themselves contain "=".
  grep -i -- "^[[:space:]]*${param}[[:space:]]*=" "${file}" | cut -d = -f 2-
}

host_list=$(getParam nodes /etc/ha/nodes.conf)

# host_list is intentionally left unquoted: it is a whitespace-separated
# list and must be split into one word per host.
for host_n in ${host_list}; do
  if nc "${host_n}" 9999 >/dev/null; then
    echo "${host_n}=up"
  else
    echo "${host_n}=down"
    exit 2
  fi
done
exit 0
Monitoring Watchdog
/etc/ha/service.conf
[monitor definition]
wdgd_log_dir=/var/log/
process_monitor=ssh
crash_notification=echo
monitor_services_ssh=/etc/init.d/ssh
monitor_services_option_ssh= restart
monitor_script_ssh=/usr/sbin/port_mon localhost 22
monitor_postaction_ssh=sleep 5

/usr/sbin/show_wdgd
#!/bin/bash
# Live view of a watchdog status file: redraws /var/log/wdgd_<name> once
# per second until interrupted.
# Arguments: $1 - service name as listed in process_monitor

proc=$1
while true; do
  clear
  cat "/var/log/wdgd_${proc}"
  sleep 1
done

/usr/sbin/port_mon
#!/bin/bash
# Checks whether a TCP port accepts connections.
# Arguments: $1 - host, $2 - port
# Exit status: 0 when the connection succeeds, 1 otherwise.

# -z probes the port without sending or reading any data and -w 1 bounds
# the attempt to one second; the options go before the operands so that nc
# does not depend on option permutation.
if nc -z -w 1 "$1" "$2"; then
  exit 0
else
  exit 1
fi
/usr/sbin/wdgd_srv
#!/bin/bash
# Watchdog driver: runs /usr/sbin/watchdog over and over for as long as the
# lock file exists, then exits with status 1 once the lock file is gone
# (the init script removes it on "stop").

lock_file="/tmp/lock_wdgd.lock"

while [ -f "${lock_file}" ]; do
  # Absolute path so the monitoring script is found regardless of PATH.
  /usr/sbin/watchdog
done
exit 1

/etc/init.d/wdgd
#!/bin/bash
# Init script for the watchdog driver.
# Usage: wdgd {start|debug|stop|status}
#   start  - run wdgd_srv in the background, output discarded
#   debug  - run wdgd_srv in the background, output written to a
#            timestamped file under wdgd_log_dir
#   stop   - remove the lock file so that wdgd_srv leaves its loop
#   status - print "running" or "stopped" depending on the lock file
# Exit status: 0 for the actions above, 2 for an unknown action.

#######################################
# Print the value of a "name=value" entry from a configuration file.
# Arguments: $1 - entry name (matched case-insensitively)
#            $2 - configuration file
# Outputs:   everything after the first "=" of each matching line
#######################################
getParam() {
  local param=$1
  local file=$2
  grep -i -- "^[[:space:]]*${param}[[:space:]]*=" "${file}" | cut -d = -f 2-
}

logDir=$(getParam wdgd_log_dir /etc/ha/service.conf)
tdate=$(date +%Y%m%dT%H%M%S)
wdg_home=/usr/sbin/
lock_file=/tmp/lock_wdgd.lock

case "$1" in
  start)
    logFile=/dev/null
    touch "${lock_file}"
    # Open the log once and point stderr at the same descriptor.
    "${wdg_home}wdgd_srv" >>"${logFile}" 2>&1 &
    exit 0
    ;;
  debug)
    logFile="${logDir}/wdgd_${tdate}.log"
    touch "${lock_file}"
    # Same driver as "start"; only the log destination differs.
    "${wdg_home}wdgd_srv" >>"${logFile}" 2>&1 &
    exit 0
    ;;
  stop)
    # -f: stopping an already stopped watchdog is not an error.
    rm -f "${lock_file}"
    exit 0
    ;;
  status)
    if [ -f "${lock_file}" ]; then
      echo "running"
    else
      echo "stopped"
    fi
    exit 0
    ;;
  *)
    echo "please type one of the following options: start, debug, stop, status "
    exit 2
    ;;
esac
/usr/sbin/watchdog
#!/bin/bash
# One monitoring pass over every service named in the process_monitor entry
# of /etc/ha/service.conf. For each service it runs the configured monitor
# command; when that fails it runs the configured restart command, the
# post-action, and the monitor again, and sends a crash notification if the
# service is still down. The current state of each service is written to
# <wdgd_log_dir>/wdgd_<service>.
# Exit status: 5 when a service has no executable monitor script.

#######################################
# Print the value of a "name=value" entry from a configuration file.
# Arguments: $1 - entry name (matched case-insensitively)
#            $2 - configuration file
# Outputs:   everything after the first "=" of each matching line
#######################################
getParam() {
  local param=$1
  local file=$2
  # Anchor on the entry name so that a line which merely contains the name
  # is not picked up; "-f 2-" keeps values that themselves contain "=".
  grep -i -- "^[[:space:]]*${param}[[:space:]]*=" "${file}" | cut -d = -f 2-
}

###
#######################################
# Look up the monitor command of a service and check it can be executed.
# Arguments: $1 - service name
# Outputs:   the full monitor command line, or NOT_FOUND when its first
#            word is not an executable file
#######################################
testMonScript() {
  local nombre=$1
  local script_mon script_cut
  script_mon=$(getParam "monitor_script_${nombre}" /etc/ha/service.conf)
  # Only the first word is the executable; the rest are its arguments.
  read -r script_cut _ <<<"${script_mon}"
  if [ -n "${script_cut}" ] && [ -x "${script_cut}" ]; then
    echo "$script_mon"
  else
    echo "NOT_FOUND"
  fi
}

###
#######################################
# Record the current state of a service in its status file. The file is
# overwritten on every call, so it always holds the latest state only.
# Arguments: $1 - service name
#            $2 - message to record
#######################################
logService() {
  # Locals keep this function from overwriting the caller's "message".
  local service_name=$1
  local message=$2
  local base_dir_log file_output tdate
  base_dir_log=$(getParam wdgd_log_dir /etc/ha/service.conf)
  file_output="${base_dir_log}/wdgd_${service_name}"
  tdate=$(date)
  echo "${tdate}:> ${message}" >"${file_output}"
}

export srvlist=$(getParam process_monitor /etc/ha/service.conf)
echo "================================================================"
## for every service in process_monitor we are going to launch the monitor script
## (srvlist is split on whitespace on purpose: one word per service)
for service in $srvlist; do
  ## first we test if the service to monitor exists, else there is nothing to monitor
  export process=$(getParam "monitor_services_${service}" /etc/ha/service.conf)
  export pars_p=$(getParam "monitor_services_option_${service}" /etc/ha/service.conf)
  echo "(*) launching monitor for: ${service}"
  if [ -x "$process" ]; then
    echo "file exist $process"
    export test_monitor_scr=$(testMonScript "$service")
    echo "monitor script $test_monitor_scr"
    ## if the monitor is not found exit the process and show the error
    if [ "$test_monitor_scr" == "NOT_FOUND" ]; then
      echo "monitor_script_$service value in service.conf is not executable or does not exist"
      exit 5
    else
      ## the monitor exists and is an executable file
      echo "ready to lauch monitor: $test_monitor_scr"
      ## configured command lines are expanded unquoted on purpose so that
      ## they split into a command and its arguments
      $test_monitor_scr
      export status=$?
      if [ "$status" == "0" ]; then
        ## the monitor returned 0 so the monitored process seems to be ok
        echo "monitor for $service ok"
        echo "----------------------->"
        logService "$service" "${service}=ok"
      else
        ## the monitor returned something other than 0 so the monitored process seems to be down
        logService "$service" "${service}=down"
        echo "*******************************************************************"
        echo "WARNING !!!! service $service is dead "
        echo "launching $process with the following params: $pars_p"
        export rescue_cmd="$process $pars_p"
        $rescue_cmd
        ## capture the status of the rescue command before anything else runs
        export stat_cm=$?
        logService "$service" "${service}=starting"
        echo "Command status: $stat_cm"
        ## we must check the exit status for the rescue command
        export command_post_start=$(getParam "monitor_postaction_${service}" /etc/ha/service.conf)
        echo "launching post-startup command $command_post_start"
        $command_post_start
        if [ "$stat_cm" == "0" ]; then
          ## we check that the service startup really was successful
          $test_monitor_scr
          export status_r=$?
          if [ "$status_r" == "0" ]; then
            ## it was !!! :)
            logService "$service" "${service}=started"
            echo "success"
          else
            ## it was not !!! :(
            export ldate=$(date)
            export message="CRITICAL: service ${service} was not successfully started after automatic restart. time ( $ldate )"
            echo "$message"
            logService "$service" "${service}=ko"
            export crash_cmd=$(getParam crash_notification /etc/ha/service.conf)
            $crash_cmd "$message"
          fi
        else
          ## oops!!! the rescue command itself failed
          export ldate=$(date)
          export message="CRITICAL: service ${service} was not successfully started after crash. time ( $ldate )"
          echo "$message"
          export crash_cmd=$(getParam crash_notification /etc/ha/service.conf)
          $crash_cmd "$message"
          export command_post_start=$(getParam "monitor_postaction_${service}" /etc/ha/service.conf)
          echo "launching post-startup command on fault $command_post_start"
          $command_post_start
        fi
        echo "*******************************************************************"
      fi
    fi
  else
    echo "file does not exist"
  fi
done
eDirectory Command Wrapper
#!/bin/bash
# Command wrapper around eDirectory's ndsmanage / ndsrepair.
# Usage: <script> {start|stop|rep_t|rep_s|status}
#   start  - start the instance, wait 5 s and verify an ndsd process exists
#   stop   - stop the instance, wait 10 s, then poll up to 5 more times
#            (5 s apart) until no ndsd process is left
#   rep_t  - run "ndsrepair -T"
#   rep_s  - run "ndsrepair -E"
#   status - exit 0 when an ndsd process exists, 3 when none does
# Failures of start/stop are mailed to root and reported with exit status 1;
# an unknown action prints the usage text and exits with status 2.

############# Variables
export DEBUG="debug"
export INFO="info"
export DEFAULT_LOG_LEVEL="debug"
export DIR_NDS=$(cat /etc/opt/novell/eDirectory/conf/.edir/instances.0)
export cmd_stop="/opt/novell/eDirectory/bin/ndsmanage stop --config-file ${DIR_NDS}"
export cmd_start="/opt/novell/eDirectory/bin/ndsmanage start --config-file ${DIR_NDS}"
#export cmd_stop="/root/test.sh stop ${DIR_NDS}"
#export cmd_start="/root/test.sh start ${DIR_NDS}"
#### exit 0

############ Functions

#######################################
# Count the processes whose "ps -ef" line mentions ndsd.
# Outputs: the number of matching lines (grep's own line excluded)
#######################################
countProcesses() {
  ps -ef | grep -i ndsd | grep -vc "grep"
}

#######################################
# Print a timestamped log line on stdout.
# Messages are printed when their level equals DEFAULT_LOG_LEVEL or is
# "info"; anything else is dropped.
# Arguments: $1 - level, $2 - message
#######################################
log() {
  local tlevel=$1
  local mesg=$2
  local level=$DEFAULT_LOG_LEVEL
  local fdat
  fdat=$(date +%s)
  if [ "$tlevel" == "$level" ]; then
    echo "${fdat} [${tlevel}]: ${mesg} "
  elif [ "$tlevel" == "info" ]; then
    echo "${fdat} [info]: ${mesg} "
  fi
}

#######################################
# Mail a one-off message to root.
# Arguments: $1 - message body
#######################################
mailroot() {
  echo "$1" | mail root
}

##################### Main logic
case "$1" in
  start)
    # cmd_start is expanded unquoted on purpose: it holds a command line.
    $cmd_start
    export stat_code=$?
    if [ "$stat_code" -eq 0 ]; then
      log "$INFO" "ejecucion script arranque correcta"
    else
      log "$INFO" "ejecucion script arranque incorrecta"
      mailroot "El script ndsmanage ha mostrado un error."
      exit 1
    fi
    log "$INFO" "Esperando 5 segundos para comprobar los procesos de eDirectory"
    sleep 5
    export num=$(countProcesses)
    log "$DEBUG" "contando procesos: $num"
    # Any positive count means the daemon is up, matching the "status" check.
    if [ "$num" -ge 1 ]; then
      log "$INFO" "el servicio ha arrancado correctamente"
      exit 0
    else
      mailroot "El servicio no se ha arrancado correctamente."
      exit 1
    fi
    ## end of "start"
    ;;
  stop)
    trap 'echo "salida anormal";exit 126' INT
    log "$INFO" "se va a parar el servicio del directorio"
    $cmd_stop
    export stat_code=$?
    if [ "$stat_code" -eq 0 ]; then
      log "$INFO" "ejecucion script parada correcta"
    else
      log "$INFO" "ejecucion script parada incorrecta"
      mailroot "El script ndsmanage de eDirectory ha mostrado un error mientras se lanzaba la parada del directorio. 
Se va a abortar la copia"
      exit 1
    fi
    # The message states the wait that is actually performed below.
    log "$INFO" "Esperando 10 segundos para dar tiempo a la parada ordenada de eDirectory"
    sleep 10
    log "$INFO" "Lanzando comprobacion"
    export num=$(countProcesses)
    log "$DEBUG" "contando procesos: $num"
    # Any remaining ndsd process means the directory has not stopped yet.
    if [ "$num" -gt 0 ]; then
      log "$INFO" "el servicio ndsd no se ha parado todavia"
      for try in 1 2 3 4 5; do
        sleep 5
        log "$INFO" "comprobacion numero: $try "
        export inum=$(countProcesses)
        if [ "$inum" -eq 0 ]; then
          echo "el directorio ha parado correctamente"
          exit 0
        else
          echo "el directorio todavia no ha parado"
        fi
      done
      mailroot "El directorio no se ha parado. Abortamos copia de seguridad"
      exit 1
    else
      ## clean exit
      log "$INFO" "el servicio de directorio se ha parado. Devolvemos codigo de status correcto"
      exit 0
    fi
    ### end of "stop"
    ;;
  rep_t)
    ndsrepair -T
    ;;
  rep_s)
    ndsrepair -E
    ;;
  status)
    export inum=$(countProcesses)
    if [ "$inum" -eq 0 ]; then
      echo "el directorio esta parado"
      exit 3
    else
      echo "el servicio de directorio esta arrancado"
      exit 0
    fi
    ### end of "status"
    ;;
  *)
    echo "se debe pasar un argumento de tipo start/stop/rep_t/rep_s/status"
    # An unknown or missing action is reported as a usage error.
    exit 2
    ;;
esac
No hay comentarios:
Publicar un comentario