libdb/test/tcl/repmgr026.tcl

244 lines
7.3 KiB
Tcl
Raw Normal View History

2011-09-13 17:44:24 +00:00
# See the file LICENSE for redistribution information.
#
# Copyright (c) 2009, 2011 Oracle and/or its affiliates. All rights reserved.
#
# TEST repmgr026
# TEST Test of "full election" timeouts.
# TEST 1. Cold boot with all sites present.
# TEST 2. Cold boot with some sites missing.
# TEST 3. Partial-participation election with one client having seen a master,
# TEST but another just starting up fresh.
# TEST 4. Partial participation, with all participants already having seen a
# TEST master.
# TEST
proc repmgr026 { { tnum 026 } } {
	# Entry point: runs repmgr026_sub once for every combination of the
	# use_leases and client_down settings.  tnum is used only to label
	# the progress output.
	source ./include.tcl

	if { $is_freebsd_test == 1 } {
		puts "Skipping replication manager test on FreeBSD platform."
		return
	}

	# Each pair is {use_leases client_down}, listed in the order a nested
	# loop (leases outermost) would visit them.
	foreach { use_leases client_down } {
		no no
		no yes
		yes no
		yes yes
	} {
		puts "Repmgr$tnum: Full election test, \
		    client_down: $client_down; leases: $use_leases"
		repmgr026_sub $tnum $client_down $use_leases
	}
}
# Run one variant of the "full election" timeout test on a five-site group
# (sites A through E).
#
# tnum		Test number; used only to label progress messages.
# client_down	Boolean.  When true, site E is not restarted for the cold
#		boot, so the first election runs without the whole group.
# use_leases	Boolean.  When true, every environment is opened with
#		-rep_lease.
#
# The test has three phases:
#   1. Create the group (A is the creator; B-E join through A), then close
#      every site.
#   2. Cold boot A-D (and E unless client_down) and time the first election.
#      With a site missing it must take longer than full_secs_leeway
#      seconds; with all sites present it must take less.
#   3. Shut down the master (and E, if it is running) and time the second
#      election among B, C and D.
#
# Failures are reported through error_check_good, or by the error raised in
# repmgr026_await_election_result when a wait limit is exceeded.
#
proc repmgr026_sub { tnum client_down use_leases } {
	global testdir
	global repfiles_in_memory
	global rep_verbose
	global verbose_type

	# Optional env-open arguments controlled by the global test settings.
	set verbargs ""
	if { $rep_verbose == 1 } {
		set verbargs " -verbose {$verbose_type on} "
	}

	set repmemargs ""
	if { $repfiles_in_memory } {
		set repmemargs "-rep_inmem_files "
	}

	# Run env_cleanup on the test directory, then create one home
	# directory per site.
	env_cleanup $testdir
	file mkdir [set dira $testdir/SITE_A]
	file mkdir [set dirb $testdir/SITE_B]
	file mkdir [set dirc $testdir/SITE_C]
	file mkdir [set dird $testdir/SITE_D]
	file mkdir [set dire $testdir/SITE_E]

	# Spread the five values returned by available_ports over one port
	# variable per site; the empty loop body is intentional.
	foreach { porta portb portc portd porte } [available_ports 5] {}

	# First, just create/establish the group.
	puts -nonewline "Repmgr$tnum: Create a group of 5 sites: "
	set common "-create -txn $verbargs $repmemargs \
	    -rep -thread -event"
	if { $use_leases } {
		append common " -rep_lease {[list 3000000]} "
	}

	# The env-open commands are kept as strings because each site is
	# opened twice: once to form the group and again for the cold boot.
	set cmda "berkdb_env_noerr $common -errpfx SITE_A -home $dira"
	set cmdb "berkdb_env_noerr $common -errpfx SITE_B -home $dirb"
	set cmdc "berkdb_env_noerr $common -errpfx SITE_C -home $dirc"
	set cmdd "berkdb_env_noerr $common -errpfx SITE_D -home $dird"
	set cmde "berkdb_env_noerr $common -errpfx SITE_E -home $dire"

	# repmgr arguments shared by every site.  The full_election timeout
	# (60000000) is the value under test; the 59-second leeway used below
	# implies these values are in microseconds.
	set common_mgr " -start elect \
	    -timeout {connection_retry 5000000} \
	    -timeout {election_retry 2000000} \
	    -timeout {full_election 60000000} \
	    -timeout {election 5000000} -timeout {ack 3000000}"

	# Site A creates the group; B through E each join via A and are
	# awaited with await_startup_done before the next site is started.
	set enva [eval $cmda]
	eval $enva repmgr $common_mgr \
	    -local {[list localhost $porta creator]}
	puts -nonewline "." ; flush stdout

	set envb [eval $cmdb]
	eval $envb repmgr $common_mgr \
	    -local {[list localhost $portb]} -remote {[list localhost $porta]}
	await_startup_done $envb
	puts -nonewline "." ; flush stdout

	set envc [eval $cmdc]
	eval $envc repmgr $common_mgr \
	    -local {[list localhost $portc]} -remote {[list localhost $porta]}
	await_startup_done $envc
	puts -nonewline "." ; flush stdout

	set envd [eval $cmdd]
	eval $envd repmgr $common_mgr \
	    -local {[list localhost $portd]} -remote {[list localhost $porta]}
	await_startup_done $envd
	puts -nonewline "." ; flush stdout

	set enve [eval $cmde]
	eval $enve repmgr $common_mgr \
	    -local {[list localhost $porte]} -remote {[list localhost $porta]}
	await_startup_done $enve
	puts "."

	# Close every site so that the next phase is a cold boot.
	$enve close
	$envd close
	$envc close
	$envb close
	$enva close

	# Cold boot the group (with or without site E), giving site A a
	# high priority.
	#
	# The wait limits are intended to be far larger than the expected
	# timeout; they exist only to keep the test from hanging forever.
	# The leeway amount should be enough less than the timeout to allow
	# for any imprecision introduced by the test mechanism.
	#
	set elect_wait_limit 25
	set full_secs_leeway 59
	set full_wait_limit 85

	puts "\tRepmgr$tnum.a: Start first four sites."
	set enva [eval $cmda]
	eval $enva repmgr $common_mgr -pri 200 -local {[list localhost $porta]}

	set envb [eval $cmdb]
	eval $envb repmgr $common_mgr -pri 100 -local {[list localhost $portb]}

	set envc [eval $cmdc]
	eval $envc repmgr $common_mgr -pri 90 -local {[list localhost $portc]}

	set envd [eval $cmdd]
	eval $envd repmgr $common_mgr -pri 80 -local {[list localhost $portd]}

	if { $client_down } {
		# Sentinel meaning "site E has no open handle".
		set enve NONE
	} else {
		puts "\tRepmgr$tnum.b: Start fifth site."
		set enve [eval $cmde]
		eval $enve repmgr $common_mgr -pri 50 \
		    -local {[list localhost $porte]}
	}

	# Wait for results, and make sure they're correct.  Site A comes
	# first in the list because it is the expected winner.
	#
	set envlist [list $enva $envb $envc $envd]
	if { $enve != "NONE" } {
		lappend envlist $enve
	}
	set limit $full_wait_limit
	puts "\tRepmgr$tnum.c: wait (up to $limit seconds) for first election."
	set t [repmgr026_await_election_result $envlist $limit]

	if { $client_down } {
		# With a site missing, the election cannot finish before the
		# full-election timeout has run out.
		error_check_good slow_election \
		    [expr {$t > $full_secs_leeway}] 1
	} else {
		# When all sites participate, the election should finish in way
		# less than 60 seconds.
		#
		error_check_good timely_election \
		    [expr {$t < $full_secs_leeway}] 1
	}
	puts "\tRepmgr$tnum.d: first election completed in $t seconds"

	# Clear each site's recorded events after the first election so the
	# checks for the second election do not see them.
	puts "\tRepmgr$tnum.e: wait for start-up done"
	$enva event_info -clear
	await_startup_done $envb
	$envb event_info -clear
	await_startup_done $envc
	$envc event_info -clear
	await_startup_done $envd
	$envd event_info -clear
	if { $enve != "NONE" } {
		await_startup_done $enve
		$enve event_info -clear
	}

	# Shut down site A, in order to test elections with less than the whole
	# group voting.  However, normally repmgr's reaction to losing master
	# connection is to try a "fast election" (the n-1 trick).  So we must do
	# something to mitigate that (see below).
	#
	puts "\tRepmgr$tnum.f: shut down master site A"
	if { $client_down } {
		# The fifth site is already down, so now we'll have just B, C,
		# and D running.  Therefore, even with repmgr pulling its "fast
		# election" (n-1) trick, we don't have enough votes for a
		# full-participation short circuit; so this is a valid test of
		# the "normal" election timeout.
		#
		$enva close
	} else {
		# Here all sites are running, so if we just killed the master
		# repmgr would invoke its "fast election" trick, resulting in no
		# timeout.  Since the purpose of this test is to ensure the
		# correct use of timeouts, that's no good.  Instead, let's first
		# kill one more other site.
		$enve close
		$enva close
	}

	# Wait for results, and check them.  Site B is listed first, so it is
	# the expected winner.  (The progress labels go from .f to .h; they
	# are left as they were.)
	#
	set envlist [list $envb $envc $envd]
	set limit $elect_wait_limit
	puts "\tRepmgr$tnum.h: wait (up to $limit seconds) for second election."
	set t [repmgr026_await_election_result $envlist $limit]
	error_check_good normal_election [expr {$t < $full_secs_leeway}] 1
	puts "\tRepmgr$tnum.i: second election completed in $t seconds"

	$envd close
	$envc close
	$envb close
}
# Wait (a limited amount of time) for the election to finish.
#
# envlist	List of env handles.  The first handle is the expected winner,
#		and the others are the remaining clients.
# limit		Maximum time to wait, in seconds.
#
# Checks repmgr026_is_ready repeatedly, calling "tclsleep 1" between checks.
# Returns the approximate amount of time (in seconds) that the election took.
# Raises an error ("FAIL: time limit exceeded") if more than limit seconds
# pass before the election result is seen.
#
proc repmgr026_await_election_result { envlist limit } {
	set begin [clock seconds]
	set deadline [expr {$begin + $limit}]

	while { true } {
		# Sample the clock before checking readiness, so the elapsed
		# time returned does not include the cost of the check itself.
		set t [clock seconds]
		if { $t > $deadline } {
			error "FAIL: time limit exceeded"
		}
		if { [repmgr026_is_ready $envlist] } {
			return [expr {$t - $begin}]
		}
		tclsleep 1
	}
}
# Report whether the election has produced the expected outcome.
#
# envlist	List of env handles; the first is the expected winner, and
#		the rest are the other participating sites.
#
# Returns true only when is_elected holds for the first handle and
# is_event_present reports a "newmaster" event for each remaining handle;
# returns false otherwise.
#
proc repmgr026_is_ready { envlist } {
	set expected_master [lindex $envlist 0]

	if { [is_elected $expected_master] } {
		# The winner is in place; now every other site must have
		# noticed the new master.
		foreach site [lrange $envlist 1 end] {
			if { ![is_event_present $site newmaster] } {
				return false
			}
		}
		return true
	}
	return false
}