libdb/test/tcl/repmgr029.tcl
2012-11-14 15:13:24 -05:00

1748 lines
56 KiB
Tcl

# See the file LICENSE for redistribution information.
#
# Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
#
# $Id$
#
# TEST repmgr029
# TEST Test repmgr group membership: create, join, re-join and remove from
# TEST repmgr group and observe changes in group membership database.
# TEST
proc repmgr029 { } {
# Driver: run every group-membership sub-scenario (z1 .. z19) in turn.
# Each z* proc is a self-contained test case that cleans $testdir,
# builds its own replication group, and closes all environments before
# returning, so the ordering below carries no state between cases.
puts "Repmgr029: Repmgr Group Membership operations."
z1
z2
z3
z4
z5
z6
z7
z8
z9
z10
z11
z12
z13
z14
z15
z16
z17
z18
z19
}
# See that a joining site that names a non-master as helper gets a
# "forward" response, and manages to then get to the true master.
#
# Note: there's a bit of a race here, depending on the condition of
# site B at the time C tries to join. That should eventually be
# tightened up.
proc z3 {} {
# Exercise the join/remove life cycle against the group membership
# database (GMDB): a direct join at the master, a forwarded join via a
# non-master helper, rejected joins when acks are unavailable (leaving
# the joiner in "adding" limbo), limbo resolution once enough sites are
# back up, and finally removal of a downed site.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {port0 port1 port2 port3 port4 port5} [available_ports 6] {}
set masterdir $testdir/MASTERDIR
set clientdir $testdir/CLIENTDIR
set clientdir2 $testdir/CLIENTDIR2
set clientdir3 $testdir/CLIENTDIR3
set clientdir4 $testdir/CLIENTDIR4
set clientdir5 $testdir/CLIENTDIR5
file mkdir $masterdir
file mkdir $clientdir
file mkdir $clientdir2
file mkdir $clientdir3
file mkdir $clientdir4
file mkdir $clientdir5
puts "\tRepmgr029.z3.a: Primordial creation, Start Master site 0"
set env1 [berkdb env -create -errpfx MASTER -home $masterdir -txn \
-rep -thread -recover -verbose [list rep $rv]]
$env1 repmgr -local [list 127.0.0.1 $port0] -start master
error_check_good nsites_A [$env1 rep_get_nsites] 1
puts "\tRepmgr029.z3.b: Simple join request, \
client 1 points directly at master"
set env2 [berkdb env -create -errpfx CLIENT -home $clientdir -txn \
-rep -thread -recover -verbose [list rep $rv]]
$env2 rep_config {mgr2sitestrict on}
$env2 repmgr -local [list 127.0.0.1 $port1] \
-remote [list 127.0.0.1 $port0] -start client
await_startup_done $env2
error_check_good nsites_A2 [$env1 rep_get_nsites] 2
error_check_good nsites_B2 [$env2 rep_get_nsites] 2
puts "\tRepmgr029.z3.c: Join request forwarding, start client 2."
set env3 [berkdb env -create -errpfx CLIENT2 -home $clientdir2 -txn \
-rep -thread -recover -verbose [list rep $rv]]
$env3 rep_config {mgr2sitestrict on}
# Client 2 deliberately names client 1 (a non-master) as its helper;
# the join request must be forwarded on to the true master.
$env3 repmgr -local [list 127.0.0.1 $port2] \
-remote [list 127.0.0.1 $port1]
# Retry -start until the forwarded join goes through (see the race
# note above this proc).
set done no
while {!$done} {
if {[catch {$env3 repmgr -start client} msg]} {
puts $msg
tclsleep 1
} else {
set done yes
}
}
await_startup_done $env3
error_check_good nsites_A3 [$env1 rep_get_nsites] 3
error_check_good nsites_B3 [$env2 rep_get_nsites] 3
error_check_good nsites_C3 [$env3 rep_get_nsites] 3
puts "\tRepmgr029.z3.d: Master cannot be removed \
(by itself, or as requested from a client)"
set ret [catch {$env1 repmgr -remove [list 127.0.0.1 $port0]} result]
error_check_bad no_failure $ret 0
error_check_match unavail $result "*DB_REP_UNAVAIL*"
set ret [catch {$env2 repmgr -remove [list 127.0.0.1 $port0]} result]
error_check_bad no_failure2 $ret 0
error_check_match unavail2 $result "*DB_REP_UNAVAIL*"
# Open the master's membership database directly, so the test can
# inspect GMDB versions and per-site status below.
set db [berkdb open -env $env1 -thread __db.rep.system __db.membership]
puts "\tRepmgr029.z3.e: Join request rejected for lack of acks"
puts "\t\tRepmgr029.z3.e.1: Close client 1 and 2."
error_check_good s_3_close [$env3 close] 0
error_check_good s_2_close [$env2 close] 0
puts "\t\tRepmgr029.z3.e.2: Start client 3."
set env4 [berkdb env -create -errpfx CLIENT3 -home $clientdir3 \
-txn -rep -thread -recover -verbose [list rep $rv]]
# With both clients down the master cannot gather acks, so the join
# must fail with DB_REP_UNAVAIL ...
set ret [catch {$env4 repmgr -local [list 127.0.0.1 $port3] \
-remote [list 127.0.0.1 $port0] -start client} result]
error_check_bad no_failure3 $ret 0
error_check_match unavail3 $result "*DB_REP_UNAVAIL*"
set prev_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.e.3: Check previous GMDB version $prev_vers"
# ... yet the site is recorded in the GMDB in the "adding" (limbo)
# state.  Status values asserted below: 1 = adding, 4 = present.
set SITE_ADDING 1
set SITE_PRESENT 4
error_check_good site_3_adding [repmgr029_gmdb_status $db 127.0.0.1 $port3] \
$SITE_ADDING
puts "\t\tRepmgr029.z3.e.4: limbo resolution, restart client 1."
set env2 [berkdb env -create -errpfx CLIENT -home $clientdir -txn \
-rep -thread -recover -verbose [list rep $rv] -event]
# no helper should be needed this time.
$env2 repmgr -local [list 127.0.0.1 $port1] -start client
await_startup_done $env2 50
puts "\t\tRepmgr029.z3.e.5: normal txn at master"
# A normal transaction at the master gives it the chance to commit the
# pending membership change, bumping the GMDB version by exactly one.
set niter 1
rep_test btree $env1 NULL $niter 0 0 0
set new_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.e.6: NEW GMDB version $new_vers"
error_check_good version_incr $new_vers [expr $prev_vers + 1]
error_check_good site_3_added [repmgr029_gmdb_status $db 127.0.0.1 $port3] \
$SITE_PRESENT
puts "\t\tRepmgr029.z3.e.7: client 3 rejoins."
$env4 repmgr -start client
await_startup_done $env4 60
# To verify that the GMDB has been updated on client side.
puts "\t\tRepmgr029.z3.e.8: Verify the GMDB on the client 3."
set db3 [berkdb open -env $env4 -thread __db.rep.system __db.membership]
error_check_good vers [repmgr029_gmdb_version $db3] $new_vers
$db3 close
# This test case verify a scenario where (1) try another (different)
# join request, still with insufficient acks, and see that it doesn't
# load up another limbo; and then (2) with acks working,
# a second request finishes off the first and then succeeds.
# I guess we also need to try simply retrying the first addition.
puts "\tRepmgr029.z3.f: Join request rejected for lack of acks"
puts "\t\tRepmgr029.z3.f.1: Close client 1."
error_check_good s_1_close [$env2 close] 0
set prev_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.f.2: Check current GMDB version $prev_vers"
puts "\t\tRepmgr029.z3.f.3: Start client 4."
set env5 [berkdb env -create -errpfx CLIENT4 -home $clientdir4 \
-txn -rep -thread -recover -verbose [list rep $rv]]
set ret [catch {$env5 repmgr -local [list 127.0.0.1 $port4] \
-remote [list 127.0.0.1 $port0] -start client} result]
error_check_bad no_failure4 $ret 0
error_check_match unavail4 $result "*DB_REP_UNAVAIL*"
set prev_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.f.4: Check current GMDB version $prev_vers"
error_check_good site_4_adding [repmgr029_gmdb_status $db 127.0.0.1 $port4] \
$SITE_ADDING
puts "\t\tRepmgr029.z3.f.5: Start client 5."
set env6 [berkdb env -create -errpfx CLIENT5 -home $clientdir5 \
-txn -rep -thread -recover -verbose [list rep $rv]]
set ret [catch {$env6 repmgr -local [list 127.0.0.1 $port5] \
-remote [list 127.0.0.1 $port0] -start client} result]
error_check_bad no_failure5 $ret 0
error_check_match unavail5 $result "*DB_REP_UNAVAIL*"
set prev_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.f.6: Check current GMDB version $prev_vers"
# [M]: There is no gm status for client 5 so far. Let alone the "ADDING".
#error_check_good site_5_adding [repmgr029_gmdb_status $db 127.0.0.1 $port5] \
# $SITE_ADDING
puts "\t\tRepmgr029.z3.f.7: limbo resolution, restart client 1."
set env2 [berkdb env -create -errpfx CLIENT -home $clientdir -txn \
-rep -thread -recover -verbose [list rep $rv] -event]
# no helper should be needed this time.
$env2 repmgr -local [list 127.0.0.1 $port1] -start client
await_startup_done $env2 50
puts "\t\tRepmgr029.z3.f.8: normal txn at master"
set niter 1
rep_test btree $env1 NULL $niter 0 0 0
set new_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.f.9: NEW GMDB version $new_vers"
error_check_good version_incr $new_vers [expr $prev_vers + 1]
puts "\t\tRepmgr029.z3.f.10: client 5 rejoins."
$env6 repmgr -start client
await_startup_done $env6 60
set new_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.f.11: NEW GMDB version $new_vers"
# Check for client 5, which has gm status as "ADDED"
error_check_good site_5_added [repmgr029_gmdb_status $db 127.0.0.1 $port5] \
$SITE_PRESENT
#[M]: So far gm status for client 4 is "ADDED"
error_check_good site_4_added [repmgr029_gmdb_status $db 127.0.0.1 $port4] \
$SITE_PRESENT
#[M]: We'd like to check the gm status on the client 4 sides.
# No Way! as client 4 has not been start up and sync.
# puts "\t\tRepmgr029.z3.e.8: Verify the GMDB on the client 4."
# set db4 [berkdb open -env $env5 -thread __db.rep.system __db.membership]
# error_check_good vers [repmgr029_gmdb_version $db4] $new_vers
puts "\t\tRepmgr029.z3.f.12: client 4 rejoins."
$env5 repmgr -start client
await_startup_done $env5 100
set new_vers [repmgr029_gmdb_version $db]
puts "\t\tRepmgr029.z3.f.13: NEW GMDB version $new_vers"
puts "\tRepmgr029.z3.h: Remove (downed) client 3, from master"
# Client 3 (env4's site) is still registered; removing it from the
# master must leave it with no GMDB status record at all (0).
$env1 repmgr -remove [list 127.0.0.1 $port3]
error_check_good site_3_removed [repmgr029_gmdb_status $db 127.0.0.1 $port3] 0
error_check_good db_close [$db close] 0
error_check_good s_1_close [$env2 close] 0
error_check_good s_3_close [$env4 close] 0
error_check_good s_4_close [$env5 close] 0
error_check_good s_5_close [$env6 close] 0
error_check_good s_0_close [$env1 close] 0
puts "\tRepmgr029.z3.i: End OF Repmgr029"
}
# Remove a live site from a group, and see that the site gets a
# LOCAL_SITE_REMOVED event, and the other sites get SITE_REMOVED.
#
proc z6 { } {
# Remove a live site (C) via a request that originates at another
# client (B).  Verify that C receives LOCAL_SITE_REMOVED, that A and B
# each fire SITE_REMOVED with the correct EID for C, and that C drops
# out of everyone's site list and the master's GMDB.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts -nonewline "\tRepmgr029.z6.a: Build basic 3-site group"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-verbose [list rep $rv] -event]
$envA repmgr -local [list 127.0.0.1 $portA creator] -start elect
error_check_good nsites_a [$envA rep_get_nsites] 1
puts -nonewline "."; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start elect
await_startup_done $envB
error_check_good nsites_b [$envB rep_get_nsites] 2
puts -nonewline "."; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-verbose [list rep $rv] -event]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start elect
await_startup_done $envC
error_check_good nsites_c [$envC rep_get_nsites] 3
puts "."; flush stdout
# Capture C's environment ID as seen from each surviving site; the
# SITE_REMOVED events below must report these exact EIDs.
set eid_C_at_A [repmgr029_get_eid $envA $portC]
set eid_C_at_B [repmgr029_get_eid $envB $portC]
puts "\tRepmgr029.z6.b: Remove (live) site C from a request originating at B."
$envB repmgr -remove [list 127.0.0.1 $portC]
set db [berkdb open -env $envA -thread __db.rep.system __db.membership]
error_check_good site_c_removed [repmgr029_gmdb_status $db 127.0.0.1 $portC] 0
set master_ev [find_event [$envA event_info] site_removed]
error_check_good site_a_event [llength $master_ev] 2
error_check_good site_a_event_eid [lindex $master_ev 1] $eid_C_at_A
error_check_good site_a_list [llength [repmgr029_get_eid $envA $portC]] 0
await_event $envC local_site_removed
error_check_good s_c_close [$envC close] 0
# B learns of the removal asynchronously; wait until C vanishes from
# B's site list before checking B's event.
await_condition {[expr [string length [repmgr029_site_list_status $envB $portC]] == 0]}
set b_ev [find_event [$envB event_info] site_removed]
error_check_good site_b_event [llength $b_ev] 2
error_check_good site_b_event_eid [lindex $b_ev 1] $eid_C_at_B
error_check_good site_b_list [llength [repmgr029_get_eid $envB $portC]] 0
error_check_good s_b_close [$envB close] 0
$db close
error_check_good s_a_close [$envA close] 0
}
# See that SITE_ADDED events are fired appropriately.
proc z8 { } {
# See that SITE_ADDED events are fired appropriately: when B joins,
# only the master (A) has anyone to notify; when C joins, both A and B
# must fire SITE_ADDED carrying the new site's EID.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts "\tRepmgr029.z8: Create primordial site"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-verbose [list rep $rv] -event]
$envA repmgr -local [list 127.0.0.1 $portA creator] -start elect
puts "\tRepmgr029.z8: Add client, check for event at master"
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start elect
set ev [find_event [$envA event_info] site_added]
error_check_good ev_a [llength $ev] 2
set eid [lindex $ev 1]
error_check_good ev_a_eid $eid [repmgr029_get_eid $envA $portB]
await_startup_done $envB
puts "\tRepmgr029.z8: Add another client, check for events at both other sites"
# Clear A's and B's pending events so the checks below can only match
# events generated by C's join.
$envA event_info -clear
$envB event_info -clear
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-verbose [list rep $rv] -event]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start elect
await_startup_done $envC
set ev [find_event [$envA event_info] site_added]
error_check_good ev_a2 [llength $ev] 2
set eid [lindex $ev 1]
error_check_good ev_a_eid2 $eid [repmgr029_get_eid $envA $portC]
# B is not the master, so its SITE_ADDED may lag; wait for it.
await_event $envB site_added
set ev [find_event [$envB event_info] site_added]
error_check_good ev_b [llength $ev] 2
set eid [lindex $ev 1]
error_check_good ev_b_eid $eid [repmgr029_get_eid $envB $portC]
$envC close
$envB close
$envA close
}
# Remove a site, starting at the site to be removed. See that we at least shut
# down threads (if not also fire event in this case).
#
proc z7 { } {
# Remove a site with the request issued at the site being removed (B
# removes itself).  B must fire LOCAL_SITE_REMOVED and shut down its
# repmgr threads; the master must fire SITE_REMOVED with B's EID and
# drop B from its site list.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts -nonewline "\tRepmgr029.z7: Set up a group of 3, A (master), B, C"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
puts -nonewline "." ; flush stdout
# Record B's EID as seen at the master before the removal.
set eid_B_at_A [repmgr029_get_eid $envA $portB]
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
puts "."
puts "\tRepmgr029.z7: Remove site B itself"
$envB repmgr -remove [list 127.0.0.1 $portB]
await_event $envB local_site_removed
set master_ev [find_event [$envA event_info] site_removed]
error_check_good site_a_event_eid [lindex $master_ev 1] $eid_B_at_A
error_check_good site_a_list [llength [repmgr029_get_eid $envA $portB]] 0
$envB close
$envC close
$envA close
}
# See that a join request is rejected if insufficient acks. (It should
# remain in the db as "adding" though, and apps should be able to query
# nsites to find out that it's been incremented.)
#
proc z4 {} {
# See that a join request is rejected (DB_REP_UNAVAIL) when the master
# cannot collect enough acks, and that the join can be completed later
# by simply retrying -start once enough sites are back up.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {port0 port1 port2} [available_ports 3] {}
set masterdir $testdir/MASTERDIR
set clientdir $testdir/CLIENTDIR
set clientdir2 $testdir/CLIENTDIR2
file mkdir $masterdir
file mkdir $clientdir
file mkdir $clientdir2
puts -nonewline "\tRepmgr029.z4.a: Start the master."
set env1 [berkdb_env -create -errpfx MASTER -home $masterdir \
-txn -rep -thread -recover -verbose [list rep $rv]]
$env1 repmgr -local [list 127.0.0.1 $port0] -start master
error_check_good nsites_1 [$env1 rep_get_nsites] 1
puts "."; flush stdout
puts -nonewline "\tRepmgr029.z4.b: Start first client."
set env2 [berkdb_env -create -errpfx CLIENT -home $clientdir -txn \
-rep -thread -recover -verbose [list rep $rv]]
$env2 rep_config {mgr2sitestrict on}
$env2 repmgr -local [list 127.0.0.1 $port1] \
-remote [list 127.0.0.1 $port0] -start client
await_startup_done $env2
error_check_good nsites_2 [$env2 rep_get_nsites] 2
puts "."; flush stdout
puts "\tRepmgr029.z4.c: Close the first client."
error_check_good s_2_close [$env2 close] 0
puts "\tRepmgr029.z4.d: Start the second client."
set env3 [berkdb_env -create -errpfx CLIENT2 -home $clientdir2 \
-txn -rep -thread -recover -verbose [list rep $rv]]
$env3 rep_config {mgr2sitestrict on}
# With the only other client down, the master cannot get the join
# acknowledged, so this -start must fail with DB_REP_UNAVAIL.
set ret [catch {$env3 repmgr -local [list 127.0.0.1 $port2] \
-remote [list 127.0.0.1 $port0] -start client} result]
error_check_bad no_failure $ret 0
error_check_match unavail $result "*DB_REP_UNAVAIL*"
puts "\tRepmgr029.z4.e: The second join failed as expected, \
since the first client is down"
puts -nonewline "\tRepmgr029.z4.f: restart the first client."
set env2 [berkdb_env -errpfx CLIENT -home $clientdir -txn -rep \
-thread -recover -create -verbose [list rep $rv]]
$env2 rep_config {mgr2sitestrict on}
$env2 repmgr -local [list 127.0.0.1 $port1] -start client
await_startup_done $env2
puts "."; flush stdout
puts "\tRepmgr029.z4.g: try to join the second client again"
# The first retry may still race the restarted client; allow one
# extra pause-and-retry before giving up.
if {[catch {$env3 repmgr -start client} result] && \
[string match "*REP_UNAVAIL*" $result]} {
puts "\tRepmgr029.z4.h: pause and try again"
tclsleep 3
$env3 repmgr -start client
}
await_startup_done $env3 100
error_check_good nsites_3 [$env3 rep_get_nsites] 3
error_check_good s_3_close [$env3 close] 0
error_check_good s_2_close [$env2 close] 0
error_check_good s_1_close [$env1 close] 0
}
# Cold-boot an established group, without specifying any remote sites, and see
# that they can elect a master (demonstrating that they have recorded each
# others' addresses).
#
proc z5 {} {
	global rep_verbose
	global testdir

	# Map the global verbosity flag onto repmgr's on/off setting.
	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {port0 port1 port2} [available_ports 3] {}

	# One directory per site.
	set dirMaster $testdir/MASTERDIR
	set dirClient $testdir/CLIENTDIR
	set dirClient2 $testdir/CLIENTDIR2
	file mkdir $dirMaster
	file mkdir $dirClient
	file mkdir $dirClient2

	# Phase 1: build a 3-site group the ordinary way, each client
	# pointing at the master as its helper.
	puts -nonewline "\tRepmgr029.z5.a: Set up a group of 3, one master and two clients."
	set menv [berkdb env -create -errpfx MASTER -home $dirMaster \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$menv repmgr -local [list 127.0.0.1 $port0] -start master
	error_check_good nsites_1 [$menv rep_get_nsites] 1
	puts -nonewline "." ; flush stdout

	set cenv [berkdb env -create -errpfx CLIENT -home $dirClient \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$cenv repmgr -local [list 127.0.0.1 $port1] \
	    -remote [list 127.0.0.1 $port0] -start client
	await_startup_done $cenv
	error_check_good nsites_2 [$cenv rep_get_nsites] 2
	puts -nonewline "." ; flush stdout

	set cenv2 [berkdb env -create -errpfx CLIENT2 -home $dirClient2 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$cenv2 repmgr -local [list 127.0.0.1 $port2] \
	    -remote [list 127.0.0.1 $port0] -start client
	await_startup_done $cenv2
	error_check_good nsites_3 [$menv rep_get_nsites] 3
	puts "." ; flush stdout

	# Phase 2: cold-boot.  Shut everything down, then restart every
	# site with -start elect and NO -remote helper: each site must
	# locate the others purely from its stored membership data.
	puts "\tRepmgr029.z5: Shut down all sites and then restart with election"
	error_check_good s_2_close [$cenv close] 0
	error_check_good s_3_close [$cenv2 close] 0
	error_check_good s_1_close [$menv close] 0

	set menv [berkdb env -create -errpfx A -home $dirMaster \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$menv repmgr -local [list 127.0.0.1 $port0] -start elect -pri 100
	set cenv [berkdb env -create -errpfx B -home $dirClient \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$cenv repmgr -local [list 127.0.0.1 $port1] -start elect -pri 200
	set cenv2 [berkdb env -create -errpfx C -home $dirClient2 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$cenv2 repmgr -local [list 127.0.0.1 $port2] -start elect -pri 140

	puts "\tRepmgr029.z5: Wait for election to choose a new master"
	await_condition {[repmgr029_known_master $menv $cenv $cenv2]}

	# Every site must still agree the group has 3 members.
	error_check_good nsites_1 [$menv rep_get_nsites] 3
	error_check_good nsites_2 [$cenv rep_get_nsites] 3
	error_check_good nsites_3 [$cenv2 rep_get_nsites] 3

	error_check_good s_3_close [$cenv2 close] 0
	error_check_good s_1_close [$menv close] 0
	error_check_good s_2_close [$cenv close] 0
}
# Remove a site while it is disconnected, and see if it can get an event when it
# tries to reconnect. (2nd try)
proc z2 { } {
# Remove site D while it is disconnected, then cold-boot the group and
# see that D eventually notices its own removal (LOCAL_SITE_REMOVED)
# when it tries to reconnect.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC portD portE} [available_ports 5] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
set dirD $testdir/D
set dirE $testdir/E
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
file mkdir $dirD
file mkdir $dirE
puts -nonewline "\tRepmgr029.z2.a: Set up a group of 5: A, B, C, D, E"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
error_check_good nsites_a [$envA rep_get_nsites] 1
puts -nonewline "."; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
error_check_good nsites_b [$envB rep_get_nsites] 2
puts -nonewline "."; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
error_check_good nsites_c [$envC rep_get_nsites] 3
puts -nonewline "." ; flush stdout
# It is ideal to increase the await time when the group size is large.
set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envD repmgr -local [list 127.0.0.1 $portD] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envD 100
error_check_good nsites_d [$envD rep_get_nsites] 4
puts -nonewline "." ; flush stdout
set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envE repmgr -local [list 127.0.0.1 $portE] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envE 200
error_check_good nsites_e [$envE rep_get_nsites] 5
puts "." ; flush stdout
puts "\tRepmgr029.z2.b: shut down sites D and E"
error_check_good s_d_close [$envD close] 0
error_check_good s_e_close [$envE close] 0
puts "\tRepmgr029.z2.c: remove site D from the group"
# D is down, so it cannot learn of its own removal yet; an empty
# site-list status at A confirms the removal took effect there.
$envA repmgr -remove [list 127.0.0.1 $portD]
error_check_good rm_at_a \
[string length [repmgr029_site_list_status $envA $portD]] 0
puts "\tRepmgr029.z2.d: shut down all remaining sites"
error_check_good s_b_close [$envB close] 0
error_check_good s_c_close [$envC close] 0
error_check_good s_a_close [$envA close] 0
puts -nonewline "\tRepmgr029.z2.e: start up just D and E \
(neither of which know that D has been removed)"
set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envD repmgr -local [list 127.0.0.1 $portD] -start elect\
-timeout {connection_retry 2000000}
# Should comments out the await here, otherwise, envD cannot join
#await_startup_done $envD
puts -nonewline "."; flush stdout
set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envE repmgr -local [list 127.0.0.1 $portE] -start elect
# Wait until D has processed at least one message, proving D and E
# have connected to each other.
await_condition {[expr [stat_field $envD \
rep_stat "Messages processed"] > 0]}
puts "."; flush stdout
puts -nonewline "\tRepmgr029.z2.f: Start sites A, B, and C"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA] -start elect -pri 200
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] -start elect -pri 150
puts -nonewline "." ; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] -start elect -pri 100
await_startup_done $envC 1000
puts "." ; flush stdout
puts "\tRepmgr029.z2.g: wait for site D to notice that it has been removed"
await_event $envD local_site_removed
# Yikes! This is not going to work! Site D will be rejected before it
# gets a chance to have its database updated! :-(
# [M] NOW: A is the master, B, C, E get sync with A, but D does not.
error_check_good s_d_close [$envD close] 0
error_check_good s_e_close [$envE close] 0
error_check_good s_c_close [$envC close] 0
error_check_good s_b_close [$envB close] 0
error_check_good s_a_close [$envA close] 0
}
# Remove a site while it is down. When it starts up again, it should rejoin.
proc z1 { } {
# Remove a site while it is down.  On restart, if acks are available,
# the removed site should auto-rejoin (and must NOT see a
# LOCAL_SITE_REMOVED event).  If the auto-rejoin cannot get acks, the
# site should instead shut down and fire LOCAL_SITE_REMOVED.
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts -nonewline "\tRepmgr029.z1.a: Set up a group of 3: A, B, C"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
error_check_good nsitesA [$envA rep_get_nsites] 1
puts -nonewline "."; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
error_check_good nsitesB [$envB rep_get_nsites] 2
puts -nonewline "."; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
error_check_good nsitesC [$envC rep_get_nsites] 3
puts "."; flush stdout
puts "\tRepmgr029.z1.b: Shut down site B, and remove it from the group."
error_check_good s_b_close [$envB close] 0
$envA repmgr -remove [list 127.0.0.1 $portB]
error_check_good rm_at_a \
[string length [repmgr029_site_list_status $envA $portB]] 0
puts "\tRepmgr029.z1.c: restart B"
# B starts with no -remote helper; it reconnects from its stored
# membership data and (with A and C up) should auto-rejoin cleanly.
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-timeout {connection_retry 2000000} -start client
await_startup_done $envB
# make sure we haven't fired a LOCAL_SITE_REMOVED event to B
set ev [find_event [$envB event_info] local_site_removed]
error_check_good site_b_not_removed [string length $ev] 0
# Now try it again, only this time the auto-rejoin fails due to lack of
# acks, so B should shut down and fire LOCAL_SITE_REMOVED event. TODO:
# should we have some sort of stat query so that the application can
# tell whether threads are running? Or is that just redundant with the
# event?
puts "\tRepmgr029.z1.d: shut down and remove site B again"
error_check_good s_b_close [$envB close] 0
$envA repmgr -remove [list 127.0.0.1 $portB]
puts "\tRepmgr029.z1.e: shut down site C, and then restart B"
error_check_good s_c_close [$envC close] 0
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-timeout {connection_retry 2000000} -start client
await_event $envB local_site_removed
error_check_good s_b_close [$envB close] 0
error_check_good s_a_close [$envA close] 0
}
# Test "sharing", by constructing a situation where a site that's been down for
# a while has an obsolete, too-high notion of nsites. On a cold boot, if that
# site is needed, it would spoil the election by requiring too many votes,
# unless it gets a hint from other sites.
#
# Create a group of 6 sites, A, B, C, D, E, F. Make sure F knows nsites is 6;
# then shut it down. Remove E; now nsites is 5 (A, B, C, D, f). Then remove D;
# nsites is 4 (A, B, C, f). Now shut down everyone, and then reboot only A, B,
# and F (leave C down). Try to elect a master.
#
proc z9 { } {
	# Test "sharing" of the group size: F is shut down believing
	# nsites is 6; two removals (E, D) shrink the group to 4 behind
	# F's back.  On a cold boot of A, B, and F only, the election can
	# succeed only if F is advised of the reduced group size.
	global rep_verbose
	global testdir

	# Derive the repmgr verbosity setting with the same if-pattern as
	# the other z* procs.  The previous form,
	#     set rv [ expr $rep_verbose ? on : off ]
	# was a runtime Tcl error: expr does not accept barewords
	# ("invalid bareword"), so the proc aborted immediately.
	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC portD portE portF} [available_ports 6] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C
	set dirD $testdir/D
	set dirE $testdir/E
	set dirF $testdir/F
	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC
	file mkdir $dirD
	file mkdir $dirE
	file mkdir $dirF

	# Build the 6-site group.  Note: each env now gets its own error
	# prefix (C, D, E, F); previously all four reused -errpfx B,
	# making their diagnostic output indistinguishable.
	puts -nonewline "\tRepmgr029.z9: Set up a group of 6"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list 127.0.0.1 $portA] -start master
	puts -nonewline "." ; flush stdout
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envB repmgr -local [list 127.0.0.1 $portB] \
	    -remote [list 127.0.0.1 $portA] -start client
	await_startup_done $envB
	puts -nonewline "." ; flush stdout
	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envC repmgr -local [list 127.0.0.1 $portC] \
	    -remote [list 127.0.0.1 $portA] -start client
	await_startup_done $envC
	puts -nonewline "." ; flush stdout
	set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envD repmgr -local [list 127.0.0.1 $portD] \
	    -remote [list 127.0.0.1 $portA] -start client
	await_startup_done $envD 30
	puts -nonewline "." ; flush stdout
	set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envE repmgr -local [list 127.0.0.1 $portE] \
	    -remote [list 127.0.0.1 $portA] -start client
	await_startup_done $envE 30
	puts -nonewline "." ; flush stdout
	set envF [berkdb env -create -errpfx F -home $dirF -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envF repmgr -local [list 127.0.0.1 $portF] \
	    -remote [list 127.0.0.1 $portA] -start client
	await_startup_done $envF 40
	puts "."

	# F goes down still believing nsites is 6.
	puts "\tRepmgr029.z9: Shut down site F"
	$envF close
	puts "\tRepmgr029.z9: Remove site E"
	$envE close
	$envA repmgr -remove [list 127.0.0.1 $portE]
	puts "\tRepmgr029.z9: Remove site D"
	$envD close
	$envA repmgr -remove [list 127.0.0.1 $portD]
	puts "\tRepmgr029.z9: Shut down site C"
	$envC close
	puts "\tRepmgr029.z9: Bounce the master"
	$envA close
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list 127.0.0.1 $portA] -start elect
	# We now have a group of 4, with only A and B running. That's not
	# enough to elect a master.
	puts "\tRepmgr029.z9: Restart site F"
	set envF [berkdb env -create -errpfx F -home $dirF -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envF repmgr -local [list 127.0.0.1 $portF] -start elect
	# There are now 3 sites running, in a 4-site group. That should be
	# enough to elect a master, if site F can be advised of the fact that
	# the group size has been reduced.
	# Wait for an election to complete.
	await_condition {[repmgr029_known_master $envA $envF $envB]} 30
	$envA close
	$envB close
	$envF close
}
# See that a membership list gets restored after an interrupted internal init.
proc z10 { } {
# See that a membership list gets restored after an interrupted
# internal init: force C into internal init by archiving away the logs
# it needs, abort the init mid-way (no PAGE messages), then reboot C
# alone and check it still knows the addresses of A and B.
global rep_verbose
global testdir
global tclsh_path
global test_path
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
# Small log files so the churn loop below can cycle logs quickly.
set pagesize 4096
set log_max [expr $pagesize * 8]
puts "\tRepmgr029.z10: Set up a group of 3, A (master), B, C"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
puts "\tRepmgr029.z10: Shut down site C and generate enough churn to force internal init"
set log_endC [get_logfile $envC last]
$envC close
set niter 50
# Keep generating txns and archiving logs at the master until the
# master's first log file is past the last one C ever saw, so C can
# no longer catch up by log replay and must do internal init.
while { [get_logfile $envA first] <= $log_endC } {
$envA test force noarchive_timeout
rep_test btree $envA NULL $niter 0 0 0 -pagesize $pagesize
$envA log_flush
$envA log_archive -arch_remove
}
# Use separate process so that it works even on Windows.
# Inhibit master from sending any PAGE messages.
puts "\tRepmgr029.z10: Restart site C in a separate process"
$envA test abort no_pages
set pid [exec $tclsh_path $test_path/wrap.tcl \
repmgr029script.tcl $testdir/repmgr029script.log $dirC $portC $rv &]
watch_procs $pid 5
puts "\tRepmgr029.z10: Shut down the rest of the group"
$envB close
$envA close
puts "\tRepmgr029.z10: Restart site C alone"
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] -start elect
puts "\tRepmgr029.z10: Check list of known sites, A and B"
# Despite the interrupted init, C's site list must still contain the
# addresses of both A and B.
set l [$envC repmgr_site_list]
foreach p [list $portA $portB] {
set sought [list 127.0.0.1 $p]
error_check_good port$p \
[expr [lsearch -glob $l [concat * $sought *]] >= 0] 1
}
$envC close
}
# See that a client notices a membership change that happens while it is
# disconnected (via the internal init completion trigger).
proc z11 { } {
global rep_verbose
global testdir
global tclsh_path
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC portD} [available_ports 4] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
set dirD $testdir/D
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
file mkdir $dirD
# Small log_max so log churn quickly forces internal init at site C.
set pagesize 4096
set log_max [expr $pagesize * 8]
puts -nonewline "\tRepmgr029.z11: Set up a group initially of size 3"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
puts -nonewline "." ; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
puts "."
puts "\tRepmgr029.z11: Shut down C"
$envC close
# Site D joins while C is disconnected; C must discover this membership
# change later, via the internal init completion trigger.
puts "\tRepmgr029.z11: Join new site D"
set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
-recover -verbose [list rep $rv] -log_max $log_max]
$envD repmgr -local [list 127.0.0.1 $portD] \
-remote [list 127.0.0.1 $portA] -start client
puts "\tRepmgr029.z11: Generate enough churn to force internal init at C later"
set tail [get_logfile $envA last]
set niter 50
while { [get_logfile $envA first] <= $tail } {
$envA test force noarchive_timeout
rep_test btree $envA NULL $niter 0 0 0 -pagesize $pagesize
$envA log_flush
$envA log_archive -arch_remove
}
puts "\tRepmgr029.z11: Restart site C"
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] -start elect
await_startup_done $envC
puts "\tRepmgr029.z11: Check list of known sites"
# After internal init completes, C must know about all of A, B and the
# newly added D.
set l [$envC repmgr_site_list]
foreach p [list $portA $portB $portD] {
set sought [list 127.0.0.1 $p]
error_check_good port$p \
[expr [lsearch -glob $l [concat * $sought *]] >= 0] 1
}
$envC close
$envD close
$envB close
$envA close
}
# Exercise the new connection-related event types.
proc z12 { } {
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
# NOTE(review): portC is allocated but never used in this scenario.
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
file mkdir $dirA
file mkdir $dirB
# Both envs are opened with -event so we can inspect fired events.
puts "\tRepmgr029.z12: Start primordial master site A"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envA repmgr -local [list 127.0.0.1 $portA] -start master \
-timeout {connection_retry 2000000}
puts "\tRepmgr029.z12: Add new client site B"
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -remote [list 127.0.0.1 $portA] \
-local [list 127.0.0.1 $portB] -start client
await_startup_done $envB
# Each side should have fired connection_established carrying the
# other site's environment ID.
puts "\tRepmgr029.z12: Check connection events"
set ev [find_event [$envA event_info] connection_established]
error_check_good ev_len [llength $ev] 2
error_check_good ev_eid [lindex $ev 1] [repmgr029_get_eid $envA $portB]
set ev [find_event [$envB event_info] connection_established]
error_check_good ev_len2 [llength $ev] 2
error_check_good ev_eid2 [lindex $ev 1] [repmgr029_get_eid $envB $portA]
puts "\tRepmgr029.z12: Shut down site B, observe event at site A"
$envB close
# connection_broken's event info is a {eid error-code} pair.
set ev [await_event $envA connection_broken]
error_check_good ev_len3 [llength $ev] 2
set evinfo [lindex $ev 1]
error_check_good ev_len3b [llength $evinfo] 2
foreach {eid err} $evinfo {}
error_check_good ev_eid3 $eid [repmgr029_get_eid $envA $portB]
puts "\t\tRepmgr029.z12: (connection_broken error code is $err)"
set ev [await_event $envA connection_retry_failed]
error_check_good ev_len3 [llength $ev] 2
set evinfo [lindex $ev 1]
error_check_good ev_len3c [llength $evinfo] 2
foreach {eid err} $evinfo {}
error_check_good ev_eid3 $eid [repmgr029_get_eid $envA $portB]
puts "\t\tRepmgr029.z12: (retry_failed error code is $err)"
puts "\tRepmgr029.z12: Shut down site A, then restart B"
$envA close
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] -start elect \
-timeout {connection_retry 2000000}
# new event instances should continue to be fired indefinitely. For
# now, consider '3' to be close enough to infinity.
for { set i 1 } { $i <= 3 } { incr i } {
puts "\tRepmgr029.z12: Observe event ($i)"
set ev [await_event $envB connection_retry_failed]
error_check_good ev_eid4 [lindex $ev 1 0] [repmgr029_get_eid $envB $portA]
error_check_good never_estd \
[string length [find_event [$envB event_info] \
connection_established]] 0
# According to our definition of "connection broken" you can't
# "break" what you never had.
error_check_good never_broken \
[string length [find_event [$envB event_info] \
connection_broken]] 0
$envB event_info -clear
}
$envB close
}
# Make sure applications aren't bothered by perm failed events from failed GMDB
# operations.
proc z13 { } {
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
# Only site A needs -event: we later check which events it fired.
puts -nonewline "\tRepmgr029.z13: Create first 2 sites"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
puts "."
# With B down the GMDB update for C's join cannot be acknowledged, so
# the join must fail with DB_REP_UNAVAIL.
puts "\tRepmgr029.z13: Shut down site B, try to add third site, site C"
$envB close
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
set ret [catch {$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client} result]
error_check_bad no_failure $ret 0
error_check_match unavail $result "*DB_REP_UNAVAIL*"
# The failed internal GMDB transaction must not surface to the
# application as a perm_failed event.
puts "\tRepmgr029.z13: Make sure site A application didn't see a perm failure"
error_check_good no_failure \
[string length [find_event [$envA event_info] perm_failed]] 0
$envC close
$envA close
}
# Make sure we can add/remove sites even when ALL policy is in effect.
proc z14 { } {
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
# Run the whole scenario once for each of the strictest ack policies;
# GMDB operations must still succeed under them.
foreach policy {all allpeers} {
env_cleanup $testdir
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts "\tRepmgr029.z14: Using \"$policy\" ack policy"
puts "\tRepmgr029.z14: Create first site A"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv] -event]
$envA repmgr -local [list 127.0.0.1 $portA] -start master -ack $policy
puts "\tRepmgr029.z14: Add 2nd site, B"
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client -ack $policy
await_startup_done $envB
puts "\tRepmgr029.z14: Add 3rd site, C"
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client -ack $policy
await_startup_done $envC
# Note the remove request is issued at C, not at the master;
# afterwards B must no longer appear in A's site list.
puts "\tRepmgr029.z14: Remove site B"
$envC repmgr -remove [list 127.0.0.1 $portB]
error_check_good removed \
[string length [repmgr029_site_list_status $envA $portB]] 0
$envB close
$envC close
$envA close
}
}
# Rescind a pending (previously incomplete) change, and check effect on nsites.
proc z15 { } {
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC portD portE} [available_ports 5] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
set dirD $testdir/D
set dirE $testdir/E
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
file mkdir $dirD
file mkdir $dirE
puts -nonewline "\tRepmgr029.z15: Create initial group of 4 sites"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-verbose [list rep $rv]]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
puts -nonewline "." ; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
puts -nonewline "." ; flush stdout
set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
-verbose [list rep $rv]]
$envD repmgr -local [list 127.0.0.1 $portD] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envD
puts "."
# With only 2 of 4 sites running, the GMDB update for E's join cannot
# be sufficiently acknowledged: the join fails, but the pending
# addition still bumps nsites to 5 at the running sites.
puts "\tRepmgr029.z15: Shut down C and D, and try to add E"
$envC close
$envD close
set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
-verbose [list rep $rv] -event]
set ret [catch {$envE repmgr -local [list 127.0.0.1 $portE] \
-remote [list 127.0.0.1 $portA] -start client} result]
error_check_bad no_failure $ret 0
error_check_match unavail $result "*DB_REP_UNAVAIL*"
error_check_good nsites [$envA rep_get_nsites] 5
await_condition {[expr [$envB rep_get_nsites] == 5]}
puts "\tRepmgr029.z15: Rescind the addition of site E, by removing it"
$envA repmgr -remove [list 127.0.0.1 $portE]
error_check_good nsites2 [$envA rep_get_nsites] 4
await_condition {[expr [$envB rep_get_nsites] == 4]}
puts -nonewline "\tRepmgr029.z15: Restart sites C and D"
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] -start client
await_startup_done $envC
puts -nonewline "." ; flush stdout
set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envD repmgr -local [list 127.0.0.1 $portD] -start client
await_startup_done $envD
puts "."
puts "\tRepmgr029.z15: Try adding new site E again,\
this time it should succeed"
# Note that it was not necessary to bounce the env handle.
$envE repmgr -start client
error_check_good nsites [$envA rep_get_nsites] 5
await_condition {[expr [$envB rep_get_nsites] == 5]}
await_startup_done $envE
# Now exercise the mirror-image case: an incomplete *removal* leaves
# E's GMDB record in the DELETING state and nsites unchanged.
puts "\tRepmgr029.z15: Shut down C and D again,\
and this time try removing site E"
$envC close
$envD close
set ret [catch {$envA repmgr -remove [list 127.0.0.1 $portE]} result]
error_check_bad no_failure2 $ret 0
error_check_match unavail2 $result "*DB_REP_UNAVAIL*"
error_check_good nsites2 [$envA rep_get_nsites] 5
error_check_good nsites3 [$envB rep_get_nsites] 5
# Inspect the membership database directly to confirm the pending
# DELETING status (status code 2).
set db [berkdb open -env $envA -thread __db.rep.system __db.membership]
set SITE_DELETING 2
error_check_good deleting \
[repmgr029_gmdb_status $db 127.0.0.1 $portE] $SITE_DELETING
$db close
puts "\tRepmgr029.z15: See that site E fired event for as little\
as DELETING status"
await_event $envE local_site_removed
$envE close
puts "\tRepmgr029.z15: Rescind the removal of site E"
# The only way to add site E is to have it start and try to join. Someday
# (maybe even before code freeze) it will be possible to restart the
# zombie carcass in the same env handle.
set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envE repmgr -local [list 127.0.0.1 $portE] -start client
error_check_good nsites4 [$envA rep_get_nsites] 5
error_check_good nsites5 [$envB rep_get_nsites] 5
$envE close
$envB close
$envA close
}
# See that removing a non-existent site acts as a no-op, and doesn't yield an
# error.
proc z16 { } {
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
# Group size will be three, but allocate an extra port to act as the
# non-existent site.
foreach {portA portB portC portD} [available_ports 4] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts -nonewline "\tRepmgr029.z16: Create a group of 3 sites"
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA] -start master
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-verbose [list rep $rv]]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
puts -nonewline "." ; flush stdout
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-verbose [list rep $rv] -event]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
puts "."
# Removing a port that was never part of the group must neither raise
# an error nor change nsites anywhere.
puts "\tRepmgr029.z16: Remove non-existent site"
$envB repmgr -remove [list 127.0.0.1 $portD]
error_check_good nsites [$envA rep_get_nsites] 3
error_check_good nsites [$envB rep_get_nsites] 3
error_check_good nsites [$envC rep_get_nsites] 3
# While we're on the topic of removing sites, let's try having a site
# remove itself.
puts "\tRepmgr029.z16: Have site C remove itself"
$envC repmgr -remove [list 127.0.0.1 $portC]
error_check_good nsites [$envA rep_get_nsites] 2
await_event $envC local_site_removed
$envC close
$envB close
$envA close
}
# Exercise group creation with non-default ack policies.
proc z17 { } {
global rep_verbose
global testdir
set rv off
if { $rep_verbose == 1 } {
set rv on
}
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/A
set dirB $testdir/B
set dirC $testdir/C
# Repeat group creation under every non-default ack policy.
foreach policy {one onepeer all allpeers allavailable none} {
env_cleanup $testdir
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts -nonewline "\tRepmgr029.z17: Create a group of 3 sites using\
`$policy' ack policy"
# Site A declares itself group creator and starts with -start
# elect rather than as an explicit master.
set envA [berkdb env -create -errpfx A -home $dirA -txn \
-rep -thread -verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA creator] \
-start elect -ack $policy
puts -nonewline "." ; flush stdout
set envB [berkdb env -create -errpfx B -home $dirB -txn \
-rep -thread -verbose [list rep $rv] -event]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start elect -ack $policy
await_startup_done $envB
puts -nonewline "." ; flush stdout
# NOTE(review): site C is started without "-ack $policy", so it
# keeps the default ack policy — presumably intentional; verify.
set envC [berkdb env -create -errpfx C -home $dirC -txn \
-rep -thread -verbose [list rep $rv] -event]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start elect
await_startup_done $envC
puts "."
puts "\tRepmgr029.z17: Remove both clients."
$envA repmgr -remove [list 127.0.0.1 $portB]
error_check_good nsites [$envA rep_get_nsites] 2
await_event $envB local_site_removed
$envB close
$envA repmgr -remove [list 127.0.0.1 $portC]
error_check_good nsites [$envA rep_get_nsites] 1
await_event $envC local_site_removed
$envC close
$envA close
}
}
#
# Add a new site to an existing group, already populated via hot backup
# a. Start site A as group creator.
# b. Start site B as client, and wait for it to sync.
# c. Hot back up site B's environment into directory C,
# and start up site C using directory C.
# d. Check membership at site C.
#
proc z18 { } {
global rep_verbose
global testdir
global util_path
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB portC} [available_ports 3] {}
set dirA $testdir/dirA
set dirB $testdir/dirB
set dirC $testdir/dirC
file mkdir $dirA
file mkdir $dirB
file mkdir $dirC
puts -nonewline "\tRepmgr029.z18.a: Start site A as master."
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA creator] \
-start master
error_check_good nsites_A [$envA rep_get_nsites] 1
puts "." ; flush stdout
puts -nonewline "\tRepmgr029.z18.b. Start site B"
set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
-verbose [list rep $rv]]
$envB repmgr -local [list 127.0.0.1 $portB] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envB
error_check_good nsites_B [$envB rep_get_nsites] 2
puts "." ; flush stdout
puts "\tRepmgr029.z18.c.1: Hot backup the site B's environment to $dirC"
# Ensure $dirC is empty before hot backup.
set files [glob -nocomplain $dirC/*]
error_check_good no_files [llength $files] 0
eval exec $util_path/db_hotbackup -vh $dirB -b $dirC
# Site C starts from B's backup, so it already has the data and the
# membership database.
puts "\tRepmgr029.z18.c.2: Start up site C in $dirC."
set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envC repmgr -local [list 127.0.0.1 $portC] \
-remote [list 127.0.0.1 $portA] -start client
await_startup_done $envC
error_check_good nsites_C [$envC rep_get_nsites] 3
# Zero pages received proves C joined without doing internal init.
puts "\tRepmgr029.z18.c.3: Verify site C starts without internal init."
error_check_good no_pages [stat_field $envC rep_stat "Pages received"] 0
error_check_good siteC_close [$envC close] 0
error_check_good siteB_close [$envB close] 0
error_check_good siteA_close [$envA close] 0
}
#
# Initiate a group change during a long-running transaction at the master
# (the group change waits for the transaction to abort)
# a. start site A as master
# b. begin a transaction, write a record
# c. start a separate process to add a second site ("B") to the group
# d. in the transaction from b, write a record and sleep for a second in a
# loop; this will eventually run into a deadlock
# e. abort the txn when the deadlock occurs
# f. after that, the joining operation in the other process should complete
# successfully.
#
proc z19 {} {
global rep_verbose
global testdir
global tclsh_path
global test_path
set rv off
if { $rep_verbose == 1 } {
set rv on
}
env_cleanup $testdir
foreach {portA portB} [available_ports 2] {}
set dirA $testdir/dirA
set dirB $testdir/dirB
file mkdir $dirA
file mkdir $dirB
puts "\tRepmgr029.z19.a: Start up site A as master "
set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
-recover -verbose [list rep $rv]]
$envA repmgr -local [list 127.0.0.1 $portA creator] -start master
error_check_good nsites_A [$envA rep_get_nsites] 1
# Open a db inside a long-running transaction; the locks it holds will
# conflict with the GMDB update needed for site B's join.
puts "\tRepmgr029.z19.b: Begin txn and open db on master."
set txn [$envA txn]
error_check_good txn_begin [is_valid_txn $txn $envA] TRUE
set testfile repmg029.db
set oflags {-create -btree -mode 0755 -thread -env $envA \
-txn $txn $testfile}
set db [eval {berkdb_open} $oflags]
error_check_good db_open [is_valid_db $db] TRUE
puts "\tRepmgr029.z19.c: Add site B in another process"
set pid [exec $tclsh_path $test_path/wrap.tcl repmgr029script2.tcl \
$testdir/repmgr029script2.log $dirB $portB $dirA $portA &]
# Keep writing (and sleeping) until the join attempt's GMDB txn
# collides with ours and we get DB_LOCK_DEADLOCK.
puts "\tRepmgr029.z19.d: Write data in the txn, expecting deadlock"
set maxcount 100
for { set count 0 } { $count < $maxcount } { incr count } {
set key $count
set data "gmdb data"
if { [catch {$db put -txn $txn $key $data} ret] } {
error_check_good put_deadlock \
[is_substr $ret DB_LOCK_DEADLOCK] 1
break
} else {
tclsleep 1
}
}
error_check_good put_deadlock [is_substr $ret DB_LOCK_DEADLOCK] 1
# Aborting releases our locks, which lets the join complete.
error_check_good txn_abort [$txn abort] 0
puts "\tRepmgr029.z19.e: Confirm B has joined."
for { set count 0 } { $count < $maxcount } { incr count } {
if { [$envA rep_get_nsites] > 1 } {
break
} else {
tclsleep 1
}
}
watch_procs $pid 5
error_check_good db_close [$db close] 0
error_check_good master_close [$envA close] 0
# Check output file of the sub-process for failures.
set file repmgr029script2.log
set errstrings [eval findfail $testdir/$file]
foreach str $errstrings {
puts "$str"
}
error_check_good errstrings_llength [llength $errstrings] 0
}
proc repmgr029_dump_db { e } {
	# Print the group membership database of env $e: first the format
	# record's version number, then one "{host port} status" line per
	# site record.
	set gmdb [berkdb open -env $e -thread __db.rep.system __db.membership]
	set cur [$gmdb cursor]
	# The first record holds two 32-bit ints: format number, version.
	binary scan [lindex [$cur get -first] 0 1] II fmt vers
	puts "version $vers"
	# Remaining records: key = {4-byte len, host string, NUL, 2-byte
	# port}; value = 4-byte status code.
	for {set rec [$cur get -next]} {[llength $rec] > 0} \
	    {set rec [$cur get -next]} {
		set key [lindex $rec 0 0]
		set value [lindex $rec 0 1]
		binary scan $key I len
		# "len" counts the trailing NUL, so the host text ends at
		# index 2 + len.
		binary scan [string range $key 4 [expr 2 + $len]] A* host
		binary scan [string range $key [expr 4 + $len] end] S port
		binary scan $value I status
		puts "{$host $port} $status"
	}
	$cur close
	$gmdb close
}
proc repmgr029_get_eid { e port } {
	# Return the environment ID (first site-list field) of the site
	# listening on $port, or "" when env $e knows no such site.
	set entry [repmgr029_get_site_list_entry $e $port]
	if { [string length $entry] > 0 } {
		return [lindex $entry 0]
	}
	return ""
}
proc repmgr029_get_site_list_entry { e port } {
	# Scan env $e's repmgr site list for the entry whose port (third
	# field) equals $port; return the whole entry, or "" if absent.
	set found ""
	foreach entry [$e repmgr_site_list] {
		if { [lindex $entry 2] == $port } {
			set found $entry
			break
		}
	}
	return $found
}
proc repmgr029_gmdb_status { db host port } {
	# Look up $host:$port in the membership db $db and return its
	# 32-bit status code, or 0 when the site has no record.
	# Key layout: 4-byte length (strlen + 1, counting the NUL), the
	# host string, a NUL byte, then the 2-byte port.
	set hostlen [string length $host]
	set key [binary format Ia*cS [expr $hostlen + 1] $host 0 $port]
	set matches [$db get $key]
	if {[llength $matches] > 0} {
		binary scan [lindex $matches 0 1] I status
		return $status
	}
	return 0
}
proc repmgr029_gmdb_version { db } {
	# Return the version number stored in the membership database's
	# format record (the record keyed by {0, 0}), or 0 when that
	# record is missing.
	set key [binary format IS 0 0]
	set kvlist [$db get $key]
	# Guard against a missing format record, mirroring
	# repmgr029_gmdb_status: without this, [binary scan] on an empty
	# value performs no conversions and "return $version" errors out
	# with "can't read version".
	if {[llength $kvlist] == 0} {
		return 0
	}
	set kvpair [lindex $kvlist 0]
	set val [lindex $kvpair 1]
	# The value holds two 32-bit ints: format number, then version.
	binary scan $val II format version
	return $version
}
proc repmgr029_known_master { e1 e2 e3 } {
	# Return "yes" iff every one of the three envs reports a known
	# master, i.e. its rep_stat "Master environment ID" is not -2.
	set answer yes
	foreach env [list $e1 $e2 $e3] {
		if {[stat_field $env rep_stat "Master environment ID"] == -2} {
			set answer no
			break
		}
	}
	return $answer
}
proc repmgr029_site_list_status { e port } {
	# Return the status field (fourth element) of env $e's site-list
	# entry for $port, or "" when the site is not listed.
	set entry [repmgr029_get_site_list_entry $e $port]
	if { [string length $entry] > 0 } {
		return [lindex $entry 3]
	}
	return ""
}