# See the file LICENSE for redistribution information.
#
# Copyright (c) 2010, 2011 Oracle and/or its affiliates. All rights reserved.
#
# $Id$
#
# TEST	repmgr029
# TEST	Test repmgr group membership: create, join, re-join and remove from
# TEST	a repmgr group and observe changes in the group membership database.
# TEST
proc repmgr029 { } {
	puts "Repmgr029: Repmgr Group Membership operations."
	z1
	z2
	z3
	z4
	z5
	z6
	z7
	z8
	z9
	z10
	z11
	z12
	z13
	z14
	z15
	z16
	z17
	z18
	z19
}

# See that a joining site that names a non-master as its helper gets a
# "forward" response, and then manages to get to the true master.
#
# Note: there is a bit of a race here, depending on the condition of
# site B at the time C tries to join.  That should eventually be
# tightened up.
proc z3 {} {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {port0 port1 port2 port3 port4 port5} [available_ports 6] {}
	set masterdir $testdir/MASTERDIR
	set clientdir $testdir/CLIENTDIR
	set clientdir2 $testdir/CLIENTDIR2
	set clientdir3 $testdir/CLIENTDIR3
	set clientdir4 $testdir/CLIENTDIR4
	set clientdir5 $testdir/CLIENTDIR5

	file mkdir $masterdir
	file mkdir $clientdir
	file mkdir $clientdir2
	file mkdir $clientdir3
	file mkdir $clientdir4
	file mkdir $clientdir5

	puts "\tRepmgr029.z3.a: Primordial creation, Start Master site 0"
	set env1 [berkdb env -create -errpfx MASTER -home $masterdir -txn \
	    -rep -thread -recover -verbose [list rep $rv]]
	$env1 repmgr -local [list localhost $port0] -start master
	error_check_good nsites_A [$env1 rep_get_nsites] 1

	puts "\tRepmgr029.z3.b: Simple join request, \
	    client 1 points directly at master"
	set env2 [berkdb env -create -errpfx CLIENT -home $clientdir -txn \
	    -rep -thread -recover -verbose [list rep $rv]]
	$env2 rep_config {mgr2sitestrict on}
	$env2 repmgr -local [list localhost $port1] \
	    -remote [list localhost $port0] -start client
	await_startup_done $env2
	error_check_good nsites_A2 [$env1 rep_get_nsites] 2
	error_check_good nsites_B2 [$env2 rep_get_nsites] 2

	puts "\tRepmgr029.z3.c: Join request forwarding, start client 2."
	set env3 [berkdb env -create -errpfx CLIENT2 -home $clientdir2 -txn \
	    -rep -thread -recover -verbose [list rep $rv]]
	$env3 rep_config {mgr2sitestrict on}
	$env3 repmgr -local [list localhost $port2] \
	    -remote [list localhost $port1]
	set done no
	while {!$done} {
		if {[catch {$env3 repmgr -start client} msg]} {
			puts $msg
			tclsleep 1
		} else {
			set done yes
		}
	}
	await_startup_done $env3
	error_check_good nsites_A3 [$env1 rep_get_nsites] 3
	error_check_good nsites_B3 [$env2 rep_get_nsites] 3
	error_check_good nsites_C3 [$env3 rep_get_nsites] 3

	puts "\tRepmgr029.z3.d: Master cannot be removed \
	    (by itself, or as requested from a client)"
	set ret [catch {$env1 repmgr -remove [list localhost $port0]} result]
	error_check_bad no_failure $ret 0
	error_check_match unavail $result "*DB_REP_UNAVAIL*"
	set ret [catch {$env2 repmgr -remove [list localhost $port0]} result]
	error_check_bad no_failure2 $ret 0
	error_check_match unavail2 $result "*DB_REP_UNAVAIL*"

	set db [berkdb open -env $env1 -thread __db.rep.system __db.membership]

	puts "\tRepmgr029.z3.e: Join request rejected for lack of acks"
	puts "\t\tRepmgr029.z3.e.1: Close client 1 and 2."
	error_check_good s_3_close [$env3 close] 0
	error_check_good s_2_close [$env2 close] 0

	puts "\t\tRepmgr029.z3.e.2: Start client 3."
	set env4 [berkdb env -create -errpfx CLIENT3 -home $clientdir3 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	set ret [catch {$env4 repmgr -local [list localhost $port3] \
	    -remote [list localhost $port0] -start client} result]
	error_check_bad no_failure3 $ret 0
	error_check_match unavail3 $result "*DB_REP_UNAVAIL*"

	set prev_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.e.3: Check previous GMDB version $prev_vers"
	set SITE_ADDING 1
	set SITE_PRESENT 4
	error_check_good site_3_adding [repmgr029_gmdb_status $db localhost $port3] \
	    $SITE_ADDING

	puts "\t\tRepmgr029.z3.e.4: limbo resolution, restart client 1."
	set env2 [berkdb env -create -errpfx CLIENT -home $clientdir -txn \
	    -rep -thread -recover -verbose [list rep $rv] -event]
	# No helper should be needed this time.
	$env2 repmgr -local [list localhost $port1] -start client
	await_startup_done $env2 50

	puts "\t\tRepmgr029.z3.e.5: normal txn at master"
	set niter 1
	rep_test btree $env1 NULL $niter 0 0 0
	set new_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.e.6: NEW GMDB version $new_vers"
	error_check_good version_incr $new_vers [expr $prev_vers + 1]
	error_check_good site_3_added [repmgr029_gmdb_status $db localhost $port3] \
	    $SITE_PRESENT

	puts "\t\tRepmgr029.z3.e.7: client 3 rejoins."
	$env4 repmgr -start client
	await_startup_done $env4 60

	# Verify that the GMDB has been updated on the client side.
	puts "\t\tRepmgr029.z3.e.8: Verify the GMDB on client 3."
	set db3 [berkdb open -env $env4 -thread __db.rep.system __db.membership]
	error_check_good vers [repmgr029_gmdb_version $db3] $new_vers
	$db3 close

	# This test case verifies a scenario where we (1) try another
	# (different) join request, still with insufficient acks, and see that
	# it doesn't load up another limbo; and then (2) with acks working, see
	# that a second request finishes off the first and then succeeds.
	# We should also try simply retrying the first addition.
	puts "\tRepmgr029.z3.f: Join request rejected for lack of acks"
	puts "\t\tRepmgr029.z3.f.1: Close client 1."
	error_check_good s_1_close [$env2 close] 0
	set prev_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.f.2: Check current GMDB version $prev_vers"

	puts "\t\tRepmgr029.z3.f.3: Start client 4."
	set env5 [berkdb env -create -errpfx CLIENT4 -home $clientdir4 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	set ret [catch {$env5 repmgr -local [list localhost $port4] \
	    -remote [list localhost $port0] -start client} result]
	error_check_bad no_failure4 $ret 0
	error_check_match unavail4 $result "*DB_REP_UNAVAIL*"
	set prev_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.f.4: Check current GMDB version $prev_vers"
	error_check_good site_4_adding [repmgr029_gmdb_status $db localhost $port4] \
	    $SITE_ADDING

	puts "\t\tRepmgr029.z3.f.5: Start client 5."
	set env6 [berkdb env -create -errpfx CLIENT5 -home $clientdir5 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	set ret [catch {$env6 repmgr -local [list localhost $port5] \
	    -remote [list localhost $port0] -start client} result]
	error_check_bad no_failure5 $ret 0
	error_check_match unavail5 $result "*DB_REP_UNAVAIL*"
	set prev_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.f.6: Check current GMDB version $prev_vers"
	# [M]: There is no GM status for client 5 so far, let alone "ADDING".
	#error_check_good site_5_adding [repmgr029_gmdb_status $db localhost $port5] \
	#    $SITE_ADDING

	puts "\t\tRepmgr029.z3.f.7: limbo resolution, restart client 1."
	set env2 [berkdb env -create -errpfx CLIENT -home $clientdir -txn \
	    -rep -thread -recover -verbose [list rep $rv] -event]
	# No helper should be needed this time.
	$env2 repmgr -local [list localhost $port1] -start client
	await_startup_done $env2 50

	puts "\t\tRepmgr029.z3.f.8: normal txn at master"
	set niter 1
	rep_test btree $env1 NULL $niter 0 0 0
	set new_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.f.9: NEW GMDB version $new_vers"
	error_check_good version_incr $new_vers [expr $prev_vers + 1]

	puts "\t\tRepmgr029.z3.f.10: client 5 rejoins."
	$env6 repmgr -start client
	await_startup_done $env6 60
	set new_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.f.11: NEW GMDB version $new_vers"
	# Check that client 5 now has GM status "ADDED".
	error_check_good site_5_added [repmgr029_gmdb_status $db localhost $port5] \
	    $SITE_PRESENT
	# [M]: At this point the GM status for client 4 is also "ADDED".
	error_check_good site_4_added [repmgr029_gmdb_status $db localhost $port4] \
	    $SITE_PRESENT
	# [M]: We'd like to check the GM status on the client 4 side, but we
	# can't, because client 4 has not yet been started up and synced.
	# puts "\t\tRepmgr029.z3.e.8: Verify the GMDB on client 4."
	# set db4 [berkdb open -env $env5 -thread __db.rep.system __db.membership]
	# error_check_good vers [repmgr029_gmdb_version $db4] $new_vers

	puts "\t\tRepmgr029.z3.f.12: client 4 rejoins."
	$env5 repmgr -start client
	await_startup_done $env5 100
	set new_vers [repmgr029_gmdb_version $db]
	puts "\t\tRepmgr029.z3.f.13: NEW GMDB version $new_vers"

	puts "\tRepmgr029.z3.h: Remove (downed) client 3 from master"
	$env1 repmgr -remove [list localhost $port3]
	error_check_good site_3_removed [repmgr029_gmdb_status $db localhost $port3] 0

	error_check_good db_close [$db close] 0
	error_check_good s_1_close [$env2 close] 0
	error_check_good s_3_close [$env4 close] 0
	error_check_good s_4_close [$env5 close] 0
	error_check_good s_5_close [$env6 close] 0
	error_check_good s_0_close [$env1 close] 0
	puts "\tRepmgr029.z3.i: End of Repmgr029.z3"
}

# Remove a live site from a group, and see that the site gets a
# LOCAL_SITE_REMOVED event, and the other sites get SITE_REMOVED.
#
proc z6 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts -nonewline "\tRepmgr029.z6.a: Build basic 3-site group"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envA repmgr -local [list localhost $portA creator] -start elect
	error_check_good nsites_a [$envA rep_get_nsites] 1
	puts -nonewline "."; flush stdout

	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start elect
	await_startup_done $envB
	error_check_good nsites_b [$envB rep_get_nsites] 2
	puts -nonewline "."; flush stdout

	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start elect
	await_startup_done $envC
	error_check_good nsites_c [$envC rep_get_nsites] 3
	puts "."; flush stdout

	set eid_C_at_A [repmgr029_get_eid $envA $portC]
	set eid_C_at_B [repmgr029_get_eid $envB $portC]

	puts "\tRepmgr029.z6.b: Remove (live) site C from a request originating at B."
	$envB repmgr -remove [list localhost $portC]
	set db [berkdb open -env $envA -thread __db.rep.system __db.membership]
	error_check_good site_c_removed [repmgr029_gmdb_status $db localhost $portC] 0
	set master_ev [find_event [$envA event_info] site_removed]
	error_check_good site_a_event [llength $master_ev] 2
	error_check_good site_a_event_eid [lindex $master_ev 1] $eid_C_at_A
	error_check_good site_a_list [llength [repmgr029_get_eid $envA $portC]] 0

	await_event $envC local_site_removed
	error_check_good s_c_close [$envC close] 0

	await_condition {[expr [string length [repmgr029_site_list_status $envB $portC]] == 0]}
	set b_ev [find_event [$envB event_info] site_removed]
	error_check_good site_b_event [llength $b_ev] 2
	error_check_good site_b_event_eid [lindex $b_ev 1] $eid_C_at_B
	error_check_good site_b_list [llength [repmgr029_get_eid $envB $portC]] 0

	error_check_good s_b_close [$envB close] 0
	$db close
	error_check_good s_a_close [$envA close] 0
}

# See that SITE_ADDED events are fired appropriately.
proc z8 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts "\tRepmgr029.z8: Create primordial site"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envA repmgr -local [list localhost $portA creator] -start elect

	puts "\tRepmgr029.z8: Add client, check for event at master"
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start elect
	set ev [find_event [$envA event_info] site_added]
	error_check_good ev_a [llength $ev] 2
	set eid [lindex $ev 1]
	error_check_good ev_a_eid $eid [repmgr029_get_eid $envA $portB]
	await_startup_done $envB

	puts "\tRepmgr029.z8: Add another client, check for events at both other sites"
	$envA event_info -clear
	$envB event_info -clear
	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start elect
	set ev [find_event [$envA event_info] site_added]
	error_check_good ev_a2 [llength $ev] 2
	set eid [lindex $ev 1]
	error_check_good ev_a_eid2 $eid [repmgr029_get_eid $envA $portC]
	await_event $envB site_added
	set ev [find_event [$envB event_info] site_added]
	error_check_good ev_b [llength $ev] 2
	set eid [lindex $ev 1]
	error_check_good ev_b_eid $eid [repmgr029_get_eid $envB $portC]

	$envC close
	$envB close
	$envA close
}

# Remove a site, starting at the site to be removed.  See that we at least shut
# down threads (if not also fire an event in this case).
#
proc z7 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts -nonewline "\tRepmgr029.z7: Set up a group of 3, A (master), B, C"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envA repmgr -local [list localhost $portA] -start master
	puts -nonewline "." ; flush stdout
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	puts -nonewline "." ; flush stdout
	set eid_B_at_A [repmgr029_get_eid $envA $portB]

	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envC
	puts "."

	puts "\tRepmgr029.z7: Remove site B itself"
	$envB repmgr -remove [list localhost $portB]
	await_event $envB local_site_removed
	set master_ev [find_event [$envA event_info] site_removed]
	error_check_good site_a_event_eid [lindex $master_ev 1] $eid_B_at_A
	error_check_good site_a_list [llength [repmgr029_get_eid $envA $portB]] 0

	$envB close
	$envC close
	$envA close
}

# See that a join request is rejected if insufficient acks.  (It should
# remain in the db as "adding" though, and apps should be able to query
# nsites to find out that it's been incremented.)
#
proc z4 {} {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {port0 port1 port2} [available_ports 3] {}
	set masterdir $testdir/MASTERDIR
	set clientdir $testdir/CLIENTDIR
	set clientdir2 $testdir/CLIENTDIR2

	file mkdir $masterdir
	file mkdir $clientdir
	file mkdir $clientdir2

	puts -nonewline "\tRepmgr029.z4.a: Start the master."
	set env1 [berkdb_env -create -errpfx MASTER -home $masterdir \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env1 repmgr -local [list localhost $port0] -start master
	error_check_good nsites_1 [$env1 rep_get_nsites] 1
	puts "."; flush stdout

	puts -nonewline "\tRepmgr029.z4.b: Start first client."
	set env2 [berkdb_env -create -errpfx CLIENT -home $clientdir -txn \
	    -rep -thread -recover -verbose [list rep $rv]]
	$env2 rep_config {mgr2sitestrict on}
	$env2 repmgr -local [list localhost $port1] \
	    -remote [list localhost $port0] -start client
	await_startup_done $env2
	error_check_good nsites_2 [$env2 rep_get_nsites] 2
	puts "."; flush stdout

	puts "\tRepmgr029.z4.c: Close the first client."
	error_check_good s_2_close [$env2 close] 0

	puts "\tRepmgr029.z4.d: Start the second client."
	set env3 [berkdb_env -create -errpfx CLIENT2 -home $clientdir2 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env3 rep_config {mgr2sitestrict on}
	set ret [catch {$env3 repmgr -local [list localhost $port2] \
	    -remote [list localhost $port0] -start client} result]
	error_check_bad no_failure $ret 0
	error_check_match unavail $result "*DB_REP_UNAVAIL*"
	puts "\tRepmgr029.z4.e: The second join failed as expected, \
	    since the first client is down"

	puts -nonewline "\tRepmgr029.z4.f: restart the first client."
	set env2 [berkdb_env -errpfx CLIENT -home $clientdir -txn -rep \
	    -thread -recover -create -verbose [list rep $rv]]
	$env2 rep_config {mgr2sitestrict on}
	$env2 repmgr -local [list localhost $port1] -start client
	await_startup_done $env2
	puts "."; flush stdout

	puts "\tRepmgr029.z4.g: try to join the second client again"
	if {[catch {$env3 repmgr -start client} result] && \
	    [string match "*REP_UNAVAIL*" $result]} {
		puts "\tRepmgr029.z4.h: pause and try again"
		tclsleep 3
		$env3 repmgr -start client
	}
	await_startup_done $env3 100
	error_check_good nsites_3 [$env3 rep_get_nsites] 3

	error_check_good s_3_close [$env3 close] 0
	error_check_good s_2_close [$env2 close] 0
	error_check_good s_1_close [$env1 close] 0
}

# Cold-boot an established group, without specifying any remote sites, and see
# that they can elect a master (demonstrating that they have recorded each
# other's addresses).
#
proc z5 {} {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {port0 port1 port2} [available_ports 3] {}
	set masterdir $testdir/MASTERDIR
	set clientdir $testdir/CLIENTDIR
	set clientdir2 $testdir/CLIENTDIR2

	file mkdir $masterdir
	file mkdir $clientdir
	file mkdir $clientdir2

	puts -nonewline "\tRepmgr029.z5.a: Set up a group of 3, one master and two clients."
	set env1 [berkdb env -create -errpfx MASTER -home $masterdir \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env1 repmgr -local [list localhost $port0] -start master
	error_check_good nsites_1 [$env1 rep_get_nsites] 1
	puts -nonewline "." ; flush stdout

	set env2 [berkdb env -create -errpfx CLIENT -home $clientdir \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env2 repmgr -local [list localhost $port1] \
	    -remote [list localhost $port0] -start client
	await_startup_done $env2
	error_check_good nsites_2 [$env2 rep_get_nsites] 2
	puts -nonewline "." ; flush stdout

	set env3 [berkdb env -create -errpfx CLIENT2 -home $clientdir2 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env3 repmgr -local [list localhost $port2] \
	    -remote [list localhost $port0] -start client
	await_startup_done $env3
	error_check_good nsites_3 [$env1 rep_get_nsites] 3
	puts "." ; flush stdout

	puts "\tRepmgr029.z5: Shut down all sites and then restart with election"
	error_check_good s_2_close [$env2 close] 0
	error_check_good s_3_close [$env3 close] 0
	error_check_good s_1_close [$env1 close] 0

	set env1 [berkdb env -create -errpfx A -home $masterdir \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env1 repmgr -local [list localhost $port0] -start elect -pri 100
	set env2 [berkdb env -create -errpfx B -home $clientdir \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env2 repmgr -local [list localhost $port1] -start elect -pri 200
	set env3 [berkdb env -create -errpfx C -home $clientdir2 \
	    -txn -rep -thread -recover -verbose [list rep $rv]]
	$env3 repmgr -local [list localhost $port2] -start elect -pri 140

	puts "\tRepmgr029.z5: Wait for election to choose a new master"
	await_condition {[repmgr029_known_master $env1 $env2 $env3]}
	error_check_good nsites_1 [$env1 rep_get_nsites] 3
	error_check_good nsites_2 [$env2 rep_get_nsites] 3
	error_check_good nsites_3 [$env3 rep_get_nsites] 3

	error_check_good s_3_close [$env3 close] 0
	error_check_good s_1_close [$env1 close] 0
	error_check_good s_2_close [$env2 close] 0
}

# Remove a site while it is disconnected, and see if it can get an event when
# it tries to reconnect (2nd try).
proc z2 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC portD portE} [available_ports 5] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C
	set dirD $testdir/D
	set dirE $testdir/E

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC
	file mkdir $dirD
	file mkdir $dirE

	puts -nonewline "\tRepmgr029.z2.a: Set up a group of 5: A, B, C, D, E"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA] -start master
	error_check_good nsites_a [$envA rep_get_nsites] 1
	puts -nonewline "."; flush stdout

	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	error_check_good nsites_b [$envB rep_get_nsites] 2
	puts -nonewline "."; flush stdout

	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envC
	error_check_good nsites_c [$envC rep_get_nsites] 3
	puts -nonewline "." ; flush stdout

	# It is advisable to increase the await time when the group size is large.
	set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envD repmgr -local [list localhost $portD] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envD 100
	error_check_good nsites_d [$envD rep_get_nsites] 4
	puts -nonewline "." ; flush stdout

	set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envE repmgr -local [list localhost $portE] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envE 200
	error_check_good nsites_e [$envE rep_get_nsites] 5
	puts "." ; flush stdout

	puts "\tRepmgr029.z2.b: shut down sites D and E"
	error_check_good s_d_close [$envD close] 0
	error_check_good s_e_close [$envE close] 0

	puts "\tRepmgr029.z2.c: remove site D from the group"
	$envA repmgr -remove [list localhost $portD]
	error_check_good rm_at_a \
	    [string length [repmgr029_site_list_status $envA $portD]] 0

	puts "\tRepmgr029.z2.d: shut down all remaining sites"
	error_check_good s_b_close [$envB close] 0
	error_check_good s_c_close [$envC close] 0
	error_check_good s_a_close [$envA close] 0

	puts -nonewline "\tRepmgr029.z2.e: start up just D and E \
	    (neither of which knows that D has been removed)"
	set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envD repmgr -local [list localhost $portD] -start elect \
	    -timeout {connection_retry 2000000}
	# The await must be commented out here; otherwise envD cannot join.
	#await_startup_done $envD
	puts -nonewline "."; flush stdout

	set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envE repmgr -local [list localhost $portE] -start elect
	await_condition {[expr [stat_field $envD \
	    rep_stat "Messages processed"] > 0]}
	puts "."; flush stdout

	puts -nonewline "\tRepmgr029.z2.f: Start sites A, B, and C"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA] -start elect -pri 200
	puts -nonewline "." ; flush stdout
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] -start elect -pri 150
	puts -nonewline "." ; flush stdout
	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envC repmgr -local [list localhost $portC] -start elect -pri 100
	await_startup_done $envC 1000
	puts "." ; flush stdout

	puts "\tRepmgr029.z2.g: wait for site D to notice that it has been removed"
	await_event $envD local_site_removed

	# Yikes!  This is not going to work!  Site D will be rejected before it
	# gets a chance to have its database updated! :-(
	# [M] NOW: A is the master; B, C and E get synced with A, but D does not.
	error_check_good s_d_close [$envD close] 0
	error_check_good s_e_close [$envE close] 0
	error_check_good s_c_close [$envC close] 0
	error_check_good s_b_close [$envB close] 0
	error_check_good s_a_close [$envA close] 0
}

# Remove a site while it is down.  When it starts up again, it should rejoin.
proc z1 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts -nonewline "\tRepmgr029.z1.a: Set up a group of 3: A, B, C"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA] -start master
	error_check_good nsitesA [$envA rep_get_nsites] 1
	puts -nonewline "."; flush stdout

	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	error_check_good nsitesB [$envB rep_get_nsites] 2
	puts -nonewline "."; flush stdout

	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envC
	error_check_good nsitesC [$envC rep_get_nsites] 3
	puts "."; flush stdout

	puts "\tRepmgr029.z1.b: Shut down site B, and remove it from the group."
	error_check_good s_b_close [$envB close] 0
	$envA repmgr -remove [list localhost $portB]
	error_check_good rm_at_a \
	    [string length [repmgr029_site_list_status $envA $portB]] 0

	puts "\tRepmgr029.z1.c: restart B"
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] \
	    -timeout {connection_retry 2000000} -start client
	await_startup_done $envB

	# Make sure we haven't fired a LOCAL_SITE_REMOVED event to B.
	set ev [find_event [$envB event_info] local_site_removed]
	error_check_good site_b_not_removed [string length $ev] 0

	# Now try it again, only this time the auto-rejoin fails due to lack of
	# acks, so B should shut down and fire a LOCAL_SITE_REMOVED event.
	# TODO: should we have some sort of stat query so that the application
	# can tell whether threads are running?  Or is that just redundant with
	# the event?
puts "\tRepmgr029.z1.d: shut down and remove site B again" error_check_good s_b_close [$envB close] 0 $envA repmgr -remove [list localhost $portB] puts "\tRepmgr029.z1.e: shut down site C, and then restart B" error_check_good s_c_close [$envC close] 0 set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \ -recover -verbose [list rep $rv] -event] $envB repmgr -local [list localhost $portB] \ -timeout {connection_retry 2000000} -start client await_event $envB local_site_removed error_check_good s_b_close [$envB close] 0 error_check_good s_a_close [$envA close] 0 } # Test "sharing", by constructing a situation where a site that's been down for # a while has an obsolete, too-high notion of nsites. On a cold boot, if that # site is needed, it would spoil the election by requiring too many votes, # unless it gets a hint from other sites. # # Create a group of 6 sites, A, B, C, D, E, F. Make sure F knows nsites is 6; # then shut it down. Remove E; now nsites is 5 (A, B, C, D, f). Then remove D; # nsites is 4 (A, B, C, f). Now shut down everyone, and then reboot only A, B, # and F (leave C down). Try to elect a master. # proc z9 { } { global rep_verbose global testdir set rv [ expr $rep_verbose ? on : off ] env_cleanup $testdir foreach {portA portB portC portD portE portF} [available_ports 6] {} set dirA $testdir/A set dirB $testdir/B set dirC $testdir/C set dirD $testdir/D set dirE $testdir/E set dirF $testdir/F file mkdir $dirA file mkdir $dirB file mkdir $dirC file mkdir $dirD file mkdir $dirE file mkdir $dirF puts -nonewline "\tRepmgr029.z9: Set up a group of 6" set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \ -recover -verbose [list rep $rv]] $envA repmgr -local [list localhost $portA] -start master puts -nonewline "." ; flush stdout set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \ -recover -verbose [list rep $rv]] $envB repmgr -local [list localhost $portB] \ -remote [list localhost $portA] -start client await_startup_done $envB puts -nonewline "." ; flush stdout set envC [berkdb env -create -errpfx B -home $dirC -txn -rep -thread \ -recover -verbose [list rep $rv]] $envC repmgr -local [list localhost $portC] \ -remote [list localhost $portA] -start client await_startup_done $envC puts -nonewline "." ; flush stdout set envD [berkdb env -create -errpfx B -home $dirD -txn -rep -thread \ -recover -verbose [list rep $rv]] $envD repmgr -local [list localhost $portD] \ -remote [list localhost $portA] -start client await_startup_done $envD 30 puts -nonewline "." ; flush stdout set envE [berkdb env -create -errpfx B -home $dirE -txn -rep -thread \ -recover -verbose [list rep $rv]] $envE repmgr -local [list localhost $portE] \ -remote [list localhost $portA] -start client await_startup_done $envE 30 puts -nonewline "." ; flush stdout set envF [berkdb env -create -errpfx B -home $dirF -txn -rep -thread \ -recover -verbose [list rep $rv]] $envF repmgr -local [list localhost $portF] \ -remote [list localhost $portA] -start client await_startup_done $envF 40 puts "." 
puts "\tRepmgr029.z9: Shut down site F" $envF close puts "\tRepmgr029.z9: Remove site E" $envE close $envA repmgr -remove [list localhost $portE] puts "\tRepmgr029.z9: Remove site D" $envD close $envA repmgr -remove [list localhost $portD] puts "\tRepmgr029.z9: Shut down site C" $envC close puts "\tRepmgr029.z9: Bounce the master" $envA close set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \ -recover -verbose [list rep $rv]] $envA repmgr -local [list localhost $portA] -start elect # We now have a group of 4, with only A and B running. That's not # enough to elect a master. puts "\tRepmgr029.z9: Restart site F" set envF [berkdb env -create -errpfx F -home $dirF -txn -rep -thread \ -recover -verbose [list rep $rv]] $envF repmgr -local [list localhost $portF] -start elect # There are now 3 sites running, in a 4-site group. That should be # enough to elect a master, if site F can be advised of the fact that # the group size has been reduced. # Wait for an election to complete. await_condition {[repmgr029_known_master $envA $envF $envB]} 30 $envA close $envB close $envF close } # See that a membership list gets restored after an interrupted internal init. proc z10 { } { global rep_verbose global testdir global tclsh_path global test_path set rv off if { $rep_verbose == 1 } { set rv on } env_cleanup $testdir foreach {portA portB portC} [available_ports 3] {} set dirA $testdir/A set dirB $testdir/B set dirC $testdir/C file mkdir $dirA file mkdir $dirB file mkdir $dirC set pagesize 4096 set log_max [expr $pagesize * 8] puts "\tRepmgr029.z10: Set up a group of 3, A (master), B, C" set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envA repmgr -local [list localhost $portA] -start master set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envB repmgr -local [list localhost $portB] \ -remote [list localhost $portA] -start client await_startup_done $envB set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envC repmgr -local [list localhost $portC] \ -remote [list localhost $portA] -start client await_startup_done $envC puts "\tRepmgr029.z10: Shut down site C and generate enough churn to force internal init" set log_endC [get_logfile $envC last] $envC close set niter 50 while { [get_logfile $envA first] <= $log_endC } { $envA test force noarchive_timeout rep_test btree $envA NULL $niter 0 0 0 -pagesize $pagesize $envA log_flush $envA log_archive -arch_remove } # Use separate process so that it works even on Windows. # Inhibit master from sending any PAGE messages. 
puts "\tRepmgr029.z10: Restart site C in a separate process" $envA test abort no_pages set pid [exec $tclsh_path $test_path/wrap.tcl \ repmgr029script.tcl $testdir/repmgr029script.log $dirC $portC $rv &] watch_procs $pid 5 puts "\tRepmgr029.z10: Shut down the rest of the group" $envB close $envA close puts "\tRepmgr029.z10: Restart site C alone" set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \ -recover -verbose [list rep $rv]] $envC repmgr -local [list localhost $portC] -start elect puts "\tRepmgr029.z10: Check list of known sites, A and B" set l [$envC repmgr_site_list] foreach p [list $portA $portB] { set sought [list localhost $p] error_check_good port$p \ [expr [lsearch -glob $l [concat * $sought *]] >= 0] 1 } $envC close } # See that a client notices a membership change that happens while it is # disconnected (via the internal init completion trigger). proc z11 { } { global rep_verbose global testdir global tclsh_path set rv off if { $rep_verbose == 1 } { set rv on } env_cleanup $testdir foreach {portA portB portC portD} [available_ports 4] {} set dirA $testdir/A set dirB $testdir/B set dirC $testdir/C set dirD $testdir/D file mkdir $dirA file mkdir $dirB file mkdir $dirC file mkdir $dirD set pagesize 4096 set log_max [expr $pagesize * 8] puts -nonewline "\tRepmgr029.z11: Set up a group initially of size 3" set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envA repmgr -local [list localhost $portA] -start master puts -nonewline "." ; flush stdout set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envB repmgr -local [list localhost $portB] \ -remote [list localhost $portA] -start client await_startup_done $envB puts -nonewline "." ; flush stdout set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envC repmgr -local [list localhost $portC] \ -remote [list localhost $portA] -start client await_startup_done $envC puts "." puts "\tRepmgr029.z11: Shut down C" $envC close puts "\tRepmgr029.z11: Join new site D" set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \ -recover -verbose [list rep $rv] -log_max $log_max] $envD repmgr -local [list localhost $portD] \ -remote [list localhost $portA] -start client puts "\tRepmgr029.z11: Generate enough churn to force internal init at C later" set tail [get_logfile $envA last] set niter 50 while { [get_logfile $envA first] <= $tail } { $envA test force noarchive_timeout rep_test btree $envA NULL $niter 0 0 0 -pagesize $pagesize $envA log_flush $envA log_archive -arch_remove } puts "\tRepmgr029.z11: Restart site C" set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \ -recover -verbose [list rep $rv]] $envC repmgr -local [list localhost $portC] -start elect await_startup_done $envC puts "\tRepmgr029.z11: Check list of known sites" set l [$envC repmgr_site_list] foreach p [list $portA $portB $portD] { set sought [list localhost $p] error_check_good port$p \ [expr [lsearch -glob $l [concat * $sought *]] >= 0] 1 } $envC close $envD close $envB close $envA close } # Exercise the new connection-related event types. 
proc z12 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B

	file mkdir $dirA
	file mkdir $dirB

	puts "\tRepmgr029.z12: Start primordial master site A"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envA repmgr -local [list localhost $portA] -start master \
	    -timeout {connection_retry 2000000}

	puts "\tRepmgr029.z12: Add new client site B"
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -remote [list localhost $portA] \
	    -local [list localhost $portB] -start client
	await_startup_done $envB

	puts "\tRepmgr029.z12: Check connection events"
	set ev [find_event [$envA event_info] connection_established]
	error_check_good ev_len [llength $ev] 2
	error_check_good ev_eid [lindex $ev 1] [repmgr029_get_eid $envA $portB]
	set ev [find_event [$envB event_info] connection_established]
	error_check_good ev_len2 [llength $ev] 2
	error_check_good ev_eid2 [lindex $ev 1] [repmgr029_get_eid $envB $portA]

	puts "\tRepmgr029.z12: Shut down site B, observe event at site A"
	$envB close
	set ev [await_event $envA connection_broken]
	error_check_good ev_len3 [llength $ev] 2
	set evinfo [lindex $ev 1]
	error_check_good ev_len3b [llength $evinfo] 2
	foreach {eid err} $evinfo {}
	error_check_good ev_eid3 $eid [repmgr029_get_eid $envA $portB]
	puts "\t\tRepmgr029.z12: (connection_broken error code is $err)"

	set ev [await_event $envA connection_retry_failed]
	error_check_good ev_len3 [llength $ev] 2
	set evinfo [lindex $ev 1]
	error_check_good ev_len3c [llength $evinfo] 2
	foreach {eid err} $evinfo {}
	error_check_good ev_eid3 $eid [repmgr029_get_eid $envA $portB]
	puts "\t\tRepmgr029.z12: (retry_failed error code is $err)"

	puts "\tRepmgr029.z12: Shut down site A, then restart B"
	$envA close
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envB repmgr -local [list localhost $portB] -start elect \
	    -timeout {connection_retry 2000000}

	# New event instances should continue to be fired indefinitely.  For
	# now, consider '3' to be close enough to infinity.
	for { set i 1 } { $i <= 3 } { incr i } {
		puts "\tRepmgr029.z12: Observe event ($i)"
		set ev [await_event $envB connection_retry_failed]
		error_check_good ev_eid4 [lindex $ev 1 0] [repmgr029_get_eid $envB $portA]
		error_check_good never_estd \
		    [string length [find_event [$envB event_info] \
		    connection_established]] 0
		# According to our definition of "connection broken", you can't
		# "break" what you never had.
		error_check_good never_broken \
		    [string length [find_event [$envB event_info] \
		    connection_broken]] 0
		$envB event_info -clear
	}
	$envB close
}

# Make sure applications aren't bothered by perm_failed events from failed GMDB
# operations.
proc z13 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts -nonewline "\tRepmgr029.z13: Create first 2 sites"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv] -event]
	$envA repmgr -local [list localhost $portA] -start master
	puts -nonewline "." ; flush stdout
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	puts "."

	puts "\tRepmgr029.z13: Shut down site B, try to add third site, site C"
	$envB close
	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	set ret [catch {$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client} result]
	error_check_bad no_failure $ret 0
	error_check_match unavail $result "*DB_REP_UNAVAIL*"

	puts "\tRepmgr029.z13: Make sure site A application didn't see a perm failure"
	error_check_good no_failure \
	    [string length [find_event [$envA event_info] perm_failed]] 0

	$envC close
	$envA close
}

# Make sure we can add/remove sites even when ALL policy is in effect.
proc z14 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	foreach policy {all allpeers} {
		puts "\tRepmgr029.z14: Using \"$policy\" ack policy"
		puts "\tRepmgr029.z14: Create first site A"
		set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
		    -recover -verbose [list rep $rv] -event]
		$envA repmgr -local [list localhost $portA] -start master -ack $policy

		puts "\tRepmgr029.z14: Add 2nd site, B"
		set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
		    -recover -verbose [list rep $rv]]
		$envB repmgr -local [list localhost $portB] \
		    -remote [list localhost $portA] -start client -ack $policy
		await_startup_done $envB

		puts "\tRepmgr029.z14: Add 3rd site, C"
		set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
		    -recover -verbose [list rep $rv]]
		$envC repmgr -local [list localhost $portC] \
		    -remote [list localhost $portA] -start client -ack $policy
		await_startup_done $envC

		puts "\tRepmgr029.z14: Remove site B"
		$envC repmgr -remove [list localhost $portB]
		error_check_good removed \
		    [string length [repmgr029_site_list_status $envA $portB]] 0

		$envB close
		$envC close
		$envA close
	}
}

# Rescind a pending (previously incomplete) change, and check effect on nsites.
proc z15 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC portD portE} [available_ports 5] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C
	set dirD $testdir/D
	set dirE $testdir/E

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC
	file mkdir $dirD
	file mkdir $dirE

	puts -nonewline "\tRepmgr029.z15: Create initial group of 4 sites"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA] -start master
	puts -nonewline "." ; flush stdout

	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	puts -nonewline "." ; flush stdout

	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envC
	puts -nonewline "." ; flush stdout
	set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envD repmgr -local [list localhost $portD] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envD
	puts "."

	puts "\tRepmgr029.z15: Shut down C and D, and try to add E"
	$envC close
	$envD close
	set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	set ret [catch {$envE repmgr -local [list localhost $portE] \
	    -remote [list localhost $portA] -start client} result]
	error_check_bad no_failure $ret 0
	error_check_match unavail $result "*DB_REP_UNAVAIL*"
	error_check_good nsites [$envA rep_get_nsites] 5
	await_condition {[expr [$envB rep_get_nsites] == 5]}

	puts "\tRepmgr029.z15: Rescind the addition of site E, by removing it"
	$envA repmgr -remove [list localhost $portE]
	error_check_good nsites2 [$envA rep_get_nsites] 4
	await_condition {[expr [$envB rep_get_nsites] == 4]}

	puts -nonewline "\tRepmgr029.z15: Restart sites C and D"
	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envC repmgr -local [list localhost $portC] -start client
	await_startup_done $envC
	puts -nonewline "." ; flush stdout
	set envD [berkdb env -create -errpfx D -home $dirD -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envD repmgr -local [list localhost $portD] -start client
	await_startup_done $envD
	puts "."

	puts "\tRepmgr029.z15: Try adding new site E again,\
	    this time it should succeed"
	# Note that it was not necessary to bounce the env handle.
	$envE repmgr -start client
	error_check_good nsites [$envA rep_get_nsites] 5
	await_condition {[expr [$envB rep_get_nsites] == 5]}
	await_startup_done $envE

	puts "\tRepmgr029.z15: Shut down C and D again,\
	    and this time try removing site E"
	$envC close
	$envD close
	set ret [catch {$envA repmgr -remove [list localhost $portE]} result]
	error_check_bad no_failure2 $ret 0
	error_check_match unavail2 $result "*DB_REP_UNAVAIL*"
	error_check_good nsites2 [$envA rep_get_nsites] 5
	error_check_good nsites3 [$envB rep_get_nsites] 5
	set db [berkdb open -env $envA -thread __db.rep.system __db.membership]
	set SITE_DELETING 2
	error_check_good deleting \
	    [repmgr029_gmdb_status $db localhost $portE] $SITE_DELETING
	$db close

	puts "\tRepmgr029.z15: See that site E fired an event for as little\
	    as DELETING status"
	await_event $envE local_site_removed
	$envE close

	puts "\tRepmgr029.z15: Rescind the removal of site E"
	# The only way to add site E is to have it start and try to join.
	# Someday (maybe even before code freeze) it will be possible to
	# restart the zombie carcass in the same env handle.
	set envE [berkdb env -create -errpfx E -home $dirE -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envE repmgr -local [list localhost $portE] -start client
	error_check_good nsites4 [$envA rep_get_nsites] 5
	error_check_good nsites5 [$envB rep_get_nsites] 5

	$envE close
	$envB close
	$envA close
}

# See that removing a non-existent site acts as a no-op, and doesn't yield an
# error.
proc z16 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir

	# Group size will be three, but allocate an extra port to act as the
	# non-existent site.
	foreach {portA portB portC portD} [available_ports 4] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts -nonewline "\tRepmgr029.z16: Create a group of 3 sites"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA] -start master
	puts -nonewline "." ; flush stdout

	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	puts -nonewline "." ; flush stdout

	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -verbose [list rep $rv] -event]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envC
	puts "."

	puts "\tRepmgr029.z16: Remove non-existent site"
	$envB repmgr -remove [list localhost $portD]
	error_check_good nsites [$envA rep_get_nsites] 3
	error_check_good nsites [$envB rep_get_nsites] 3
	error_check_good nsites [$envC rep_get_nsites] 3

	# While we're on the topic of removing sites, let's try having a site
	# remove itself.
	puts "\tRepmgr029.z16: Have site C remove itself"
	$envC repmgr -remove [list localhost $portC]
	error_check_good nsites [$envA rep_get_nsites] 2
	await_event $envC local_site_removed

	$envC close
	$envB close
	$envA close
}

# Exercise group creation with non-default ack policies.
proc z17 { } {
	global rep_verbose
	global testdir

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/A
	set dirB $testdir/B
	set dirC $testdir/C

	foreach policy {one onepeer all allpeers allavailable none} {
		env_cleanup $testdir
		file mkdir $dirA
		file mkdir $dirB
		file mkdir $dirC

		puts -nonewline "\tRepmgr029.z17: Create a group of 3 sites using\
		    `$policy' ack policy"
		set envA [berkdb env -create -errpfx A -home $dirA -txn \
		    -rep -thread -verbose [list rep $rv]]
		$envA repmgr -local [list localhost $portA creator] \
		    -start elect -ack $policy
		puts -nonewline "." ; flush stdout

		set envB [berkdb env -create -errpfx B -home $dirB -txn \
		    -rep -thread -verbose [list rep $rv] -event]
		$envB repmgr -local [list localhost $portB] \
		    -remote [list localhost $portA] -start elect -ack $policy
		await_startup_done $envB
		puts -nonewline "." ; flush stdout

		set envC [berkdb env -create -errpfx C -home $dirC -txn \
		    -rep -thread -verbose [list rep $rv] -event]
		$envC repmgr -local [list localhost $portC] \
		    -remote [list localhost $portA] -start elect
		await_startup_done $envC
		puts "."

		puts "\tRepmgr029.z17: Remove both clients."
		$envA repmgr -remove [list localhost $portB]
		error_check_good nsites [$envA rep_get_nsites] 2
		await_event $envB local_site_removed
		$envB close
		$envA repmgr -remove [list localhost $portC]
		error_check_good nsites [$envA rep_get_nsites] 1
		await_event $envC local_site_removed
		$envC close
		$envA close
	}
}

#
# Add a new site to an existing group, already populated via hot backup.
# a. Start site A as group creator.
# b. Start site B as client, and wait for it to sync.
# c. Hot back up site B's environment to directory C,
#    and start up site C using directory C.
# d. Check membership at site C.
#
proc z18 { } {
	global rep_verbose
	global testdir
	global util_path

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB portC} [available_ports 3] {}
	set dirA $testdir/dirA
	set dirB $testdir/dirB
	set dirC $testdir/dirC

	file mkdir $dirA
	file mkdir $dirB
	file mkdir $dirC

	puts -nonewline "\tRepmgr029.z18.a: Start site A as master."
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA creator] \
	    -start master
	error_check_good nsites_A [$envA rep_get_nsites] 1
	puts "." ; flush stdout

	puts -nonewline "\tRepmgr029.z18.b: Start site B"
	set envB [berkdb env -create -errpfx B -home $dirB -txn -rep -thread \
	    -verbose [list rep $rv]]
	$envB repmgr -local [list localhost $portB] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envB
	error_check_good nsites_B [$envB rep_get_nsites] 2
	puts "." ; flush stdout

	puts "\tRepmgr029.z18.c.1: Hot back up site B's environment to $dirC"
	# Ensure $dirC is empty before the hot backup.
	set files [glob -nocomplain $dirC/*]
	error_check_good no_files [llength $files] 0
	eval exec $util_path/db_hotbackup -vh $dirB -b $dirC

	puts "\tRepmgr029.z18.c.2: Start up site C in $dirC."
	set envC [berkdb env -create -errpfx C -home $dirC -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envC repmgr -local [list localhost $portC] \
	    -remote [list localhost $portA] -start client
	await_startup_done $envC
	error_check_good nsites_C [$envC rep_get_nsites] 3

	puts "\tRepmgr029.z18.c.3: Verify site C starts without internal init."
	error_check_good no_pages [stat_field $envC rep_stat "Pages received"] 0

	error_check_good siteC_close [$envC close] 0
	error_check_good siteB_close [$envB close] 0
	error_check_good siteA_close [$envA close] 0
}

#
# Initiate a group change during a long-running transaction at the master
# (waits for the transaction to abort).
# a. Start site A as master.
# b. Begin a transaction, write a record.
# c. Start a separate process to add a second site ("B") to the group.
# d. In the transaction from (b), write a record and sleep for a second in a
#    loop; this will run into a deadlock.
# e. Abort the txn when the deadlock occurs.
# f. After that, the joining operation in the other process should complete
#    successfully.
#
proc z19 {} {
	global rep_verbose
	global testdir
	global tclsh_path
	global test_path

	set rv off
	if { $rep_verbose == 1 } {
		set rv on
	}

	env_cleanup $testdir
	foreach {portA portB} [available_ports 2] {}
	set dirA $testdir/dirA
	set dirB $testdir/dirB

	file mkdir $dirA
	file mkdir $dirB

	puts "\tRepmgr029.z19.a: Start up site A as master"
	set envA [berkdb env -create -errpfx A -home $dirA -txn -rep -thread \
	    -recover -verbose [list rep $rv]]
	$envA repmgr -local [list localhost $portA creator] -start master
	error_check_good nsites_A [$envA rep_get_nsites] 1

	puts "\tRepmgr029.z19.b: Begin txn and open db on master."
	set txn [$envA txn]
	error_check_good txn_begin [is_valid_txn $txn $envA] TRUE
	set testfile repmgr029.db
	set oflags {-create -btree -mode 0755 -thread -env $envA \
	    -txn $txn $testfile}
	set db [eval {berkdb_open} $oflags]
	error_check_good db_open [is_valid_db $db] TRUE

	puts "\tRepmgr029.z19.c: Add site B in another process"
	set pid [exec $tclsh_path $test_path/wrap.tcl repmgr029script2.tcl \
	    $testdir/repmgr029script2.log $dirB $portB $dirA $portA &]

	puts "\tRepmgr029.z19.d: Write data in the txn, expecting deadlock"
	set maxcount 100
	for { set count 0 } { $count < $maxcount } { incr count } {
		set key $count
		set data "gmdb data"
		if { [catch {$db put -txn $txn $key $data} ret] } {
			error_check_good put_deadlock \
			    [is_substr $ret DB_LOCK_DEADLOCK] 1
			break
		} else {
			tclsleep 1
		}
	}
	error_check_good put_deadlock [is_substr $ret DB_LOCK_DEADLOCK] 1
	error_check_good txn_abort [$txn abort] 0

	puts "\tRepmgr029.z19.e: Confirm B has joined."
	for { set count 0 } { $count < $maxcount } { incr count } {
		if { [$envA rep_get_nsites] > 1 } {
			break
		} else {
			tclsleep 1
		}
	}
	watch_procs $pid 5
	error_check_good db_close [$db close] 0
	error_check_good master_close [$envA close] 0

	# Check the output file of the sub-process for failures.
	set file repmgr029script2.log
	set errstrings [eval findfail $testdir/$file]
	foreach str $errstrings {
		puts "$str"
	}
	error_check_good errstrings_llength [llength $errstrings] 0
}

# Dump the contents of the group membership database, for debugging.
proc repmgr029_dump_db { e } {
	set db [berkdb open -env $e -thread __db.rep.system __db.membership]
	set c [$db cursor]
	set format_version [lindex [$c get -first] 0 1]
	binary scan $format_version II fmt vers
	puts "version $vers"
	while {[llength [set r [$c get -next]]] > 0} {
		set k [lindex $r 0 0]
		set v [lindex $r 0 1]
		binary scan $k I len
		set hostname [string range $k 4 [expr 2 + $len]]
		binary scan $hostname A* host
		binary scan [string range $k [expr 4 + $len] end] S port
		binary scan $v I status
		puts "{$host $port} $status"
	}
	$c close
	$db close
}

# Return the EID of the site using the given port, or "" if it is unknown.
proc repmgr029_get_eid { e port } {
	set sle [repmgr029_get_site_list_entry $e $port]
	if { [string length $sle] == 0} {
		return ""
	}
	return [lindex $sle 0]
}

# Return the repmgr site list entry for the given port, or "" if none.
proc repmgr029_get_site_list_entry { e port } {
	foreach sle [$e repmgr_site_list] {
		set p [lindex $sle 2]
		if { $p == $port } {
			return $sle
		}
	}
	return ""
}

# Look up a site's status in the group membership database.
proc repmgr029_gmdb_status { db host port } {
	set l [string length $host]
	set key [binary format Ia*cS [expr $l + 1] $host 0 $port]
	set kvlist [$db get $key]
	if {[llength $kvlist] == 0} {
		return 0
	}
	set kvpair [lindex $kvlist 0]
	set val [lindex $kvpair 1]
	binary scan $val I status
	return $status
}

# Return the version number from the membership database's header record.
proc repmgr029_gmdb_version { db } {
	set key [binary format IS 0 0]
	set kvlist [$db get $key]
	set kvpair [lindex $kvlist 0]
	set val [lindex $kvpair 1]
	binary scan $val II format version
	return $version
}

# Return yes only if all three sites know which site is the master.
proc repmgr029_known_master { e1 e2 e3 } {
	foreach e [list $e1 $e2 $e3] {
		set m [stat_field $e rep_stat "Master environment ID"]
		if {$m == -2} {
			return no
		}
	}
	return yes
}

# Return the status field from the site list entry for the given port, or ""
# if the site is not in the list.
proc repmgr029_site_list_status { e port } {
	set sle [repmgr029_get_site_list_entry $e $port]
	if { [string length $sle] == 0 } {
		return ""
	}
	return [lindex $sle 3]
}