libdb/test/tcl/repmgr026.tcl

# See the file LICENSE for redistribution information.
#
# Copyright (c) 2009, 2011 Oracle and/or its affiliates.  All rights reserved.
#
# TEST	repmgr026
# TEST	Test of "full election" timeouts.
# TEST	1. Cold boot with all sites present.
# TEST	2. Cold boot with some sites missing.
# TEST	3. Partial-participation election with one client having seen a master,
# TEST	   but another just starting up fresh.
# TEST	4. Partial participation, with all participants already having seen a
# TEST	   master.
# TEST

# Driver for the "full election" timeout test.
#
# tnum - test number, used only to label the progress messages.
#
# Runs repmgr026_sub once for every combination of lease usage and
# "one client down", in the order: (no leases, all up), (no leases, one
# down), (leases, all up), (leases, one down).  The whole test is skipped
# when include.tcl reports a FreeBSD test run.
#
proc repmgr026 { { tnum 026 } } {
	source ./include.tcl

	if { $is_freebsd_test == 1 } {
		puts "Skipping replication manager test on FreeBSD platform."
		return
	}

	# Each consecutive pair in the list is one (use_leases, client_down)
	# configuration.
	foreach { use_leases client_down } { no no   no yes   yes no   yes yes } {
		puts "Repmgr$tnum: Full election test, \
		    client_down: $client_down; leases: $use_leases"
		repmgr026_sub $tnum $client_down $use_leases
	}
}

# Run one configuration of the full-election timeout test.
#
# tnum        - test number, used only to label the progress messages.
# client_down - boolean; when true, site E is not restarted for the cold
#               boot, so the first election runs with one site missing.
# use_leases  - boolean; when true, -rep_lease is added to every env's flags.
#
# Outline:
#   1. Create a five-site group (A through E, with A as creator), then close
#      every site.
#   2. Cold boot A through D (and E unless client_down), with A given the
#      highest priority, and time the first election.  With a site missing
#      the election is expected to take longer than full_secs_leeway seconds;
#      with every site present it is expected to take less.
#   3. Close master A (and E first, if it was running) and time the second
#      election among B, C and D.
#
proc repmgr026_sub { tnum client_down use_leases } {
	global testdir
	global repfiles_in_memory
	global rep_verbose
	global verbose_type
	
	# Optional env flags taken from the global test configuration.
	set verbargs ""
	if { $rep_verbose == 1 } {
		set verbargs " -verbose {$verbose_type on} "
	}

	set repmemargs ""
	if { $repfiles_in_memory } {
		set repmemargs "-rep_inmem_files "
	}

	# Start from a clean test directory with one home directory per site,
	# and get one port per site.
	env_cleanup $testdir
	file mkdir [set dira $testdir/SITE_A]
	file mkdir [set dirb $testdir/SITE_B]
	file mkdir [set dirc $testdir/SITE_C]
	file mkdir [set dird $testdir/SITE_D]
	file mkdir [set dire $testdir/SITE_E]
	foreach { porta portb portc portd porte } [available_ports 5] {}

	# First, just create/establish the group.
	puts -nonewline "Repmgr$tnum: Create a group of 5 sites: "
	set common "-create -txn $verbargs $repmemargs \
	    -rep -thread -event"
	if { $use_leases } {
		append common " -rep_lease {[list 3000000]} "
	}
	# The env-open commands are kept as strings so that each site can be
	# reopened later with exactly the same flags.
	set cmda "berkdb_env_noerr $common -errpfx SITE_A -home $dira"
	set cmdb "berkdb_env_noerr $common -errpfx SITE_B -home $dirb"
	set cmdc "berkdb_env_noerr $common -errpfx SITE_C -home $dirc"
	set cmdd "berkdb_env_noerr $common -errpfx SITE_D -home $dird"
	set cmde "berkdb_env_noerr $common -errpfx SITE_E -home $dire"
	# repmgr options shared by every site.  The timeout values are
	# presumably in microseconds (full_election 60000000 matching the
	# "60 seconds" mentioned further down) -- confirm against the repmgr
	# documentation.
	set common_mgr " -start elect \
	    -timeout {connection_retry 5000000} \
	    -timeout {election_retry 2000000} \
	    -timeout {full_election 60000000} \
	    -timeout {election 5000000} -timeout {ack 3000000}"
	# Site A creates the group; B through E each join through A and wait
	# for start-up to finish before the next site is started.
	set enva [eval $cmda]
	eval $enva repmgr $common_mgr  \
	    -local {[list localhost $porta creator]}
	puts -nonewline "." ; 	flush stdout
	set envb [eval $cmdb]
	eval $envb repmgr $common_mgr \
	    -local {[list localhost $portb]} -remote {[list localhost $porta]}
	await_startup_done $envb
	puts -nonewline "." ; 	flush stdout
	set envc [eval $cmdc]
	eval $envc repmgr $common_mgr \
	    -local {[list localhost $portc]} -remote {[list localhost $porta]}
	await_startup_done $envc
	puts -nonewline "." ; 	flush stdout
	set envd [eval $cmdd]
	eval $envd repmgr $common_mgr \
	    -local {[list localhost $portd]} -remote {[list localhost $porta]}
	await_startup_done $envd
	puts -nonewline "." ; 	flush stdout
	set enve [eval $cmde]
	eval $enve repmgr $common_mgr \
	    -local {[list localhost $porte]} -remote {[list localhost $porta]}
	await_startup_done $enve
	puts "."
	# Shut the whole group down so that the next phase is a cold boot.
	$enve close
	$envd close
	$envc close
	$envb close
	$enva close

	# Cold boot the group (with or without site E), giving site A a
	# high priority.
	#

	# The wait_limit's are intended to be an amount that is way more than
	# the expected timeout, used for nothing more than preventing the test
	# from hanging forever.  The leeway amount should be enough less than
	# the timeout to allow for any imprecision introduced by the test
	# mechanism.
	#
	set elect_wait_limit 25
	set full_secs_leeway 59
	set full_wait_limit 85

	puts "\tRepmgr$tnum.a: Start first four sites."
	set enva [eval $cmda]
	eval $enva repmgr $common_mgr -pri 200 -local {[list localhost $porta]}

	set envb [eval $cmdb]
	eval $envb repmgr $common_mgr -pri 100 -local {[list localhost $portb]}

	set envc [eval $cmdc]
	eval $envc repmgr $common_mgr -pri 90 -local {[list localhost $portc]}

	set envd [eval $cmdd]
	eval $envd repmgr $common_mgr -pri 80 -local {[list localhost $portd]}

	if { $client_down } {
		# NONE is a sentinel meaning "site E is not running".
		set enve NONE
	} else {
		puts "\tRepmgr$tnum.b: Start fifth site."
		set enve [eval $cmde]
		eval $enve repmgr $common_mgr -pri 50 \
		    -local {[list localhost $porte]}
	}

	# wait for results, and make sure they're correct
	#
	# Site A goes first in the list because the first handle is the one
	# repmgr026_await_election_result expects to win.
	set envlist [list $enva $envb $envc $envd]
	if { $enve != "NONE" } {
		lappend envlist $enve
	}
	set limit $full_wait_limit
	puts "\tRepmgr$tnum.c: wait (up to $limit seconds) for first election."
	set t [repmgr026_await_election_result $envlist $limit]
	if { $client_down } {
		# With a site missing, the election should not finish until
		# the full_election timeout has (nearly) run out.
		error_check_good slow_election [expr $t > $full_secs_leeway] 1
	} else {
		# When all sites participate, the election should finish in way
		# less than 60 seconds.
		#
		error_check_good timely_election [expr $t < $full_secs_leeway] 1
	}
	puts "\tRepmgr$tnum.d: first election completed in $t seconds"

	puts "\tRepmgr$tnum.e: wait for start-up done"
	# Clear each site's recorded events, so that the events checked for
	# after the second election cannot be left over from the first one.
	$enva event_info -clear
	await_startup_done $envb
	$envb event_info -clear
	await_startup_done $envc
	$envc event_info -clear
	await_startup_done $envd
	$envd event_info -clear
	if { $enve != "NONE" } {
		await_startup_done $enve
		$enve event_info -clear
	}

	# Shut down site A, in order to test elections with less than the whole
	# group voting.  However, normally repmgr's reaction to losing master
	# connection is to try a "fast election" (the n-1 trick).  So we must do
	# something to mitigate that (see below).
	#
	puts "\tRepmgr$tnum.f: shut down master site A"
	if { $client_down } {
		# The fifth site is already down, so now we'll have just B, C,
		# and D running.  Therefore, even with repmgr pulling its "fast
		# election" (n-1) trick, we don't have enough votes for a
		# full-participation short circuit; so this is a valid test of
		# the "normal" election timeout.
		#
		$enva close
	} else {
		# Here all sites are running, so if we just killed the master
		# repmgr would invoke its "fast election" trick, resulting in no
		# timeout.  Since the purpose of this test is to ensure the
		# correct use of timeouts, that's no good.  Instead, let's first
		# kill one more other site.
		$enve close
		$enva close
	}

	# wait for results, and check them
	#
	# Site B is listed first, so it is the expected winner among the three
	# remaining sites.
	#
	# NOTE(review): the wait below already fails once more than
	# elect_wait_limit (25) seconds have passed, so the comparison against
	# full_secs_leeway (59) can never fail; a tighter bound may have been
	# intended -- confirm.
	# NOTE(review): the step labels go from "f" to "h"; there is no "g".
	set envlist [list $envb $envc $envd]
	set limit $elect_wait_limit
	puts "\tRepmgr$tnum.h: wait (up to $limit seconds) for second election."
	set t [repmgr026_await_election_result $envlist $limit]
	error_check_good normal_election [expr $t < $full_secs_leeway] 1
	puts "\tRepmgr$tnum.i: second election completed in $t seconds"

	$envd close
	$envc close
	$envb close
}

# Wait (a limited amount of time) for the election to finish, polling once
# per loop iteration with a "tclsleep 1" in between.
#
# envlist - list of env handles.  The first handle is the expected winner,
#           and the others are the remaining clients.
# limit   - maximum number of seconds to keep waiting.
#
# Returns the approximate amount of time (in whole seconds) that the election
# took, measured from entry to the poll at which repmgr026_is_ready first
# reported success.  Raises the error "FAIL: time limit exceeded" if a poll
# starts more than limit seconds after entry.
#
proc repmgr026_await_election_result { envlist limit } {
	set begin [clock seconds]
	# The expr arguments are braced so that they are byte-compiled and
	# their operands are substituted only once.
	set deadline [expr {$begin + $limit}]
	while { true } {
		# Sample the clock before polling, so the reported duration
		# does not include the time spent in the readiness check.
		set t [clock seconds]
		if { $t > $deadline } {
			error "FAIL: time limit exceeded"
		}

		if { [repmgr026_is_ready $envlist] } {
			return [expr {$t - $begin}]
		}

		tclsleep 1
	}
}

# Report whether the election has completed as expected.
#
# envlist - list of env handles; the first is the expected winner and the
#           rest are the sites expected to follow it.
#
# Returns "true" only when is_elected succeeds for the first handle and
# is_event_present finds a newmaster event on every other handle; returns
# "false" otherwise.
#
proc repmgr026_is_ready { envlist } {
	set expected_master [lindex $envlist 0]
	if { [is_elected $expected_master] } {
		# The winner is in place; now every other site must have
		# reported a new master.
		foreach site [lrange $envlist 1 end] {
			if { [is_event_present $site newmaster] } {
				continue
			}
			return false
		}
		return true
	}
	return false
}
Release 5.2.28 on 6/10/2011 2011-09-13 17:44:24 +00:00			`# See the file LICENSE for redistribution information.`
			`#`
			`# Copyright (c) 2009, 2011 Oracle and/or its affiliates. All rights reserved.`
			`#`
			`# TEST repmgr026`
			`# TEST Test of "full election" timeouts.`
			`# TEST 1. Cold boot with all sites present.`
			`# TEST 2. Cold boot with some sites missing.`
			`# TEST 3. Partial-participation election with one client having seen a master,`
			`# TEST but another just starting up fresh.`
			`# TEST 4. Partial participation, with all participants already having seen a`
			`# TEST master.`
			`# TEST`

			`proc repmgr026 { { tnum 026 } } {`
			`source ./include.tcl`

			`if { $is_freebsd_test == 1 } {`
			`puts "Skipping replication manager test on FreeBSD platform."`
			`return`
			`}`

			`foreach use_leases {no yes} {`
			`foreach client_down {no yes} {`
			`puts "Repmgr$tnum: Full election test, \`
			`client_down: $client_down; leases: $use_leases"`
			`repmgr026_sub $tnum $client_down $use_leases`
			`}`
			`}`
			`}`

			`proc repmgr026_sub { tnum client_down use_leases } {`
			`global testdir`
			`global repfiles_in_memory`
			`global rep_verbose`
			`global verbose_type`

			`set verbargs ""`
			`if { $rep_verbose == 1 } {`
			`set verbargs " -verbose {$verbose_type on} "`
			`}`

			`set repmemargs ""`
			`if { $repfiles_in_memory } {`
			`set repmemargs "-rep_inmem_files "`
			`}`

			`env_cleanup $testdir`
			`file mkdir [set dira $testdir/SITE_A]`
			`file mkdir [set dirb $testdir/SITE_B]`
			`file mkdir [set dirc $testdir/SITE_C]`
			`file mkdir [set dird $testdir/SITE_D]`
			`file mkdir [set dire $testdir/SITE_E]`
			`foreach { porta portb portc portd porte } [available_ports 5] {}`

			`# First, just create/establish the group.`
			`puts -nonewline "Repmgr$tnum: Create a group of 5 sites: "`
			`set common "-create -txn $verbargs $repmemargs \`
			`-rep -thread -event"`
			`if { $use_leases } {`
			`append common " -rep_lease {[list 3000000]} "`
			`}`
			`set cmda "berkdb_env_noerr $common -errpfx SITE_A -home $dira"`
			`set cmdb "berkdb_env_noerr $common -errpfx SITE_B -home $dirb"`
			`set cmdc "berkdb_env_noerr $common -errpfx SITE_C -home $dirc"`
			`set cmdd "berkdb_env_noerr $common -errpfx SITE_D -home $dird"`
			`set cmde "berkdb_env_noerr $common -errpfx SITE_E -home $dire"`
			`set common_mgr " -start elect \`
			`-timeout {connection_retry 5000000} \`
			`-timeout {election_retry 2000000} \`
			`-timeout {full_election 60000000} \`
			`-timeout {election 5000000} -timeout {ack 3000000}"`
			`set enva [eval $cmda]`
			`eval $enva repmgr $common_mgr \`
			`-local {[list localhost $porta creator]}`
			`puts -nonewline "." ; flush stdout`
			`set envb [eval $cmdb]`
			`eval $envb repmgr $common_mgr \`
			`-local {[list localhost $portb]} -remote {[list localhost $porta]}`
			`await_startup_done $envb`
			`puts -nonewline "." ; flush stdout`
			`set envc [eval $cmdc]`
			`eval $envc repmgr $common_mgr \`
			`-local {[list localhost $portc]} -remote {[list localhost $porta]}`
			`await_startup_done $envc`
			`puts -nonewline "." ; flush stdout`
			`set envd [eval $cmdd]`
			`eval $envd repmgr $common_mgr \`
			`-local {[list localhost $portd]} -remote {[list localhost $porta]}`
			`await_startup_done $envd`
			`puts -nonewline "." ; flush stdout`
			`set enve [eval $cmde]`
			`eval $enve repmgr $common_mgr \`
			`-local {[list localhost $porte]} -remote {[list localhost $porta]}`
			`await_startup_done $enve`
			`puts "."`
			`$enve close`
			`$envd close`
			`$envc close`
			`$envb close`
			`$enva close`

			`# Cold boot the group (with or without site E), giving site A a`
			`# high priority.`
			`#`

			`# The wait_limit's are intended to be an amount that is way more than`
			`# the expected timeout, used for nothing more than preventing the test`
			`# from hanging forever. The leeway amount should be enough less than`
			`# the timeout to allow for any imprecision introduced by the test`
			`# mechanism.`
			`#`
			`set elect_wait_limit 25`
			`set full_secs_leeway 59`
			`set full_wait_limit 85`

			`puts "\tRepmgr$tnum.a: Start first four sites."`
			`set enva [eval $cmda]`
			`eval $enva repmgr $common_mgr -pri 200 -local {[list localhost $porta]}`

			`set envb [eval $cmdb]`
			`eval $envb repmgr $common_mgr -pri 100 -local {[list localhost $portb]}`

			`set envc [eval $cmdc]`
			`eval $envc repmgr $common_mgr -pri 90 -local {[list localhost $portc]}`

			`set envd [eval $cmdd]`
			`eval $envd repmgr $common_mgr -pri 80 -local {[list localhost $portd]}`

			`if { $client_down } {`
			`set enve NONE`
			`} else {`
			`puts "\tRepmgr$tnum.b: Start fifth site."`
			`set enve [eval $cmde]`
			`eval $enve repmgr $common_mgr -pri 50 \`
			`-local {[list localhost $porte]}`
			`}`

			`# wait for results, and make sure they're correct`
			`#`
			`set envlist [list $enva $envb $envc $envd]`
			`if { $enve != "NONE" } {`
			`lappend envlist $enve`
			`}`
			`set limit $full_wait_limit`
			`puts "\tRepmgr$tnum.c: wait (up to $limit seconds) for first election."`
			`set t [repmgr026_await_election_result $envlist $limit]`
			`if { $client_down } {`
			`error_check_good slow_election [expr $t > $full_secs_leeway] 1`
			`} else {`
			`# When all sites participate, the election should finish in way`
			`# less than 60 seconds.`
			`#`
			`error_check_good timely_election [expr $t < $full_secs_leeway] 1`
			`}`
			`puts "\tRepmgr$tnum.d: first election completed in $t seconds"`

			`puts "\tRepmgr$tnum.e: wait for start-up done"`
			`$enva event_info -clear`
			`await_startup_done $envb`
			`$envb event_info -clear`
			`await_startup_done $envc`
			`$envc event_info -clear`
			`await_startup_done $envd`
			`$envd event_info -clear`
			`if { $enve != "NONE" } {`
			`await_startup_done $enve`
			`$enve event_info -clear`
			`}`

			`# Shut down site A, in order to test elections with less than the whole`
			`# group voting. However, normally repmgr's reaction to losing master`
			`# connection is to try a "fast election" (the n-1 trick). So we must do`
			`# something to mitigate that (see below).`
			`#`
			`puts "\tRepmgr$tnum.f: shut down master site A"`
			`if { $client_down } {`
			`# The fifth site is already down, so now we'll have just B, C,`
			`# and D running. Therefore, even with repmgr pulling its "fast`
			`# election" (n-1) trick, we don't have enough votes for a`
			`# full-participation short circuit; so this is a valid test of`
			`# the "normal" election timeout.`
			`#`
			`$enva close`
			`} else {`
			`# Here all sites are running, so if we just killed the master`
			`# repmgr would invoke its "fast election" trick, resulting in no`
			`# timeout. Since the purpose of this test is to ensure the`
			`# correct use of timeouts, that's no good. Instead, let's first`
			`# kill one more other site.`
			`$enve close`
			`$enva close`
			`}`

			`# wait for results, and check them`
			`#`
			`set envlist [list $envb $envc $envd]`
			`set limit $elect_wait_limit`
			`puts "\tRepmgr$tnum.h: wait (up to $limit seconds) for second election."`
			`set t [repmgr026_await_election_result $envlist $limit]`
			`error_check_good normal_election [expr $t < $full_secs_leeway] 1`
			`puts "\tRepmgr$tnum.i: second election completed in $t seconds"`

			`$envd close`
			`$envc close`
			`$envb close`
			`}`

			`# Wait (a limited amount of time) for the election to finish. The first env`
			`# handle in the list is the expected winner, and the others are the remaining`
			`# clients. Returns the approximate amount of time (in seconds) that the`
			`# election took.`
			`#`
			`proc repmgr026_await_election_result { envlist limit } {`
			`set begin [clock seconds]`
			`set deadline [expr $begin + $limit]`
			`while { true } {`
			`set t [clock seconds]`
			`if { $t > $deadline } {`
			`error "FAIL: time limit exceeded"`
			`}`

			`if { [repmgr026_is_ready $envlist] } {`
			`return [expr $t - $begin]`
			`}`

			`tclsleep 1`
			`}`
			`}`

			`proc repmgr026_is_ready { envlist } {`
			`set winner [lindex $envlist 0]`
			`if {![is_elected $winner]} {`
			`return false`
			`}`

			`foreach client [lrange $envlist 1 end] {`
			`if {![is_event_present $client newmaster]} {`
			`return false`
			`}`
			`}`
			`return true`
			`}`