
# See the file LICENSE for redistribution information.
#
# Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
#
# $Id$
#
# TEST db_reptest
# TEST Wrapper to configure and run the db_reptest program.
#
# TODO:
# late client start.
# Number of message proc threads.
#
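#
# last_nsites remembers the site count used by the previous random
# iteration so that consecutive iterations do not keep choosing the
# same number of sites (see get_nsites).
#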
global last_nsites
set last_nsites 0
#
# There are several user-level procs that the user may invoke.
# 1. db_reptest - Runs randomized configurations in a loop.
# 2. basic_db_reptest - Runs a simple set configuration once,
# as a smoke test.
# 3. restore_db_reptest 'dir' - Runs the configuration given in 'dir'
# in a loop. The purpose is either to reproduce a problem
# that some configuration encountered, or test a fix.
# 4. db_reptest_prof - Runs a single randomized configuration
# and generates gprof profiling information for that run.
# 5. basic_db_reptest_prof - Runs a simple set configuration and
# generates gprof profiling information for that run.
# 6. restore_db_reptest_prof - Runs the configuration given in 'dir' and
# generates gprof profiling information for one run.
#
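# Example invocations from the test shell (a sketch only; this assumes the
# standard test suite setup has already been sourced, and the stop string,
# count and saved-run directory below are hypothetical):
#
#     % basic_db_reptest                        ;# one smoke-test run
#     % db_reptest "PANIC" 10                   ;# up to 10 randomized runs
#     % restore_db_reptest ./PREVIOUS_RUN "" 5  ;# replay a saved config 5 times
#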
#
# db_reptest - Run a randomized configuration. Run the test
# 'count' times in a loop, or until 'stopstr' is seen in the OUTPUT
# files. If neither a count nor a string is given, loop indefinitely.
#
proc db_reptest { { stopstr "" } {count -1} } {
berkdb srand [pid]
set cmd "db_reptest_int random"
db_reptest_loop $cmd $stopstr $count
}
#
# Run a basic reptest. The types are:
# Basic 0 - Two sites, start with site 1 as master, 5 worker threads, btree,
# run 100 seconds, onesite remote knowledge.
# Basic 1 - Three sites, all sites start as clients, 5 worker threads, btree,
# run 150 seconds, full remote knowledge.
#
proc basic_db_reptest { { basic 0 } } {
global util_path
if { [file exists $util_path/db_reptest] == 0 } {
puts "Skipping db_reptest. Is it built?"
return
}
if { $basic == 0 } {
db_reptest_int basic0
}
if { $basic == 1 } {
db_reptest_int basic1
}
}
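#
# Run a basic reptest once and then generate gprof profiling
# information for that run.
#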
proc basic_db_reptest_prof { { basic 0 } } {
basic_db_reptest $basic
generate_profiles
}
#
# Restore a configuration from the given directory and
# run that configuration in a loop 'count' times, or until
# 'stopstr' is seen in the OUTPUT files. If neither a count
# nor a string is given, loop indefinitely.
#
proc restore_db_reptest { restoredir { stopstr "" } { count -1 } } {
set cmd "db_reptest_int restore $restoredir/SAVE_RUN"
db_reptest_loop $cmd $stopstr $count
}
proc restore_db_reptest_prof { restoredir } {
restore_db_reptest $restoredir "" 1
generate_profiles
}
#
# Run a single randomized iteration and then generate the profile
# information for each site.
#
proc db_reptest_prof { } {
berkdb srand [pid]
set cmd "db_reptest_int random"
db_reptest_loop $cmd "" 1
generate_profiles
}
proc generate_profiles {} {
global dirs
global use
global util_path
#
# Once it is complete, generate profile information.
#
for { set i 1 } { $i <= $use(nsites) } { incr i } {
set gmon NULL
set known_gmons [list \
$dirs(env.$i)/db_reptest.gmon $dirs(env.$i)/gmon.out]
foreach gfile $known_gmons {
if { [file exists $gfile] } {
set gmon $gfile
break
}
}
if { $gmon == "NULL" } {
puts "No gmon file. Was it built with profiling?"
return
}
set prof_out db_reptest.$i.OUT
set stat [catch {exec gprof $util_path/db_reptest \
$gmon >>& $prof_out} ret]
if { $stat != 0 } {
puts "FAIL: gprof: $ret"
}
error_check_good gprof $stat 0
puts "Profiled output for site $i: $prof_out"
}
}
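#
# Alias for db_reptest_prof.
#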
proc db_reptest_profile { } {
db_reptest_prof
}
#
# Wrapper to run the command in a loop, 'count' times.
#
proc db_reptest_loop { cmd stopstr count } {
global util_path
if { [file exists $util_path/db_reptest] == 0 } {
puts "Skipping db_reptest. Is it built?"
return
}
set iteration 1
set start_time [clock format [clock seconds] -format "%H:%M %D"]
while { 1 } {
puts -nonewline "ITERATION $iteration: "
puts -nonewline \
[clock format [clock seconds] -format "%H:%M %D"]
puts " (Started: $start_time)"
#
eval $cmd
puts -nonewline "COMPLETED $iteration: "
puts [clock format [clock seconds] -format "%H:%M %D"]
incr iteration
#
# If we've been given a string to look for, run until we
# see it. Or if not, skip to the count check.
#
if { [string length $stopstr] > 0 } {
set found [search_output $stopstr]
if { $found } {
break
}
}
if { $count > 0 && $iteration > $count } {
break
}
}
}
#
# Internal version of db_reptest that all user-level procs
# eventually call. It will configure a single run of
# db_reptest based on the configuration type specified
# in 'cfgtype'. This proc will:
# Configure a run of db_reptest
# Run db_reptest
# Verify the sites after db_reptest completes.
#
proc db_reptest_int { cfgtype { restoredir NULL } } {
source ./include.tcl
global dirs
global use
env_cleanup $testdir
set dirs(save) TESTDIR/SAVE_RUN
set dirs(restore) $restoredir
reptest_cleanup $dirs(save)
#
# Set up the array to indicate if we are going to use
# a metadata dir or database data dir. If we are using
# datadirs, decide which one gets the created database.
#
get_datadirs $cfgtype use dirs
#
# Get all the default or random values needed for the test
# and its args first.
#
set runtime 0
#
# Get number of sites first because pretty much everything else
# after here depends on how many sites there are.
#
set use(nsites) [get_nsites $cfgtype $dirs(restore)]
set use(lease) [get_lease $cfgtype $dirs(restore)]
set use(peers) [get_peers $cfgtype]
#
# Get port information in case it needs to be converted for this
# run. A conversion will happen for a restored run if the current
# baseport is different from the one used in restoredir.
#
set portlist [available_ports $use(nsites)]
set baseport(curr) [expr [lindex $portlist 0] - 1]
set baseport(orig) [get_orig_baseport $cfgtype $dirs(restore)]
#
# Only use kill if we have > 2 sites.
# Returns a list. An empty list means this will not be a kill test.
# Otherwise the list has 3 values, the kill type and 2 kill sites.
# See the 'get_kill' proc for a description of kill types.
#
set use(kill) ""
set kill_type "NONE"
set kill_site 0
set kill_remover 0
set site_remove 0
if { $use(nsites) > 2 } {
set use(kill) [get_kill $cfgtype \
$dirs(restore) $use(nsites) baseport]
if { [llength $use(kill)] > 0 } {
set kill_type [lindex $use(kill) 0]
set kill_site [lindex $use(kill) 1]
set kill_remover [lindex $use(kill) 2]
} else {
# If we are not doing a kill test, determine if
# we are doing a remove test.
set site_remove [get_remove $cfgtype $dirs(restore) \
$use(nsites)]
}
}
if { $cfgtype != "restore" } {
if { $use(lease) } {
set use(master) 0
} else {
set use(master) [get_usemaster $cfgtype]
if { $site_remove == $use(master) } {
set site_remove 0
}
}
set master_site [get_mastersite $cfgtype $use(master) $use(nsites)]
set noelect [get_noelect $use(master)]
set master2_site [get_secondary_master \
$noelect $master_site $kill_site $use(nsites)]
set workers [get_workers $cfgtype $use(lease)]
set dbtype [get_dbtype $cfgtype]
set runtime [get_runtime $cfgtype]
puts "Running: $use(nsites) sites, $runtime seconds."
puts -nonewline "Running: "
if { $use(createdir) } {
puts -nonewline \
"$use(datadir) datadirs, createdir DATA.$use(createdir), "
}
if { $use(metadir) } {
puts -nonewline "METADIR, "
}
if { $kill_type == "DIE" || $kill_type == "REMOVE" } {
puts -nonewline "kill site $kill_site, "
}
if { $kill_type == "LIVE_REM" } {
puts -nonewline \
"live removal of site $kill_site by $kill_remover, "
} elseif { $site_remove } {
puts -nonewline "remove site $site_remove, "
}
if { $use(lease) } {
puts "with leases."
} elseif { $use(master) } {
set master_text "master site $master_site"
if { $noelect } {
set master_text [concat $master_text \
"no elections"]
}
if { $master2_site } {
set master_text [concat $master_text \
"secondary master site $master2_site"]
}
puts "$master_text."
} else {
puts "no master."
}
}
#
# This loop sets up the args to the invocation of db_reptest
# for each site.
#
for { set i 1 } {$i <= $use(nsites) } { incr i } {
set dirs(env.$i) TESTDIR/ENV$i
set dirs(home.$i) ../ENV$i
reptest_cleanup $dirs(env.$i)
#
# If we are restoring the args, just read them from the
# saved location for this site. Otherwise build up
# the args for each piece we need.
#
if { $cfgtype == "restore" } {
set cid [open $dirs(restore)/DB_REPTEST_ARGS.$i r]
set prog_args($i) [read $cid]
close $cid
#
# Convert -K port number arguments to the current
# baseport if needed. regsub -all substitutes
# all occurrences of the pattern, which is "-K "
# and a number. The result of regsub contains a tcl
# expression with the number (\2, the second part of
# the pattern), operators and variable names, e.g.:
# -K [expr 30104 - $baseport(orig) + $baseport(curr)]
# and then subst evaluates the tcl expression.
#
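# For illustration (hypothetical port numbers): with an original
# baseport of 30100 and a current baseport of 30250, an argument
# of "-K 30104" is rewritten to "-K 30254".
#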
if { $baseport(curr) != $baseport(orig) } {
regsub -all {(-K )([0-9]+)} $prog_args($i) \
{-K [expr \2 - $baseport(orig) + \
$baseport(curr)]} prog_args($i)
set prog_args($i) [subst $prog_args($i)]
}
if { $runtime == 0 } {
set runtime [parse_runtime $prog_args($i)]
puts "Runtime: $runtime"
}
} else {
set nmsg [berkdb random_int 1 [expr $use(nsites) * 2]]
set prog_args($i) \
"-v -c $workers -t $dbtype -T $runtime -m $nmsg "
set prog_args($i) \
[concat $prog_args($i) "-h $dirs(home.$i)"]
set prog_args($i) \
[concat $prog_args($i) "-o $use(nsites)"]
#
# Add in if this site should remove itself.
#
if { $site_remove == $i } {
set prog_args($i) [concat $prog_args($i) "-r"]
}
#
# Add in if this site should kill itself.
#
if { ($kill_type == "DIE" || $kill_type == "REMOVE") && \
$kill_site == $i} {
set prog_args($i) [concat $prog_args($i) "-k"]
}
#
# Add in if this site should remove a killed site.
#
if { $kill_remover == $i } {
set kport [lindex $portlist \
[expr $kill_site - 1]]
set prog_args($i) [concat $prog_args($i) \
"-K $kport"]
}
#
# Add in if this site starts as a master or client.
#
if { $i == $master_site } {
set state($i) MASTER
set prog_args($i) [concat $prog_args($i) "-M"]
} else {
set state($i) CLIENT
#
# If we have a master, then we just want to
# start as a client. Otherwise start with
# elections.
#
if { $use(master) } {
set prog_args($i) \
[concat $prog_args($i) "-C"]
} else {
set prog_args($i) \
[concat $prog_args($i) "-E"]
}
}
#
# Add in if we are in no elections mode and if we are
# the secondary master.
#
if { $noelect } {
set prog_args($i) [concat $prog_args($i) "-n"]
if { $i == $master2_site } {
set prog_args($i) \
[concat $prog_args($i) "-s"]
}
}
}
save_db_reptest $dirs(save) ARGS $i $prog_args($i)
}
# Now make the DB_CONFIG file for each site.
reptest_make_config $cfgtype dirs state use $portlist baseport
# Run the test
run_db_reptest dirs $use(nsites) $runtime $use(lease)
puts "Test run complete. Verify."
# Verify the test run.
verify_db_reptest $use(nsites) dirs use $kill_site $site_remove
# Show the summary files
print_summary
}
#
# Make a DB_CONFIG file for all sites in the group
#
proc reptest_make_config { cfgtype dirsarr starr usearr portlist baseptarr } {
upvar $dirsarr dirs
upvar $starr state
upvar $baseptarr baseport
upvar $usearr use
global rporttype
#
# Generate global config values that should be the same
# across all sites, such as number of sites and log size, etc.
#
set rporttype NULL
set default_cfglist {
{ "set_flags" "DB_TXN_NOSYNC" }
{ "rep_set_request" "150000 2400000" }
{ "rep_set_timeout" "db_rep_checkpoint_delay 0" }
{ "rep_set_timeout" "db_rep_connection_retry 2000000" }
{ "rep_set_timeout" "db_rep_heartbeat_monitor 1000000" }
{ "rep_set_timeout" "db_rep_heartbeat_send 500000" }
{ "set_cachesize" "0 4194304 1" }
{ "set_lg_max" "131072" }
{ "set_lk_detect" "db_lock_default" }
{ "set_verbose" "db_verb_recovery" }
{ "set_verbose" "db_verb_replication" }
}
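#
# Each {keyword value} pair above is later written to every site's
# DB_CONFIG file as a "keyword value" line (after subst), for example:
#     set_flags DB_TXN_NOSYNC
#     rep_set_request 150000 2400000
#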
set acks { db_repmgr_acks_all db_repmgr_acks_all_peers \
db_repmgr_acks_none db_repmgr_acks_one db_repmgr_acks_one_peer \
db_repmgr_acks_quorum }
#
# 2site strict and ack policy must be the same on all sites.
#
if { $cfgtype == "random" } {
if { $use(nsites) == 2 } {
set strict [berkdb random_int 0 1]
} else {
set strict 0
}
if { $use(lease) } {
#
# 2site strict with leases must have ack policy of
# one because quorum acks are ignored in this case,
# resulting in lease expired panics on some platforms.
#
if { $strict } {
set ackpolicy db_repmgr_acks_one
} else {
set ackpolicy db_repmgr_acks_quorum
}
} else {
set done 0
while { $done == 0 } {
set acksz [expr [llength $acks] - 1]
set myack [berkdb random_int 0 $acksz]
set ackpolicy [lindex $acks $myack]
#
# Only allow the "none" policy with 2 sites;
# otherwise it can overwhelm the system, and
# it is a rarely used option.
#
if { $ackpolicy == "db_repmgr_acks_none" && \
$use(nsites) > 2 } {
continue
}
#
# Only allow "all" or "all_peers" policies
# if not killing a site, otherwise the
# unavailable site will cause the master
# to ignore acks and blast the clients with
# log records.
#
if { [llength $use(kill)] > 0 && \
($ackpolicy == "db_repmgr_acks_all" || \
$ackpolicy == "db_repmgr_acks_all_peers") } {
continue
}
set done 1
}
}
} else {
set ackpolicy db_repmgr_acks_one
}
#
# Set known_master to the initial master, or if no master is
# assigned, randomly pick the site that will be the group creator.
#
set known_master 0
if { $cfgtype != "restore" } {
for { set i 1 } { $i <= $use(nsites) } { incr i } {
if { $state($i) == "MASTER" } {
set known_master $i
}
}
if { $known_master == 0 } {
set known_master [berkdb random_int 1 $use(nsites)]
}
}
for { set i 1 } { $i <= $use(nsites) } { incr i } {
#
# If we're restoring we just need to copy it.
#
if { $cfgtype == "restore" } {
#
# Convert DB_CONFIG port numbers to current baseport
# if needed.
#
set restore_cfile $dirs(restore)/DB_CONFIG.$i
set new_cfile $dirs(env.$i)/DB_CONFIG
set new_save_cfile $dirs(save)/DB_CONFIG.$i
if { $baseport(curr) != $baseport(orig) } {
convert_config_ports $restore_cfile \
$new_cfile baseport
file copy $new_cfile $new_save_cfile
} else {
file copy $restore_cfile $new_cfile
file copy $restore_cfile $new_save_cfile
}
if { $use(metadir) } {
file mkdir $dirs(env.$i)/METADIR
}
if { $use(datadir) } {
for { set diri 1 } { $diri <= $use(datadir) } \
{ incr diri } {
file mkdir $dirs(env.$i)/DATA.$diri
}
}
continue
}
#
# Otherwise set up per-site config information
#
set cfglist $default_cfglist
#
# Add lease configuration if needed. We're running all
# locally, so there is no clock skew.
#
set allist [get_ack_lease_timeouts $use(lease)]
if { $use(lease) } {
#
# We need to have an ack timeout > lease timeout.
# Otherwise txns can get committed without waiting
# long enough for leases to get granted.
#
lappend cfglist { "rep_set_config" "db_rep_conf_lease" }
lappend cfglist { "rep_set_timeout" \
"db_rep_lease_timeout [lindex $allist 1]" }
lappend cfglist { "rep_set_timeout" \
"db_rep_ack_timeout [lindex $allist 0]" }
} else {
lappend cfglist { "rep_set_timeout" \
"db_rep_ack_timeout [lindex $allist 0]" }
}
#
# Add datadirs and the metadir, if needed. If we are using
# datadirs, then set which one is the create dir.
#
if { $use(metadir) } {
file mkdir $dirs(env.$i)/METADIR
lappend cfglist { "set_metadata_dir" "METADIR" }
}
if { $use(datadir) } {
for { set diri 1 } { $diri <= $use(datadir) } \
{ incr diri } {
file mkdir $dirs(env.$i)/DATA.$diri
#
# Need to add to list in 2 steps otherwise
# $diri in the list will not get evaluated
# until later.
#
set litem [list add_data_dir DATA.$diri]
lappend cfglist $litem
}
lappend cfglist { "set_create_dir" \
"DATA.$use(createdir)" }
}
#
# Priority
#
if { $state($i) == "MASTER" } {
lappend cfglist { "rep_set_priority" 100 }
} else {
if { $cfgtype == "random" } {
set pri [berkdb random_int 10 25]
} else {
set pri 20
}
set litem [list rep_set_priority $pri]
lappend cfglist $litem
}
#
# Others: limit size, bulk, 2site strict
#
if { $cfgtype == "random" } {
set limit_sz [berkdb random_int 15000 1000000]
set bulk [berkdb random_int 0 1]
if { $bulk } {
lappend cfglist \
{ "rep_set_config" "db_rep_conf_bulk" }
}
#
# 2site strict was set above for all sites but
# should only be used for sites in random configs.
#
if { $strict } {
lappend cfglist { "rep_set_config" \
"db_repmgr_conf_2site_strict" }
}
} else {
set limit_sz 100000
}
set litem [list rep_set_limit "0 $limit_sz"]
lappend cfglist $litem
set litem [list repmgr_set_ack_policy $ackpolicy]
lappend cfglist $litem
#
# Now set up the local and remote ports. If we are the
# known_master (either master or group creator) set the
# group creator flag on.
#
# Must use explicit 127.0.0.1 rather than localhost because
# localhost can be configured differently on different
# machines or platforms. Use of localhost can cause
# available_ports to return ports that are actually in use.
#
set lport($i) [lindex $portlist [expr $i - 1]]
if { $i == $known_master } {
#
# Any change to this generated syntax will probably
# require a change to get_orig_baseport, which relies
# on this ordering and these embedded spaces.
#
set litem [list repmgr_site \
"127.0.0.1 $lport($i) db_local_site on \
db_group_creator on"]
} else {
set litem [list repmgr_site \
"127.0.0.1 $lport($i) db_local_site on"]
}
lappend cfglist $litem
set rport($i) [get_rport $portlist $i $use(nsites) \
$known_master $cfgtype]
#
# Declare each remote site this site knows about as a bootstrap helper.
#
foreach p $rport($i) {
if { $use(peers) } {
set litem [list repmgr_site "127.0.0.1 $p \
db_bootstrap_helper on db_repmgr_peer on"]
} else {
set litem [list repmgr_site "127.0.0.1 $p \
db_bootstrap_helper on"]
}
#
# If we have full knowledge, assume a legacy system.
#
if { $cfgtype == "full" } {
lappend litem "db_legacy on"
}
lappend cfglist $litem
}
#
# Now write out the DB_CONFIG file.
#
set cid [open $dirs(env.$i)/DB_CONFIG a]
foreach c $cfglist {
set carg [subst [lindex $c 0]]
set cval [subst [lindex $c 1]]
puts $cid "$carg $cval"
}
close $cid
set cid [open $dirs(env.$i)/DB_CONFIG r]
set cfg [read $cid]
close $cid
save_db_reptest $dirs(save) CONFIG $i $cfg
}
}
proc reptest_cleanup { dir } {
#
# For now, just completely remove it all. We might want
# to use env_cleanup at some point in the future.
#
fileremove -f $dir
file mkdir $dir
}
proc save_db_reptest { savedir op site savelist } {
#
# Save a copy of the configuration and args used to run this
# instance of the test.
#
if { $op == "CONFIG" } {
set outfile $savedir/DB_CONFIG.$site
} else {
set outfile $savedir/DB_REPTEST_ARGS.$site
}
set cid [open $outfile a]
puts -nonewline $cid $savelist
close $cid
}
proc run_db_reptest { dirsarr numsites runtime use_lease } {
source ./include.tcl
upvar $dirsarr dirs
global killed_procs
set pids {}
#
# Wait three times the workload run time, plus an ack_timeout per site,
# before killing a run. The ack_timeout is especially significant for runs
# where leases are in use because those runs take much longer to get started.
#
set ack_timeout [lindex [get_ack_lease_timeouts $use_lease] 0]
set watch_time [expr $runtime * 3 + \
[expr $ack_timeout / 1000000] * $numsites]
for {set i 1} {$i <= $numsites} {incr i} {
lappend pids [exec $tclsh_path $test_path/wrap_reptest.tcl \
$dirs(save)/DB_REPTEST_ARGS.$i $dirs(env.$i) \
$dirs(save)/site$i.log &]
tclsleep 1
}
watch_procs $pids 15 $watch_time
set killed [llength $killed_procs]
if { $killed > 0 } {
error "Processes $killed_procs never finished"
}
}
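#
# Verify the sites after a run completes: open the first surviving
# environment and compare its database contents against every other
# surviving site, skipping any site that was killed or removed.
#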
proc verify_db_reptest { num_sites dirsarr usearr kill site_rem } {
upvar $dirsarr dirs
upvar $usearr use
set startenv 1
set cmpeid 2
if { $kill == 1 || $site_rem == 1 } {
set startenv 2
set cmpeid 3
}
set envbase [berkdb_env_noerr -home $dirs(env.$startenv)]
set datadir ""
if { $use(createdir) } {
set datadir DATA.$use(createdir)
}
for { set i $cmpeid } { $i <= $num_sites } { incr i } {
if { $i == $kill || $i == $site_rem } {
continue
}
set cmpenv [berkdb_env_noerr -home $dirs(env.$i)]
puts "Compare $dirs(env.$startenv) with $dirs(env.$i)"
#
# Compare 2 envs. We assume the database that db_reptest
# creates is named 'am1.db'.
# We pass as the other args:
# 0 - compare_shared_portion
# 1 - match databases
# 0 - don't compare logs (for now)
rep_verify $dirs(env.$startenv) $envbase $dirs(env.$i) $cmpenv \
0 1 0 am1.db $datadir
$cmpenv close
}
$envbase close
}
proc get_nsites { cfgtype restoredir } {
global last_nsites
#
# Figure out the number of sites. We use 'glob' to get all of
# the valid DB_CONFIG files in the restoredir. That command uses
# a single digit match, so the maximum number of sites must be <= 9.
# Match DB_CONFIG.# so that it does not consider anything like an
# emacs save file.
#
set maxsites 5
#
# If someone changes maxsites to be too big, it will break the
# 'glob' below. Catch that now.
#
if { $maxsites > 9 } {
error "Max sites too large."
}
if { $cfgtype == "restore" } {
set ret [catch {glob $restoredir/DB_CONFIG.\[1-$maxsites\]} \
result]
if { $ret != 0 } {
error "Could not get config list: $result"
}
return [llength $result]
}
if { $cfgtype == "random" } {
#
# Sometimes 'random' doesn't seem to do a good job. I have
# seen that on all iterations after the first one, nsites is
# always 2, 100% of the time. Add this check to make sure
# the nsites value is different from the last iteration.
#
set n [berkdb random_int 2 $maxsites]
while { $n == $last_nsites } {
set n [berkdb random_int 2 $maxsites]
puts "Getting random nsites between 2 and $maxsites. Got $n, last_nsites $last_nsites"
}
set last_nsites $n
return $n
}
if { $cfgtype == "basic0" } {
return 2
}
if { $cfgtype == "basic1" } {
return 3
}
return -1
}
#
# Run with master leases? 25%/75% (use a master lease 25% of the time).
#
proc get_lease { cfgtype restoredir } {
#
# The lease setting must be the same on all sites. Read the
# first site's saved DB_CONFIG file if we're restoring, since
# we only know we have at least 1 site.
#
if { $cfgtype == "restore" } {
set uselease 0
set cid [open $restoredir/DB_CONFIG.1 r]
while { [gets $cid cfglist] } {
# puts "Read in: $cfglist"
if { [llength $cfglist] == 0 } {
break;
}
set cfg [lindex $cfglist 0]
if { $cfg == "rep_set_config" } {
set lease [lindex $cfglist 1]
if { $lease == "db_rep_conf_lease" } {
set uselease 1
break;
}
}
}
close $cid
return $uselease
}
if { $cfgtype == "random" } {
set leases { 1 0 0 0 }
set len [expr [llength $leases] - 1]
set i [berkdb random_int 0 $len]
return [lindex $leases $i]
}
if { $cfgtype == "basic0" } {
return 0
}
if { $cfgtype == "basic1" } {
return 0
}
}
#
# Do a kill test about half the time. We randomly choose a
# site number to kill; it could be a master or a client. If
# we want to remove the site from the group, randomly choose
# a site to do the removal.
#
# We return a list with the kill type and the sites. Return
# an empty list if we don't kill any site. There are a few variants:
#
# 1: Die - A site just kills itself but remains part of the group.
# Return a list {DIE deadsite# 0}.
# 2: Removal - A site kills itself, and some site will also remove
# the dead site from the group. (Could be the same site that is dying,
# in which case the removal is done right before it exits.)
# {REMOVE deadsite# removalsite#}.
# 3: Live removal - Some site removes another live site from the group.
# (Could be itself.)
# {LIVE_REM killsite# removalsite#}.
#
proc get_kill { cfgtype restoredir num_sites basept } {
upvar $basept baseport
set nokill ""
if { $cfgtype == "restore" } {
set ksite 0
set localkill 0
set rkill 0
set rsite 0
set kport 0
set ktype NONE
for { set i 1 } { $i <= $num_sites } { incr i } {
set cid [open $restoredir/DB_REPTEST_ARGS.$i r]
# !!!
# We currently assume the args file is 1 line.
#
gets $cid arglist
close $cid
# puts "Read in: $arglist"
set dokill [lsearch $arglist "-k"]
set dorem [lsearch $arglist "-K"]
#
# Only 1 of those args should ever be set for a given
# input line. We need to look at all sites in order
# to determine the kill type. If we find both -k and
# -K (on different lines), the killed site will be the same,
# so overwriting it is okay no matter what order the sites
# are processed in.
#
if { $dokill >= 0 } {
set ksite $i
set localkill 1
}
#
# If it is a remote removal kill type, we are
# the site doing the removing and we need to get
# the site to remove from the arg. $dorem is the
# index of the arg, so + 1 is the site number.
# The site in the arg is the port number so grab
# the site number out of it.
#
if { $dorem >= 0 } {
set rkill 1
set kport [lindex $arglist [expr $dorem + 1]]
set ksite [expr $kport - $baseport(orig)]
# Convert kport to current baseport if needed.
if { $baseport(curr) != $baseport(orig) } {
set kport [expr $kport - \
$baseport(orig) + $baseport(curr)]
}
set rsite $i
}
}
#
# If we have a remote kill, then we decide the kill type
# based on whether the killed site will be dead or alive.
# If we found no site to kill/remove, we know it is not
# a kill test.
#
if { $ksite == 0 } {
return $nokill
} else {
#
# See proc comment for a definition of each kill type.
#
if { $localkill == 1 && $rkill == 0 } {
set ktype DIE
}
if { $localkill == 1 && $rkill == 1 } {
set ktype REMOVE
}
if { $localkill == 0 && $rkill == 1 } {
set ktype LIVE_REM
}
return [list $ktype $ksite $rsite]
}
}
if { $cfgtype == "random" } {
# Do a kill and/or removal test half the time.
set k { 0 0 0 1 1 1 0 1 1 0 }
set len [expr [llength $k] - 1]
set i [berkdb random_int 0 $len]
set dokill [lindex $k $i]
set i [berkdb random_int 0 $len]
set dorem [lindex $k $i]
#
# Set up for the possibilities listed above.
#
if { $dokill == 0 && $dorem == 0 } {
return $nokill
}
#
# Choose which sites to kill and do removal.
#
set ksite [berkdb random_int 1 $num_sites]
set rsite [berkdb random_int 1 $num_sites]
if { $dokill == 1 && $dorem == 0 } {
set ktype DIE
set rsite 0
}
if { $dokill == 1 && $dorem == 1 } {
set ktype REMOVE
}
if { $dokill == 0 && $dorem == 1 } {
set ktype LIVE_REM
}
return [list $ktype $ksite $rsite]
}
if { $cfgtype == "basic0" || $cfgtype == "basic1" } {
return $nokill
} else {
error "Get_kill: Invalid config type $cfgtype"
}
}
#
# If we want to run a remove/rejoin, which site? This proc
# will return a site number of a site to remove/rejoin or
# it will return 0 if no removal test. Sites are numbered
# starting at 1.
#
proc get_remove { cfgtype restoredir nsites } {
set rsite 0
if { $cfgtype == "random" } {
# Do a remove test half the time we're called.
set k { 0 0 0 1 1 1 0 1 1 0 }
set len [expr [llength $k] - 1]
set i [berkdb random_int 0 $len]
if { [lindex $k $i] == 1 } {
set rsite [berkdb random_int 1 $nsites]
}
} elseif { $cfgtype == "restore" } {
#
# If we're restoring we still need to know if a site is
# running its own removal test so we know to skip it for verify.
#
for { set i 1 } { $i <= $nsites } { incr i } {
set cid [open $restoredir/DB_REPTEST_ARGS.$i r]
# !!!
# We currently assume the args file is 1 line.
# This code also assumes only 1 site ever does removal.
#
gets $cid arglist
close $cid
# puts "Read in: $arglist"
set dorem [lsearch $arglist "-r"]
if { $dorem >= 0 } {
set rsite $i
#
# If we find one, we know no other site will
# be doing removal. So stop now.
#
break
}
}
}
return $rsite
}
#
# Use peers or only the master for requests? 25%/75% (use a peer 25%
# of the time and master 75%)
#
proc get_peers { cfgtype } {
if { $cfgtype == "random" } {
set peer { 0 0 0 1 }
set len [expr [llength $peer] - 1]
set i [berkdb random_int 0 $len]
return [lindex $peer $i]
} else {
return 0
}
}
#
# Start with a master or all clients? 25%/75% (use a master 25%
# of the time and have all clients 75%)
#
proc get_usemaster { cfgtype } {
if { $cfgtype == "random" } {
set mst { 1 0 0 0 }
set len [expr [llength $mst] - 1]
set i [berkdb random_int 0 $len]
return [lindex $mst $i]
}
if { $cfgtype == "basic0" } {
return 1
}
if { $cfgtype == "basic1" } {
return 0
}
}
#
# If we use a master, which site? This proc will return
# the site number of the mastersite, or it will return
# 0 if no site should start as master. Sites are numbered
# starting at 1.
#
proc get_mastersite { cfgtype usemaster nsites } {
if { $usemaster == 0 } {
return 0
}
if { $cfgtype == "random" } {
return [berkdb random_int 1 $nsites]
}
if { $cfgtype == "basic0" } {
return 1
}
if { $cfgtype == "basic1" } {
return 0
}
}
#
# If we are using a master, use no elections 20% of the time.
#
proc get_noelect { usemaster } {
if { $usemaster } {
set noelect { 0 0 1 0 0 }
set len [expr [llength $noelect] - 1]
set i [berkdb random_int 0 $len]
return [lindex $noelect $i]
} else {
return 0
}
}
#
# If we are using no elections mode and we are going to kill the initial
# master, select a different site to start up as master after the initial
# master is killed.
#
proc get_secondary_master { noelect master_site kill nsites } {
if { $noelect == 0 || $kill != $master_site} {
return 0
}
set master2_site [berkdb random_int 1 $nsites]
while { $master2_site == $master_site } {
set master2_site [berkdb random_int 1 $nsites]
}
return $master2_site
}
#
# This is the number of worker threads performing the workload.
# This is not the number of message processing threads.
#
# Scale back the number of worker threads if leases are in use.
# The timing with leases can be fairly sensitive and since all sites
# run on the local machine, too many workers on every site can
# overwhelm the system, causing lost messages and delays that make
# the tests fail. Rather than try to tweak timeouts, just reduce
# the workloads a bit.
#
proc get_workers { cfgtype lease } {
if { $cfgtype == "random" } {
if { $lease } {
return [berkdb random_int 2 4]
} else {
return [berkdb random_int 2 8]
}
}
if { $cfgtype == "basic0" || $cfgtype == "basic1" } {
return 5
}
}
proc get_dbtype { cfgtype } {
if { $cfgtype == "random" } {
#
# Intended distribution: 50% btree, 25% queue, 12.5% hash, 12.5% recno.
# We favor queue only because there is special handling
# for queue in internal init. Queue is currently disabled in the
# list below, so the effective split is btree 4/6, hash 1/6, recno 1/6.
#
# set methods {btree btree btree btree queue queue hash recno}
set methods {btree btree btree btree hash recno}
set len [expr [llength $methods] - 1]
set i [berkdb random_int 0 $len]
return [lindex $methods $i]
}
if { $cfgtype == "basic0" || $cfgtype == "basic1" } {
return btree
}
}
proc get_runtime { cfgtype } {
if { $cfgtype == "random" } {
return [berkdb random_int 100 500]
}
if { $cfgtype == "basic0" } {
return 100
}
if { $cfgtype == "basic1" } {
return 150
}
}
proc get_rport { portlist i num_sites known_master cfgtype} {
global rporttype
if { $cfgtype == "random" && $rporttype == "NULL" } {
set types {backcirc forwcirc full onesite}
set len [expr [llength $types] - 1]
set rindex [berkdb random_int 0 $len]
set rporttype [lindex $types $rindex]
}
if { $cfgtype == "basic0" } {
set rporttype onesite
}
if { $cfgtype == "basic1" } {
set rporttype full
}
#
# This produces a circular knowledge ring, either forward
# or backward. In the forwcirc, ENV1 knows (via its repmgr_site
# helper entries) about ENV2, ENV2 knows about ENV3, ..., ENVX
# knows about ENV1.
#
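# For example, with 4 sites and hypothetical ports 30101-30104, forwcirc
# gives site 1 -> 30102, site 2 -> 30103, site 3 -> 30104, site 4 -> 30101;
# backcirc reverses the direction.
#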
if { $rporttype == "forwcirc" } {
if { $i != $num_sites } {
return [list [lindex $portlist $i]]
} else {
return [list [lindex $portlist 0]]
}
}
if { $rporttype == "backcirc" } {
if { $i != 1 } {
return [list [lindex $portlist [expr $i - 2]]]
} else {
return [list [lindex $portlist [expr $num_sites - 1]]]
}
}
#
# This produces a configuration where site N does not know
# about any other site and every other site knows about site N.
# Site N must either be the master or group creator.
# NOTE: helper_site subtracts one because site numbers
# are 1-based and list indices are 0-based.
#
if { $rporttype == "onesite" } {
set helper_site [expr $known_master - 1]
if { $i == $known_master } {
return {}
}
return [lindex $portlist $helper_site]
}
#
# This produces a fully connected configuration
#
if { $rporttype == "full" } {
set rlist {}
for { set site 1 } { $site <= $num_sites } { incr site } {
if { $site != $i } {
lappend rlist \
[lindex $portlist [expr $site - 1]]
}
}
return $rlist
}
}
#
# We need to have an ack timeout > lease timeout. Otherwise txns can get
# committed without waiting long enough for leases to get granted. We
# return a list {acktimeout# leasetimeout#}, with leasetimeout#=0 if leases
# are not in use.
#
proc get_ack_lease_timeouts { useleases } {
if { $useleases } {
return [list 20000000 10000000]
} else {
return [list 5000000 0]
}
}
#
# Use datadir half the time. Then pick how many and which datadir
# the database should reside in. Use a metadata dir 25% of the time.
#
proc get_datadirs { cfgtype usearr dirarr } {
upvar $usearr use
upvar $dirarr dir
set use(datadir) 0
set use(createdir) 0
set use(metadir) 0
if { $cfgtype == "random" } {
set meta { 0 0 0 1 }
#
# Randomly pick if we use datadirs, and if so, how many, up to 4.
# Although we may create several datadirs, we only choose one
# of them in which to create the database.
#
set data { 0 0 0 0 1 2 3 4 }
set mlen [expr [llength $meta] - 1]
set dlen [expr [llength $data] - 1]
set im [berkdb random_int 0 $mlen]
set id [berkdb random_int 0 $dlen]
set use(datadir) [lindex $data $id]
set use(metadir) [lindex $meta $im]
#
# If we're using datadirs, then randomly pick the creation dir.
#
if { $use(datadir) != 0 } {
set use(createdir) [berkdb random_int 1 $use(datadir)]
}
} elseif { $cfgtype == "restore" } {
set cid [open $dir(restore)/DB_CONFIG.1 r]
set cfg [read $cid]
# Look for metadata_dir, add_data_dir and set_create_dir.
set use(metadir) [regexp -all {(set_metadata_dir)} $cfg]
set use(datadir) [regexp -all {(add_data_dir)} $cfg]
if { $use(datadir) } {
set c [regexp {(set_create_dir )(DATA.[0-9])} $cfg m cr]
#
# We need to extract the directory number from the
# createdir directory name. I.e., DATA.2 needs '2'.
#
regexp {(DATA.)([0-9])} $m match d use(createdir)
}
close $cid
}
return 0
}
#
# Get the original baseport for a configuration to be restored by using
# the local site port number for its first site because every configuration
# will have a first site.
#
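# For example (hypothetical port), if DB_CONFIG.1 contains the line
# "repmgr_site 127.0.0.1 30101 db_local_site on ...", the original
# baseport is 30100.
#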
proc get_orig_baseport { cfgtype { restoredir NULL } } {
if { $cfgtype != "restore" } {
return 0
} else {
set cid [open $restoredir/DB_CONFIG.1 r]
set cfg [read $cid]
# Look for a number between "127.0.0.1" and "db_local_site on".
# The spaces after 127.0.0.1 and before db_local_site are
# significant in the pattern match.
regexp {(127.0.0.1 )([0-9]+)( db_local_site on)} $cfg \
match p1 pnum
close $cid
return [expr $pnum - 1]
}
}
#
# Convert DB_CONFIG file port numbers following "127.0.0.1 " to use a
# different baseport. regsub -all substitutes all occurrences of the pattern,
# which is "127.0.0.1 " and a number. The result of regsub contains a tcl
# expression with the number (\2, the second part of the pattern), operators
# and variable names, e.g.:
# 127.0.0.1 [expr 30104 - $baseport(orig) + $baseport(curr)]
# and then subst evaluates the tcl expression.
#
# Writes a converted copy of orig_file to new_file.
#
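# For illustration (hypothetical ports): with an original baseport of
# 30100 and a current baseport of 30250, the line
#     repmgr_site 127.0.0.1 30103 db_local_site on
# is rewritten as
#     repmgr_site 127.0.0.1 30253 db_local_site on
#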
proc convert_config_ports { orig_file new_file basept } {
upvar $basept baseport
set cid [open $orig_file r]
set cfg [read $cid]
regsub -all {(127.0.0.1 )([0-9]+)} $cfg \
{127.0.0.1 [expr \2 - $baseport(orig) + $baseport(curr)]} cfg
set cfg [subst $cfg]
close $cid
set cid [open $new_file a]
puts -nonewline $cid $cfg
close $cid
return 0
}
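#
# Extract the run time (seconds) from the "-T" option in a saved
# db_reptest argument string.
#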
proc parse_runtime { progargs } {
set i [lsearch $progargs "-T"]
set val [lindex $progargs [expr $i + 1]]
return $val
}
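#
# Display the contents of each summary.* file found in the test
# directory after a run.
#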
proc print_summary { } {
source ./include.tcl
set ret [catch {glob $testdir/summary.*} result]
if { $ret == 0 } {
set sumfiles $result
} else {
puts "Could not get summary list: $result"
return 1
}
foreach f $sumfiles {
puts "==== $f ===="
set ret [catch {open $f} fd]
if { $ret != 0 } {
puts "Error opening $f: $fd"
continue
}
while { [gets $fd line] >= 0 } {
puts "$line"
}
close $fd
}
return 0
}
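#
# Search each site's OUTPUT file for the given stop string. Returns 1
# if the string is found in any OUTPUT file, 0 otherwise.
#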
proc search_output { stopstr } {
source ./include.tcl
set ret [catch {glob $testdir/E*/OUTPUT} result]
if { $ret == 0 } {
set outfiles $result
} else {
puts "Could not find any OUTPUT files: $result"
return 0
}
set found 0
foreach f $outfiles {
set ret [catch {exec grep $stopstr $f > /dev/null} result]
if { $ret == 0 } {
puts "$f: Match found: $stopstr"
set found 1
}
}
return $found
}