#!/usr/bin/perl -w use strict; ## sample invocation: #perl tpc-ch-generator.pl --sf=30 | sort -S 2G -T /mnt/sdb1/tmp -t, -k9,9n -k7,7n -k8,8n -k2,2n -k1,1 -k3,3n -k4,4n -k5,5n -k6,6n -k10,10n -k11,11n -k12,12n > /home/sears/workload-vldb/sf30.sort sub pickCountry { my $a = shift; my @big_fish = @{$a}; my $p = rand(100); for(my $i = 0; $i < @big_fish; $i++) { if($p < $big_fish[$i]) { return $i; } $p -= $big_fish[$i]; } return int(rand(200-@big_fish)) + (@big_fish); } sub addtime { my $day = shift; my $week = shift; my $year = shift; my $delta = shift; my $delday = $day + $delta; my $delweek = $week; my $delyear = $year; while($delday > 6) { $delday-=7; $delweek++; } while($delweek > 51) { $delweek-=52; $delyear++; } return ($delday, $delweek, $delyear); } # Proportions based on canada import/export according to WTO, Oct 2007 report) my @big_supp_fish = qw (54.9 12.3 8.7 4.0 3.9); sub pickSupplierCountry { return pickCountry(\@big_supp_fish); } my @big_cust_fish = qw (81.6 6.6 2.1 1.7 1.0); sub pickCustomerCountry { return pickCountry(\@big_cust_fish); } @ARGV==1||die; sub pickYear { my $max_year = 10000; my $p = rand(100); if($p < 99) { # Pick w/in 1995-2005 return 1995 + int(rand(10)); #(start of 1995-end of 2004) } my $year = int(rand($max_year - 10)); if($year >= 1995) { $year += 10; } return $year; } # Magic incantations: # ./database-generator.pl --test-supp-rng | sort -k1,1n | uniq -c | tac # ./database-generator.pl --test-cust-rng | sort -k1,1n | uniq -c | tac # ./database-generator.pl --test-year-rng | sort -k1,1n | uniq -c | tac # ./database-generator.pl --test-week-rng | sort -k1,1n | uniq -c | tac sub pickWeek { my $p = rand(100); if($p < 20) { # christmas if(rand(1) < 0.5) { return 50; } return 51; } $p -= 20; if($p < 20) { # mother's day if(rand(1) < 0.5) { return 18; } return 19; } my $week = int(rand(52-4)); if($week > 17) { $week += 2; } if($week > 49) { warn("Invalid week!!!"); } return $week; } sub pickDay { my $p = rand(100); if($p < 99) { return int(rand(5)); } else { return 5+int(rand(2)); } } sub choosePart { my $SF = shift || die "Expected scale factor"; # TPC-H calls for SF * 200,000 for part range, but has a concept # of part suppliers, w/ 4 suppliers per part. We treat # (part_id,supplier_id) as a single key here. my $p = int(rand($SF * 800000)); return $p; } sub pricePart { my $partkey = shift; ## Mult tpc-h formula by 100 since we don't support floating point columuns return 100*((90000 + (($partkey/10) % 20001) + 100 * ($partkey % 1000))/100); } my %partSourceCountry; sub suppliercountryPart { my $p = shift; if(!defined($partSourceCountry{$p})) { $partSourceCountry{$p} = pickSupplierCountry(); } return $partSourceCountry{$p} } sub chooseQuantity { return int(rand(50))+1; } my $nextOrderNum = 0; sub chooseOrderNum { my $n = $nextOrderNum; $nextOrderNum += 4; return $n; } sub choosePartcount { return 1 + int(rand(7)); } my $nextlineNum = 0; sub chooseLineitemNum { my $n = $nextlineNum; $nextlineNum += 4; return $n; } my $SF = 1; sub extendedpricePriceQuantity { my $p = shift; my $q = shift; return $p * $q; } if($ARGV[0] eq "--test-supp-rng") { for(my $i = 0; $i < 10000; $i++) { print (pickSupplierCountry()."\n"); } } elsif($ARGV[0] eq "--test-cust-rng") { for(my $i = 0; $i < 10000; $i++) { print (pickCustomerCountry()."\n"); } } elsif($ARGV[0] eq "--test-year-rng") { for(my $i = 0; $i < 1000000; $i++) { print (pickYear()."\n"); } } elsif($ARGV[0] eq "--test-week-rng") { for(my $i = 0; $i < 10000; $i++) { print (pickWeek()."\n"); } } elsif($ARGV[0] eq "--test-day-rng") { for(my $i = 0; $i < 10000; $i++) { print (pickDay()."\n"); } } elsif($ARGV[0] eq "--test-part-rng") { for(my $i = 0; $i < 10000; $i++) { my $part = choosePart($SF); print("$part\t".pricePart($part)."\t"); print(suppliercountryPart($part)."\t".suppliercountryPart($part)."\n"); } } elsif($ARGV[0] eq "--test-quant-rng") { for(my $i = 0; $i < 10000; $i++) { print(chooseQuantity()."\n"); } } elsif($ARGV[0] eq "--test-order-rng") { for(my $i = 0; $i < 10000; $i++) { print(chooseOrderNum()."\n"); } } elsif($ARGV[0] eq "--test-line-rng") { for(my $i = 0; $i < 10000; $i++) { print(chooseLineitemNum()."\n"); } } elsif($ARGV[0] =~ /^--sf=([0-9.]+)/o) { $SF = $1; # generating projection of natural join of lineitem table and others. # preallocate array to hold temporary tuples my @tups; my $LINEITEM = 0; my $PARTKEY = 1; my $QUANTITY = 2; my $PARTPRICE = 3; my $SRCNAT = 4; my $EXTPRICE = 5; for(my $i = 0; $i < ($SF * 6000000.0);) { # This is the schema: #l_partkey l_extendedprice o_quantity o_totalprice o_orderdate_wk o_orderdate_dayofwk o_orderdate_yr s_nationkey c_nationkey my $partcount = choosePartcount(); my $totalprice = 0; my $week = pickWeek(); my $day = pickDay(); my $year = pickYear(); my $cust_nation = pickCustomerCountry(); for(my $j = 0; $j < $partcount; $j++) { #my $lineitem = chooseLineitemNum(); my $partkey = choosePart($SF); my $quantity = chooseQuantity(); my $partprice = pricePart($partkey); my $src_nation = suppliercountryPart($partkey); my $extendedprice = extendedpricePriceQuantity($partprice, $quantity); my $totalprice += $extendedprice; #$tups[$j][$LINEITEM] = $lineitem; $tups[$j][$PARTKEY] = $partkey; $tups[$j][$QUANTITY] = $quantity; $tups[$j][$PARTPRICE] = $partprice; $tups[$j][$SRCNAT] = $src_nation; $tups[$j][$EXTPRICE] = $extendedprice; #push @tups, \@tup; $i++; } my $p = rand(100); my $deliver_time = 1+int(rand(14)); if($p < 99) { for(my $j = 0; $j < $partcount; $j++) { print("add,$i,$tups[$j][$PARTKEY],$tups[$j][$PARTPRICE],$tups[$j][$QUANTITY],$totalprice,$week,$day,$year,$tups[$j][$SRCNAT],$cust_nation,0\n"); my ($delday, $delweek, $delyear) = addtime($day,$week,$year,($j == 0 ? $deliver_time : (1+int(rand($deliver_time))))); print("deliver,$i,$tups[$j][$PARTKEY],$tups[$j][$PARTPRICE],$tups[$j][$QUANTITY],$totalprice,$delweek,$delday,$delyear,$tups[$j][$SRCNAT],$cust_nation,1\n"); } } else { my $rollbackidx = int(rand($partcount)); for(my $j = 0; $j < $partcount; $j++) { print("add,$i,$tups[$j][$PARTKEY],$tups[$j][$PARTPRICE],$tups[$j][$QUANTITY],$totalprice,$week,$day,$year,$tups[$j][$SRCNAT],$cust_nation,0\n"); my ($delday, $delweek, $delyear) = addtime($day,$week,$year,$deliver_time); print("delete,$i,$tups[$j][$PARTKEY],$tups[$j][$PARTPRICE],$tups[$j][$QUANTITY],$totalprice,$delweek,$delday,$delyear,$tups[$j][$SRCNAT],$cust_nation,0\n"); } } my $order_status_count = 1+int(rand(4)); my @status_time; $p = rand(100); if($p < 50) { for(my $j = 0; $j < $order_status_count; $j++) { push @status_time, int(rand($deliver_time * 1.3)); } foreach my $d (sort @status_time) { # ($d >= 0) || die; my ($statday, $statweek, $statyear) = addtime($day, $week, $year, $d); my $off = int(rand($partcount)); # ($statday + 7 * ($statweek + 52 * $statyear) >= # $day + 7 * ($week + 52 * $year)) || die "$statday $statweek $statyear < $day $week $year ($d)"; print("status,$i,$tups[$off][$PARTKEY],$tups[$off][$PARTPRICE],$tups[$off][$QUANTITY],$totalprice,$statweek,$statday,$statyear,$tups[$off][$SRCNAT],$cust_nation,0\n"); } } } }