So, it definitely works, in that it stops a low(er) ranking flapping
process from continuing to make new proposals, so then the cycle of
flapping stops. Whenever an up/down state changes and a new/different
proposal is made, then things immediately resume, yay.
However, there's still a problem of the chain state at this time,
I believe. Here's a session that's damped by the flap counter:
SET always_last_partitions ON ... we should see convergence to correct chains.
21:23:03.170 d uses: [{epoch,457},{author,a},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:03.270 c uses: [{epoch,457},{author,a},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:03.471 a uses: [{epoch,459},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{repair_airquote_done,{we_agree,457}},{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:03.611 b uses: [{epoch,460},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:03.635 d uses: [{epoch,461},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:03.672 c uses: [{epoch,461},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:03.873 a uses: [{epoch,462},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,3}}}]}]
21:23:04.155 d uses: [{epoch,463},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.198 c uses: [{epoch,463},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.270 b uses: [{epoch,464},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.276 a uses: [{epoch,465},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.652 d uses: [{epoch,466},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.660 c uses: [{epoch,466},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.679 a uses: [{epoch,467},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:04.914 b uses: [{epoch,468},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,4}}}]}]
21:23:05.058 d uses: [{epoch,469},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:05.062 c uses: [{epoch,469},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:05.081 a uses: [{epoch,470},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:05.579 b uses: [{epoch,471},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:05.581 d uses: [{epoch,472},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:05.590 c uses: [{epoch,472},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:05.885 a uses: [{epoch,473},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,5}}}]}]
21:23:06.102 d uses: [{epoch,474},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.159 c uses: [{epoch,474},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.250 b uses: [{epoch,475},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.288 a uses: [{epoch,476},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.612 d uses: [{epoch,477},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.620 c uses: [{epoch,477},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.691 a uses: [{epoch,478},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:06.893 b uses: [{epoch,479},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,6}}}]}]
21:23:07.015 d uses: [{epoch,480},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,7}}}]}]
21:23:07.022 c uses: [{epoch,480},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,7}}}]}]
21:23:07.094 a uses: [{epoch,481},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,7}}}]}]
21:23:07.516 d uses: [{epoch,482},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,7}}}]}]
21:23:07.550 b uses: [{epoch,483},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,7}}}]}]
{FLAP: c flaps 4}!
{FLAP: c flaps 5}!
21:23:07.898 a uses: [{epoch,484},{author,a},{upi,[a,d]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,7}}}]}]
21:23:08.010 d uses: [{epoch,485},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,8}}}]}]
21:23:08.013 c uses: [{epoch,485},{author,d},{upi,[a]},{repair,[b,d,c]},{down,[]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,8}}}]}]
21:23:08.221 b uses: [{epoch,486},{author,b},{upi,[b]},{repair,[c,d]},{down,[a]},{d,[{author_proc,react},{ps,[{a,b}]},{nodes_up,[b,c,d]}]},{d2,[{network_islands,[na_reset_by_always]},{hooray,{v2,{2014,11,8},{21,23,8}}}]}]
{FLAP: a flaps 5}!
{FLAP: a flaps 6}!
SET always_last_partitions OFF ... let loose the dogs of war!
21:23:17.349 b uses: [{epoch,495},{author,b},{upi,[b]},{repair,[c,d,a]},{down,[]},{d,[{author_proc,react},{ps,[]},{nodes_up,[a,b,c,d]}]},{d2,[{network_islands,[islands_not_supported]},{hooray,{v2,{2014,11,8},{21,23,17}}}]}]
So, the state of the chains at 21:23:11.221, three seconds after
the flapping detector finished, is:
epoch=484, UPI=[a,d], repair=[c], nodes_up=[a,c,d]
epoch=485, UPI=[a], repair=[b,d,c], nodes_up=[a,b,c,d]
epoch=486, UPI=[b], repair=[c,d], nodes_up=[b,c,d]
The UPIs are overlapping, derp; that won't work, thanks to the magic
of epoch version # enforcement. However, the clients need to concern
themselves with the repairing members, also. As soon as a client
in epoch=486 sends an op to FLU c or FLU d, those nodes will
wedge themselves because they're in a different epoch. Everyone
will get stuck, and then life sucks.
Future work TBD!
So, this is an interesting case where an asymmetric network partition
can cause the current algorithm to cycle for several seconds, then one
participant X becomes less active (I'm not sure why), the other two
participants slowly come to an agreement, then X seems to wake up and
return everyone to the cycle/flapping loop.
SET always_last_partitions ON ... we should see convergence to correct chains.
16:35:03.986 c uses: [{epoch,321},{author,b},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:04.118 b uses: [{epoch,323},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{repair_airquote_done,{we_agree,321}},{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:04.492 c uses: [{epoch,324},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:04.520 b uses: [{epoch,325},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:04.583 a uses: [{epoch,326},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:04.894 c uses: [{epoch,327},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:04.922 b uses: [{epoch,328},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:05.291 a uses: [{epoch,329},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:05.296 c uses: [{epoch,330},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:05.324 b uses: [{epoch,331},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:05.830 c uses: [{epoch,332},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:06.023 a uses: [{epoch,333},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:06.128 b uses: [{epoch,334},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:06.342 c uses: [{epoch,335},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:06.530 b uses: [{epoch,336},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:06.734 a uses: [{epoch,337},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:06.746 c uses: [{epoch,338},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:06.932 b uses: [{epoch,339},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:07.267 c uses: [{epoch,340},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:07.334 b uses: [{epoch,341},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:07.460 a uses: [{epoch,342},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:07.669 c uses: [{epoch,343},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:07.736 b uses: [{epoch,344},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:08.165 a uses: [{epoch,345},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:08.194 c uses: [{epoch,346},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:08.541 b uses: [{epoch,347},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:08.702 c uses: [{epoch,348},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:08.894 a uses: [{epoch,349},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:08.944 b uses: [{epoch,350},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:09.212 c uses: [{epoch,351},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:09.346 b uses: [{epoch,352},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:09.598 a uses: [{epoch,353},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:09.614 c uses: [{epoch,354},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:09.748 b uses: [{epoch,355},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:10.135 c uses: [{epoch,356},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:10.150 b uses: [{epoch,357},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
16:35:10.329 a uses: [{epoch,358},{author,a},{upi,[a]},{repair,[c]},{down,[b]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,c]}]},{d2,[]}]
16:35:10.537 c uses: [{epoch,359},{author,c},{upi,[b]},{repair,[a,c]},{down,[]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[a,b,c]}]},{d2,[]}]
16:35:10.552 b uses: [{epoch,360},{author,b},{upi,[b,c]},{repair,[]},{down,[a]},{d,[{author_proc,react},{ps,[{b,a}]},{nodes_up,[b,c]}]},{d2,[]}]
This is a return to the old, possibly asymmetric/unidirectional network
partition simulation scheme. PULSE testing so far for the
symmetric/bidirectional partitioning scheme (via the "islands" approach)
appears to be very stable, yay.
So, let's go back to the harder environment and see what happens!
This is some brute-force-and-not-subtle hackery, but it looks like I've
got the basis for a test that a model checker (QuickCheck or Concuerror
or something else) can use for a good/bad check.
The following properties are examined (but not enforced):
* At each epoch, are all of the chains disjoint? I.e., no single FLU
is a member of different chains at the same epoch.
This is a safety/sanity check.
* For each unique chain UPI list at each epoch, are all of the FLUs in that
chain unanimous in their agreement:
agreed_membership: all UPI FLUs agree about the UPI list
not_agreed: the membership algorithm has not yet agreed on
the UPI list
This is not a safety/sanity check per se, but it can be useful input
into a good safety check.
Some examples:
* At epoch 0, there is no agreement on UPI membership of the one [a,b,c]
chain.
* At epoch 1, there is full agreement.
* At epoch 4, we're back to no agreement.
* At epoch 17, there's agreement on a small chain with UPI list=[a].
(This agreement continues until epoch 216, but that history is omitted
here.)
[{0,
{ok_disjoint,[{[a,b,c],
not_unique,0,
[<<159,215,105,140,29,151,142,2,162,90,225,209,10,102,119,
193,110,72,75,245>>,
<<213,46,129,248,23,50,210,247,145,68,65,112,232,101,28,56,
239,12,78,227>>,
<<230,146,66,183,10,218,57,29,233,166,108,176,118,109,
226,186,190,56,174,108>>]}]}},
{1,{ok_disjoint,[{agreed_membership,[a,b,c]}]}},
{4,
{ok_disjoint,[{not_unique,[a,b,c],
[not_in_this_epoch,
<<208,227,221,233,254,160,36,134,252,106,
124,192,101,171,168,68,169,55,2,54>>]}]}},
{6,
{ok_disjoint,[{not_unique,[a,b,c],
[not_in_this_epoch,
<<191,47,203,143,195,230,71,162,39,132,188,
128,64,39,18,9,73,148,207,220>>]}]}},
{17,{ok_disjoint,[{agreed_membership,[a]}]}},
{24,{ok_disjoint,[{agreed_membership,[a]}]}},
[...]
Starting at epoch 419, the network stabilized, but not fully,
into two "islands" of servers, a alone and b&c together.
At epoch 486, the network is fully stabilized with the same network
partition. We see rapid convergence to two chains, [a] and [b,c].
{419,{ok_disjoint,[{agreed_membership,[a]}]}},
{425,{ok_disjoint,[{agreed_membership,[b]}]}},
{436,{ok_disjoint,[{agreed_membership,[b]}]}},
{442,{ok_disjoint,[{agreed_membership,[b]}]}},
{444,{ok_disjoint,[{agreed_membership,[b]}]}},
{454,{ok_disjoint,[{agreed_membership,[b]}]}},
{456,{ok_disjoint,[{agreed_membership,[b]}]}},
{458,{ok_disjoint,[{agreed_membership,[b]}]}},
{463,{ok_disjoint,[{agreed_membership,[b]}]}},
{468,{ok_disjoint,[{agreed_membership,[b]}]}},
{479,{ok_disjoint,[{agreed_membership,[b]}]}},
{482,{ok_disjoint,[{agreed_membership,[b]}]}},
{486,{ok_disjoint,[{agreed_membership,[a]}]}},
{488,{ok_disjoint,[{agreed_membership,[b]}]}},
{490,{ok_disjoint,[{agreed_membership,[b,c]}]}},
{492,{ok_disjoint,[{agreed_membership,[b,c]}]}}]
foo
So, this still pops up occasionally:
% rebar skip_deps=true -v eunit suites=machi_flu0_test,machi_chain_manager1
[...]
a private: [{epoch,223},{author,a},{upi,[a,b]},{repair,[]},{down,[c]},{d,[{author_proc,react},{nodes_up,[a,b]}]},{d2,[{up_nodz,[a,b]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}]
b private: [{epoch,224},{author,b},{upi,[b,a]},{repair,[]},{down,[c]},{d,[{author_proc,react},{nodes_up,[a,b]}]},{d2,[{up_nodz,[a,b]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}]
c private: [{epoch,191},{author,c},{upi,[c]},{repair,[]},{down,[a,b]},{d,[{author_proc,react},{nodes_up,[c]}]},{d2,[{up_nodz,[c]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}]
The mis-ordering between [a,b] and [b,a] happens after the partition settled
on the islands of [a,b] and [c].
{c100, ?LINE, _AnyOtherReturnValue} {c100,734,
{err,error,
{badmatch,[a,b]},
from,
[{epoch,70},
{author,a},
{upi,[a]},
{repair,[b]},
{down,[c]},
{d,
[{author_proc,react},
{nodes_up,[a,b]}]},
{d2,[]}],
to,
[{epoch,194},
{author,b},
{upi,[b,a]},
{repair,[]},
{down,[c]},
{d,
[{author_proc,react},
{nodes_up,[a,b]}]},
{d2,[]}],
relative_to,a,stack,[...]
That diagram is really valuable, but it also takes a long time
to make any kind of edit; the process is too slow. This is a todo
item: a reminder that the flowchart is important documentation and
must be brought back into sync with the code soon.