Add MSC charts to help explain BAD-looking trim race

This commit is contained in:
Scott Lystig Fritchie 2014-02-24 18:24:07 +09:00
parent d077148b47
commit 13e15e0ecf
4 changed files with 189 additions and 0 deletions

View file

@ -0,0 +1,23 @@
## read-repair-race.1.
First attempt at using "mscgen" to make some Message Sequence
Chart (MSC) for a race found at commit 087c2605ab.
## read-repair-race.2.
Second attempt. This is almost exactly the trace that is
generated by this failing test case at commit 087c2605ab:
C2 = [{1,2,1},{{[{set,{var,1},{call,corfurl_pulse,setup,[1,2,1,standard]}}],[[{set,{var,3},{call,corfurl_pulse,append,[{var,1},<<0>>]}}],[{set,{var,2},{call,corfurl_pulse,read_approx,[{var,1},6201864198]}},{set,{var,5},{call,corfurl_pulse,append,[{var,1},<<0>>]}}],[{set,{var,4},{call,corfurl_pulse,append,[{var,1},<<0>>]}},{set,{var,6},{call,corfurl_pulse,trim,[{var,1},510442857]}}]]},{25152,1387,78241}},[{events,[[{no_bad_reads,[]}]]}]].
eqc:check(corfurl_pulse:prop_pulse(), C2).
## read-repair-race.2b.*
Same basic condition as read-repair-race.2, but edited
substantially to make it clearer what is happening.
Also for commit 087c2605ab.
I believe that I have a fix for the silver-colored
`error-overwritten`, but the correctness of it remains to be seen.

View file

@ -0,0 +1,49 @@
msc {
"<0.12583.0>" [label="Client1"], "<0.12574.0>" [label="FLU1"], "<0.12575.0>" [label="FLU2"], "<0.12576.0>" [label="FLU3"], "<0.12584.0>" [label="Client2"], "<0.12585.0>" [label="Client3"];
"<0.12585.0>" -> "<0.12576.0>" [ label = "{read,1,1}" ] ;
"<0.12583.0>" -> "<0.12574.0>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.12576.0>" -> "<0.12585.0>" [ label = "error_unwritten" ] ;
"<0.12585.0>" abox "<0.12585.0>" [ label="Read Repair starts", textbgcolour="yellow"];
"<0.12585.0>" -> "<0.12574.0>" [ label = "{read,1,1}" ] ;
"<0.12574.0>" -> "<0.12583.0>" [ label = "ok" ] ;
"<0.12583.0>" -> "<0.12575.0>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.12574.0>" -> "<0.12585.0>" [ label = "{ok,<<0>>}" ,textcolour="red"] ;
"<0.12585.0>" -> "<0.12575.0>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.12575.0>" -> "<0.12585.0>" [ label = "ok" ] ;
"<0.12585.0>" -> "<0.12576.0>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.12575.0>" -> "<0.12583.0>" [ label = "error_overwritten" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Race with read repair? Read to double-check", textbgcolour="yellow" ] ;
"<0.12583.0>" -> "<0.12575.0>" [ label = "{read,1,1}" ] ;
"<0.12576.0>" -> "<0.12585.0>" [ label = "ok" ] ;
"<0.12585.0>" abox "<0.12585.0>" [ label="Read Repair SUCCESS", textbgcolour="green"];
"<0.12585.0>" abox "<0.12585.0>" [ label="Our problem: the PULSE model never believes that append_page ever wrote LPN 1", textcolour="red"];
"<0.12584.0>" abox "<0.12584.0>" [ label = "Client2 decides to trim LPN 1", textbgcolour="orange" ] ;
"<0.12584.0>" -> "<0.12574.0>" [ label = "{trim,1,1}" ] ;
"<0.12575.0>" -> "<0.12583.0>" [ label = "{ok,<<0>>}"] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Value matches, yay!", textbgcolour="yellow" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Continue writing", textbgcolour="yellow" ] ;
"<0.12583.0>" -> "<0.12576.0>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.12574.0>" -> "<0.12584.0>" [ label = "ok" ] ;
"<0.12584.0>" -> "<0.12575.0>" [ label = "{trim,1,1}" ] ;
"<0.12576.0>" -> "<0.12583.0>" [ label = "error_overwritten" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Race with read repair? Read to double-check", textbgcolour="yellow" ] ;
"<0.12583.0>" -> "<0.12576.0>" [ label = "{read,1,1}" ] ;
"<0.12575.0>" -> "<0.12584.0>" [ label = "ok" ] ;
"<0.12584.0>" -> "<0.12576.0>" [ label = "{trim,1,1}" ] ;
"<0.12576.0>" -> "<0.12584.0>" [ label = "ok" ] ;
"<0.12576.0>" -> "<0.12583.0>" [ label = "error_trimmed" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Value MISMATCH!", textcolour="red" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Read repair", textbgcolour="yellow" ] ;
"<0.12583.0>" -> "<0.12574.0>" [ label = "{read,1,1}" ] ;
"<0.12574.0>" -> "<0.12583.0>" [ label = "error_trimmed" ] ;
"<0.12583.0>" -> "<0.12575.0>" [ label = "{fill,1,1}" ] ;
"<0.12575.0>" -> "<0.12583.0>" [ label = "error_trimmed" ] ;
"<0.12583.0>" -> "<0.12576.0>" [ label = "{fill,1,1}" ] ;
"<0.12576.0>" -> "<0.12583.0>" [ label = "error_trimmed" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "At this point, we give up on LPN 1.", textcolour="red" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Sequencer gives us LPN 2", textbgcolour="yellow" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "LPN 2 has been filled (not shown).", textbgcolour="yellow" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "Sequencer gives us LPN 3", textbgcolour="yellow" ] ;
"<0.12583.0>" abox "<0.12583.0>" [ label = "We write LPN 3 successfully", textbgcolour="green" ] ;
}

View file

@ -0,0 +1,60 @@
msc {
"<0.32555.4>" [label="Client1"], "<0.32551.4>" [label="FLU1"], "<0.32552.4>" [label="FLU2"], "<0.32556.4>" [label="Client2"], "<0.32557.4>" [label="Client3"];
"<0.32555.4>" abox "<0.32555.4>" [ label = "Writer", textbgcolour="orange"],
"<0.32556.4>" abox "<0.32556.4>" [ label = "Reader", textbgcolour="orange"],
"<0.32557.4>" abox "<0.32557.4>" [ label = "Trimmer", textbgcolour="orange"];
"<0.32555.4>" abox "<0.32555.4>" [ label = "append_page()", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Sequencer assigns LPN 1", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32551.4>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 1)", textbgcolour="yellow"] ;
"<0.32556.4>" -> "<0.32552.4>" [ label = "{read,1,1}" ] ;
"<0.32552.4>" -> "<0.32556.4>" [ label = "error_unwritten" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "Start read repair", textbgcolour="aqua"] ;
"<0.32556.4>" -> "<0.32551.4>" [ label = "{read,1,1}" ] ;
"<0.32551.4>" -> "<0.32555.4>" [ label = "ok" ] ;
"<0.32551.4>" -> "<0.32556.4>" [ label = "{ok,<<0>>}" ] ;
"<0.32556.4>" -> "<0.32552.4>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.32557.4>" -> "<0.32551.4>" [ label = "{trim,1,1}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "error_overwritten" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Our attempt to write LPN 1 is interrupted", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Check if an eager read-repair has written our data for us.", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{read,1,1}" ] ;
"<0.32551.4>" -> "<0.32557.4>" [ label = "ok" ] ;
"<0.32552.4>" -> "<0.32556.4>" [ label = "ok" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "End read repair", textbgcolour="aqua"] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 1) -> {ok, <<0>>}", textbgcolour="yellow"] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "See red stuff at bottom....", textcolour="red"] ;
# "<0.32556.4>" abox "<0.32556.4>" [ label = "But PULSE thinks that LPN 1 was never written.", textcolour="red"] ;
# "<0.32556.4>" abox "<0.32556.4>" [ label = "Fixing this requires ... lots of pondering...", textcolour="red"] ;
"<0.32557.4>" -> "<0.32552.4>" [ label = "{trim,1,1}" ] ;
"<0.32552.4>" -> "<0.32557.4>" [ label = "ok" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "error_trimmed" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Wow, an eager trimmer got us, ouch.", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Start read repair", textbgcolour="aqua"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Read repair here is for sanity checking, not really necessary.", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32551.4>" [ label = "{read,1,1}" ] ;
"<0.32551.4>" -> "<0.32555.4>" [ label = "error_trimmed" ] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{fill,1,1}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "error_trimmed" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "End read repair", textbgcolour="aqua"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Our attempt to write LPN 1 has failed. Must ask sequencer for a new LPN.", textbgcolour="yellow"] ;
"<0.32551.4>" abox "<0.32552.4>" [ label = "LPN 2 is written (race details omitted)", textbgcolour="orange"] ;
"<0.32551.4>" abox "<0.32552.4>" [ label = "LPN 3 is written (race details omitted)", textbgcolour="orange"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Sequencer assigns LPN 4", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32551.4>" [ label = "{write,1,4,<<0>>}" ] ;
"<0.32551.4>" -> "<0.32555.4>" [ label = "ok" ] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{write,1,4,<<0>>}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "ok" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "append_page() -> LPN 4", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32557.4>" [ label="Small problem: the PULSE model never believes that append_page ever wrote LPN 1", textcolour="red"];
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 1)", textbgcolour="yellow"] ;
"<0.32556.4>" -> "<0.32552.4>" [ label = "{read,1,4}" ] ;
"<0.32552.4>" -> "<0.32556.4>" [ label = "{ok,<<0>>}" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 4) -> {ok, <<0>>}", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32557.4>" [ label="Big problem: Client2 has witnessed the same page written at LPN 1 and at LPN 4.", textcolour="red"];
"<0.32555.4>" abox "<0.32557.4>" [ label="", textcolour="red"];
"<0.32555.4>" abox "<0.32557.4>" [ label="", textcolour="red"];
}

View file

@ -0,0 +1,57 @@
msc {
"<0.32555.4>" [label="Client1"], "<0.32551.4>" [label="FLU1=Head"], "<0.32552.4>" [label="FLU2=Tail"], "<0.32556.4>" [label="Client2"], "<0.32557.4>" [label="Client3"];
"<0.32555.4>" abox "<0.32555.4>" [ label = "Writer", textbgcolour="orange"],
"<0.32556.4>" abox "<0.32556.4>" [ label = "Reader", textbgcolour="orange"],
"<0.32557.4>" abox "<0.32557.4>" [ label = "Trimmer", textbgcolour="orange"];
"<0.32555.4>" abox "<0.32555.4>" [ label = "append_page()", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Sequencer assigns LPN 1", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32551.4>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.32551.4>" -> "<0.32555.4>" [ label = "ok" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 1)", textbgcolour="yellow"] ;
"<0.32556.4>" -> "<0.32552.4>" [ label = "{read,1,1}" ] ;
"<0.32552.4>" -> "<0.32556.4>" [ label = "error_unwritten" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "Start read repair", textbgcolour="aqua"] ;
"<0.32556.4>" -> "<0.32551.4>" [ label = "{read,1,1}" ] ;
"<0.32551.4>" -> "<0.32556.4>" [ label = "{ok,<<0>>}" ] ;
"<0.32556.4>" -> "<0.32552.4>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.32552.4>" -> "<0.32556.4>" [ label = "ok" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "End read repair", textbgcolour="aqua"] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 1) -> {ok, <<0>>}", textbgcolour="yellow"] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "See red stuff at bottom....", textcolour="red"] ;
# "<0.32556.4>" abox "<0.32556.4>" [ label = "But PULSE thinks that LPN 1 was never written.", textcolour="red"] ;
# "<0.32556.4>" abox "<0.32556.4>" [ label = "Fixing this requires ... lots of pondering...", textcolour="red"] ;
"<0.32557.4>" -> "<0.32551.4>" [ label = "{trim,1,1}" ] ;
"<0.32551.4>" -> "<0.32557.4>" [ label = "ok" ] ;
"<0.32557.4>" -> "<0.32552.4>" [ label = "{trim,1,1}" ] ;
"<0.32552.4>" -> "<0.32557.4>" [ label = "ok" ] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{write,1,1,<<0>>}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "error_overwritten", textbgcolour="silver" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Our attempt to write LPN 1 is interrupted", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Check if an eager read-repair has written our data for us.", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{read,1,1}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "error_trimmed" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Wow, an eager trimmer got us, ouch.", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Start read repair", textbgcolour="aqua"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Read repair here is for sanity checking, not really necessary.", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32551.4>" [ label = "{read,1,1}" ] ;
"<0.32551.4>" -> "<0.32555.4>" [ label = "error_trimmed" ] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{fill,1,1}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "error_trimmed" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "End read repair", textbgcolour="aqua"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Our attempt to write LPN 1 has failed. Must ask sequencer for a new LPN.", textbgcolour="yellow"] ;
"<0.32551.4>" abox "<0.32552.4>" [ label = "LPN 2 and 3 are written (race details omitted)", textbgcolour="orange"] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "Sequencer assigns LPN 4", textbgcolour="yellow"] ;
"<0.32555.4>" -> "<0.32551.4>" [ label = "{write,1,4,<<0>>}" ] ;
"<0.32551.4>" -> "<0.32555.4>" [ label = "ok" ] ;
"<0.32555.4>" -> "<0.32552.4>" [ label = "{write,1,4,<<0>>}" ] ;
"<0.32552.4>" -> "<0.32555.4>" [ label = "ok" ] ;
"<0.32555.4>" abox "<0.32555.4>" [ label = "append_page() -> LPN 4", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32557.4>" [ label="Small problem: the PULSE model never believes that append_page ever wrote LPN 1", textcolour="red"];
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 1)", textbgcolour="yellow"] ;
"<0.32556.4>" -> "<0.32552.4>" [ label = "{read,1,4}" ] ;
"<0.32552.4>" -> "<0.32556.4>" [ label = "{ok,<<0>>}" ] ;
"<0.32556.4>" abox "<0.32556.4>" [ label = "read_page(LPN 4) -> {ok, <<0>>}", textbgcolour="yellow"] ;
"<0.32555.4>" abox "<0.32557.4>" [ label="Big problem: Client2 has witnessed the same page written at LPN 1 and at LPN 4.", textcolour="red"];
}