Fix various problems with repair not being aware of inner projections

This commit is contained in:
Scott Lystig Fritchie 2015-07-20 16:25:42 +09:00
parent 319397ecd2
commit 88d3228a4c
2 changed files with 78 additions and 51 deletions

View file

@ -214,6 +214,7 @@ init({MyName, InitMembersDict, MgrOpts}) ->
Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end,
RunEnv = [{seed, Opt(seed, now())},
{use_partition_simulator, Opt(use_partition_simulator, false)},
{simulate_repair, Opt(simulate_repair, true)},
{network_partitions, Opt(network_partitions, [])},
{network_islands, Opt(network_islands, [])},
{up_nodes, Opt(up_nodes, not_init_yet)}],
@ -597,6 +598,7 @@ calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj,
end,
Repairing_list2 = [X || X <- OldRepairing_list, lists:member(X, Up)],
Simulator_p = proplists:get_value(use_partition_simulator, RunEnv2, false),
SimRepair_p = proplists:get_value(simulate_repair, RunEnv2, true),
{NewUPI_list3, Repairing_list3, RunEnv3} =
case {NewUp, Repairing_list2} of
{[], []} ->
@ -613,11 +615,11 @@ calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj,
SameEpoch_p = check_latest_private_projections_same_epoch(
NewUPI_list ++ Repairing_list2,
S#ch_mgr.proj, Partitions, S),
if Simulator_p andalso SameEpoch_p
andalso RelativeToServer == LastInCurrentUPI ->
if Simulator_p andalso SimRepair_p andalso
SameEpoch_p andalso RelativeToServer == LastInCurrentUPI ->
D_foo=[{repair_airquote_done, {we_agree, (S#ch_mgr.proj)#projection_v1.epoch_number}}],
{NewUPI_list ++ [H], T, RunEnv2};
not Simulator_p
not (Simulator_p andalso SimRepair_p)
andalso
RepairFS == {repair_final_status, ok} ->
D_foo=[{repair_done, {repair_final_status, ok, (S#ch_mgr.proj)#projection_v1.epoch_number}}],
@ -1338,7 +1340,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
{flap_limit, FlapLimit}]}),
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
true ->
?V("{FLAP: ~w flaps ~w}! ", [S#ch_mgr.name, P_newprop_flap_count]);
ok; %% ?V("{FLAP: ~w flaps ~w}! ", [S#ch_mgr.name, P_newprop_flap_count]);
_ ->
ok
end,
@ -2102,66 +2104,85 @@ gobble_calls(StaticCall) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
perhaps_start_repair(
#ch_mgr{name=MyName,
repair_worker=undefined,
proj=#projection_v1{creation_time=Start,
upi=[_|_]=UPI,
repairing=[_|_]}}=S) ->
RepairId = {MyName, os:timestamp()},
RepairOpts = [{repair_mode, repair}, verbose, {repair_id, RepairId}],
%% RepairOpts = [{repair_mode, check}, verbose],
RepairFun = fun() -> do_repair(S, RepairOpts, ap_mode) end,
LastUPI = lists:last(UPI),
IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time,
S#ch_mgr.opts, false),
case timer:now_diff(os:timestamp(), Start) div 1000000 of
N when MyName == LastUPI andalso
(IgnoreStabilityTime_p orelse
N >= ?REPAIR_START_STABILITY_TIME) ->
{WorkerPid, _Ref} = spawn_monitor(RepairFun),
S#ch_mgr{repair_worker=WorkerPid,
repair_start=os:timestamp(),
repair_final_status=undefined};
perhaps_start_repair(#ch_mgr{name=MyName,
repair_worker=undefined,
proj=P_current}=S) ->
case inner_projection_or_self(P_current) of
#projection_v1{creation_time=Start,
upi=[_|_]=UPI,
repairing=[_|_]} ->
RepairId = {MyName, os:timestamp()},
RepairOpts = [{repair_mode,repair}, verbose, {repair_id,RepairId}],
%% RepairOpts = [{repair_mode, check}, verbose],
RepairFun = fun() -> do_repair(S, RepairOpts, ap_mode) end,
LastUPI = lists:last(UPI),
IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time,
S#ch_mgr.opts, false),
case timer:now_diff(os:timestamp(), Start) div 1000000 of
N when MyName == LastUPI andalso
(IgnoreStabilityTime_p orelse
N >= ?REPAIR_START_STABILITY_TIME) ->
{WorkerPid, _Ref} = spawn_monitor(RepairFun),
S#ch_mgr{repair_worker=WorkerPid,
repair_start=os:timestamp(),
repair_final_status=undefined};
_ ->
S
end;
_ ->
S
end;
perhaps_start_repair(S) ->
S.
do_repair(
#ch_mgr{name=MyName,
proj=#projection_v1{upi=UPI,
repairing=[_|_]=Repairing,
members_dict=MembersDict}}=_S_copy,
Opts, ap_mode=RepairMode) ->
?V("RePaiR-~w,", [self()]),
T1 = os:timestamp(),
RepairId = proplists:get_value(repair_id, Opts, id1),
error_logger:info_msg("Repair start: tail ~p of ~p -> ~p, ~p ID ~w\n",
[MyName, UPI, Repairing, RepairMode, RepairId]),
do_repair(#ch_mgr{name=MyName,
proj=#projection_v1{upi=UPI,
repairing=[_|_]=Repairing,
members_dict=MembersDict}}=S,
Opts, ap_mode=RepairMode) ->
ETS = ets:new(repair_stats, [private, set]),
ETS_T_Keys = [t_in_files, t_in_chunks, t_in_bytes,
t_out_files, t_out_chunks, t_out_bytes,
t_bad_chunks, t_elapsed_seconds],
[ets:insert(ETS, {K, 0}) || K <- ETS_T_Keys],
Res = machi_chain_repair:repair(ap_mode, MyName, Repairing, UPI,
MembersDict, ETS, Opts),
T2 = os:timestamp(),
Elapsed = (timer:now_diff(T2, T1) div 1000) / 1000,
ets:insert(ETS, {t_elapsed_seconds, Elapsed}),
Summary = case Res of ok -> "success";
{ok, MyProj} = ?FLU_PC:read_latest_projection(proxy_pid(MyName, S),
private),
MyEpochID = machi_projection:get_epoch_id(MyProj),
RepairEpochIDs = [case ?FLU_PC:read_latest_projection(proxy_pid(Rep, S),
private) of
{ok, Proj} ->
machi_projection:get_epoch_id(Proj);
_ ->
unknown
end || Rep <- Repairing],
case lists:usort(RepairEpochIDs) of
[MyEpochID] ->
T1 = os:timestamp(),
RepairId = proplists:get_value(repair_id, Opts, id1),
error_logger:info_msg(
"Repair start: tail ~p of ~p -> ~p, ~p ID ~w\n",
[MyName, UPI, Repairing, RepairMode, RepairId]),
Res = machi_chain_repair:repair(ap_mode, MyName, Repairing, UPI,
MembersDict, ETS, Opts),
T2 = os:timestamp(),
Elapsed = (timer:now_diff(T2, T1) div 1000) / 1000,
ets:insert(ETS, {t_elapsed_seconds, Elapsed}),
Summary = case Res of ok -> "success";
_ -> "FAILURE"
end,
Stats = [{K, ets:lookup_element(ETS, K, 2)} || K <- ETS_T_Keys],
error_logger:info_msg("Repair ~s: tail ~p of ~p finished ~p repair ID ~w: "
"~p\nStats ~p\n",
[Summary, MyName, UPI, RepairMode, RepairId,
Res, Stats]),
ets:delete(ETS),
exit({repair_final_status, Res}).
end,
Stats = [{K, ets:lookup_element(ETS, K, 2)} || K <- ETS_T_Keys],
error_logger:info_msg(
"Repair ~s: tail ~p of ~p finished ~p repair ID ~w: "
"~p\nStats ~p\n",
[Summary, MyName, UPI, RepairMode, RepairId,
Res, Stats]),
ets:delete(ETS),
exit({repair_final_status, Res});
_ ->
exit(not_all_in_same_epoch)
end.
sanitize_repair_state(#ch_mgr{repair_final_status=Res,
proj=#projection_v1{upi=[_|_]}}=S)

View file

@ -29,6 +29,7 @@
update_checksum/1,
update_dbg2/2,
compare/2,
get_epoch_id/1,
make_summary/1,
make_members_dict/1
]).
@ -138,6 +139,11 @@ compare(#projection_v1{epoch_number=E1},
E1 > E2 -> 1
end.
%% @doc Return the epoch_id of the projection.
get_epoch_id(#projection_v1{epoch_number=Epoch, epoch_csum=CSum}) ->
{Epoch, CSum}.
%% @doc Create a proplist-style summary of a projection record.
make_summary(#projection_v1{epoch_number=EpochNum,