machi/src/machi_chain_repair.erl

384 lines
17 KiB
Erlang
Raw Normal View History

%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc Erlang API for the Machi FLU TCP protocol version 1, with a
%% proxy-process style API for hiding messy details such as TCP
%% connection/disconnection with the remote Machi server.
%%
%% Machi is intentionally avoiding using distributed Erlang for
%% Machi's communication. This design decision makes Erlang-side code
%% more difficult & complex, but it's the price to pay for some
%% language independence. Later in Machi's life cycle, we need to
%% (re-)implement some components in a non-Erlang/BEAM-based language.
%%
%% This module implements a "man in the middle" proxy between the
%% Erlang client and Machi server (which is on the "far side" of a TCP
%% connection to somewhere). This proxy process will always execute
%% on the same Erlang node as the Erlang client that uses it. The
%% proxy is intended to be a stable, long-lived process that survives
%% TCP communication problems with the remote server.
-module(machi_chain_repair).
-include("machi_projection.hrl").
-define(SHORT_TIMEOUT, 5*1000).
-define(LONG_TIMEOUT, 60*1000).
-define(MAX_OFFSET, 999*1024*1024*1024*1024*1024*1024*1024).
%% These macros assume there's a bound variable called Verb.
2015-05-12 13:42:03 +00:00
-define(VERB(Fmt), if Verb -> io:format(Fmt ); true -> ok end).
-define(VERB(Fmt, Args), if Verb -> io:format(Fmt, Args); true -> ok end).
2015-05-14 05:04:31 +00:00
-ifdef(TEST).
-compile(export_all).
-endif. % TEST
-export([repair/7]).
repair_cp(_Src, _Dst, _MembersDict, _Opts) ->
%% TODO: add missing function: wipe away any trace of chunks
%% are present on Dst but missing on Src.
exit(todo_cp_mode).
repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
%% Use process dict so that 'after' clause can always quit all
%% proxy pids.
put(proxies_dict, orddict:new()),
Add = fun(Name, Pid) -> put(proxies_dict, orddict:store(Name, Pid, get(proxies_dict))) end,
OurFLUs = lists:usort([Src] ++ Repairing ++ UPI), % AP assumption!
RepairMode = proplists:get_value(repair_mode, Opts, repair),
Verb = proplists:get_value(verbose, Opts, true),
Res = try
[begin
{ok, Proxy} = machi_proxy_flu1_client:start_link(P),
Add(FLU, Proxy)
end || {FLU,P} <- MembersDict, lists:member(FLU, OurFLUs)],
ProxiesDict = get(proxies_dict),
D = dict:new(),
D2 = lists:foldl(fun({FLU, Proxy}, Dict) ->
get_file_lists(Proxy, FLU, Dict)
end, D, ProxiesDict),
MissingFileSummary = make_missing_file_summary(D2, OurFLUs),
?VERB("MissingFileSummary ~p\n", [MissingFileSummary]),
[ets:insert(ETS, {{directive_bytes, FLU}, 0}) || FLU <- OurFLUs],
%% Repair files from perspective of Src, i.e. tail(UPI).
SrcProxy = orddict:fetch(Src, ProxiesDict),
{ok, EpochID} = machi_proxy_flu1_client:get_epoch_id(
SrcProxy, ?SHORT_TIMEOUT),
?VERB("Make repair directives: "),
Ds =
[{File, make_repair_directives(
ConsistencyMode, RepairMode, File, Size, EpochID,
Verb,
Src, OurFLUs, ProxiesDict, ETS)} ||
{File, {Size, _MissingList}} <- MissingFileSummary],
?VERB(" done\n"),
[begin
[{_, Bytes}] = ets:lookup(ETS, {directive_bytes, FLU}),
?VERB("Out-of-sync data for FLU ~p: ~s MBytes\n",
[FLU, mbytes(Bytes)])
end || FLU <- OurFLUs],
?VERB("Execute repair directives: "),
ok = execute_repair_directives(ConsistencyMode, Ds, Src, EpochID,
Verb, OurFLUs, ProxiesDict, ETS),
?VERB(" done\n"),
ok
catch
What:Why ->
Stack = erlang:get_stacktrace(),
{error, {What, Why, Stack}}
after
[(catch machi_proxy_flu1_client:quit(Pid)) ||
Pid <- orddict:to_list(get(proxies_dict))]
end,
Res.
%% Create a list of servers where the file is completely missing.
%% In the "demo day" implementation and in an early integration WIP,
%% this was a useful thing. TODO: Can this be removed?
make_missing_file_summary(Dict, AllFLUs) ->
%% FileFilterFun = fun(_) -> true end,
FoldRes = lists:sort(dict:to_list(Dict)),
%% NOTE: MissingFileSummary = [{File, {FileSize, ServersMissingFrom}}]
MissingFileSummary =
[begin
{GotIt, Sizes} = lists:unzip(GotSizes),
Size = lists:max(Sizes),
Missing = {File, {Size, AllFLUs -- GotIt}},
Missing
end || {File, GotSizes} <- FoldRes %% , FileFilterFun(File)
],
MissingFileSummary.
get_file_lists(Proxy, FLU_name, D) ->
{ok, Res} = machi_proxy_flu1_client:list_files(Proxy, ?DUMMY_PV1_EPOCH,
?SHORT_TIMEOUT),
lists:foldl(fun({Size, File}, Dict) ->
dict:append(File, {FLU_name, Size}, Dict)
end, D, Res).
%% Wow, it's so easy to bikeshed this into a 1 year programming exercise.
%%
%% TODO: There are a lot of areas for exploiting parallelism here.
%% I've set the bikeshed aside for now, but "make repair faster" has a
%% lot of room for exploiting concurrency, overlapping reads & writes,
%% etc etc. There are also lots of different trade-offs to make with
%% regard to RAM use vs. disk use.
%%
2015-05-12 13:42:03 +00:00
%% TODO: There's no reason why repair can't be done 1).in parallel
%% across multiple repairees, and/or 2). with multiple byte ranges in
%% the same file, and/or 3). with bigger chunks.
%%
%% 1. Optimization
%% 2. Optimization
%% 3. Optimization, but it would be the easiest to implement, e.g. use
%% constant-sized 4MB chunks. Unfortuntely, it would also destroy
%% the ability to verify here that the chunk checksums are correct
%% *and* also propagate the correct checksum metadata to the
%% destination FLU.
%% As an additional optimization, add a bit of #2 to start the next
%% read while the current write is still in progress.
%%
%% Most/all of this could be executed in parallel on each FLU relative to
%% its own files. Then, in another TODO option, perhaps build a Merkle tree
%% or other summary of the local files & send that data structure to the
%% repair coordinator.
%%
%% Also, as another TODO note, repair_both_present() in the
%% prototype/demo-day code uses an optimization of calculating the MD5
%% checksum of the chunk checksum data as it arrives, and if the two MD5s
%% match, then we consider the two files in sync. If there isn't a match,
%% then we sort the lines and try another MD5, and if they match, then we're
%% in sync. In theory, that's lower overhead than the procedure used here.
%%
%% NOTE that one reason I chose the "directives list" method is to have an
%% option, later, of choosing to repair a subset of repairee FLUs if there
%% is a big discrepency between out of sync files: e.g., if FLU x has N
%% bytes out of sync but FLU y has 50N bytes out of sync, then it's likely
%% better to repair x only so that x can return to the UPI list quickly.
%% Also, in the event that all repairees are roughly comparably out of sync,
%% then the repair network traffic can be minimized by reading each chunk
%% only once.
2015-05-14 05:04:31 +00:00
make_repair_compare_fun(SrcFLU) ->
fun({{Offset_X, _Sz_a, _Cs_a, FLU_a}, _N_a},
{{Offset_X, _Sz_b, _CS_b, FLU_b}, _N_b}) ->
2015-05-14 05:04:31 +00:00
%% The repair source FLU always sorts less/earlier than anything else.
if FLU_a == SrcFLU ->
true;
FLU_b == SrcFLU ->
false;
true ->
%% Implicitly, smallest offset first.
%% Secondarily (and implicitly), sort smallest chunk size first
2015-05-14 05:04:31 +00:00
FLU_a < FLU_b
end;
(T_a, T_b) ->
%% See implicitly comments above
2015-05-14 05:04:31 +00:00
T_a =< T_b
end.
make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
Verb, Src, FLUs0, ProxiesDict, ETS) ->
true = (Size < ?MAX_OFFSET),
FLUs = lists:usort(FLUs0),
C0 = [begin
%% erlang:garbage_collect(),
Proxy = orddict:fetch(FLU, ProxiesDict),
OffSzCs = case machi_proxy_flu1_client:checksum_list(
Proxy, EpochID, File, ?LONG_TIMEOUT) of
{ok, X} -> X;
{error, no_such_file} -> []
end,
[{?MAX_OFFSET, 0, <<>>, FLU}] % our end-of-file marker
++
[{Off, Sz, Cs, FLU} || {Off, Sz, Cs} <- OffSzCs]
end || FLU <- FLUs],
C1 = lists:append(C0),
%% erlang:garbage_collect(),
C2 = lists:sort(make_repair_compare_fun(Src), C1),
%% erlang:garbage_collect(),
Ds = make_repair_directives2(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS),
Ds.
make_repair_directives2(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS) ->
?VERB("."),
make_repair_directives3(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS, []).
make_repair_directives3([{?MAX_OFFSET, 0, <<>>, _FLU}|_Rest],
_ConsistencyMode, _RepairMode,
_File, _Verb, _Src, _FLUs, _ProxiesDict, _ETS, Acc) ->
lists:reverse(Acc);
make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0],
ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS, Acc) ->
{As0, Rest1} = take_same_offset_size(Rest0, Offset, Size),
As = [A|As0],
%% Sanity checking time
case lists:all(fun({_, _, Cs, _}) when Cs == CSum -> true;
(_) -> false
end, As) of
true ->
ok;
false ->
%% TODO: Pathology: someone has the wrong checksum.
%% 1. Fetch Src's chunk. If checksum is valid, use this chunk
%% to repair any invalid value.
%% 2. If Src's chunk is invalid, then check for other copies
%% in the UPI. If there is a valid chunk there, use it to
%% repair any invalid value.
%% 3a. If there is no valid UPI chunk, then delete this
%% byte range from all FLUs
%% 3b. Log big warning about data loss.
%% 4. Log any other checksum discrepencies as they are found.
exit({todo_repair_sanity_check, ?LINE, File, Offset, As})
end,
%% List construction guarantees us that there's at least one ?MAX_OFFSET
%% item remains. Sort order + our "taking" of all exact Offset+Size
%% tuples guarantees that if there's a disagreement about chunk size at
%% this offset, we can look ahead exactly one to see if there is sanity
%% or not.
[{Offset_next, _Size_next, _, _}=A_next|_] = Rest1,
if Offset + Size =< Offset_next ->
ok;
true ->
exit({todo_repair_sanity_check, ?LINE, File, Offset, Size,
next_is, A_next})
end,
Do = if ConsistencyMode == ap_mode ->
Gots = [FLU || {_Off, _Sz, _Cs, FLU} <- As],
Missing = FLUs -- Gots,
_ThisSrc = case lists:member(Src, Gots) of
true -> Src;
false -> hd(Gots)
end,
[ets:update_counter(ETS, {directive_bytes, FLU_m}, Size) ||
FLU_m <- Missing],
if Missing == [] ->
noop;
true ->
{copy, A, Missing}
end;
ConsistencyMode == cp_mode ->
exit({todo_cp_mode, ?MODULE, ?LINE})
end,
Acc2 = if Do == noop -> Acc;
true -> [Do|Acc]
end,
make_repair_directives3(Rest1,
ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS, Acc2).
take_same_offset_size(L, Offset, Size) ->
take_same_offset_size(L, Offset, Size, []).
take_same_offset_size([{Offset, Size, _CSum, _FLU}=A|Rest], Offset, Size, Acc) ->
take_same_offset_size(Rest, Offset, Size, [A|Acc]);
take_same_offset_size(Rest, _Offset, _Size, Acc) ->
{Acc, Rest}.
execute_repair_directives(ap_mode=_ConsistencyMode, Ds, _Src, EpochID, Verb,
_OurFLUs, ProxiesDict, ETS) ->
{_,_,_,_} = lists:foldl(fun execute_repair_directive/2,
{ProxiesDict, EpochID, Verb, ETS}, Ds),
ok.
execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) ->
EtsKeys = [{in_files, t_in_files}, {in_chunks, t_in_chunks},
{in_bytes, t_in_bytes}, {out_files, t_out_files},
{out_chunks, t_out_chunks}, {out_bytes, t_out_bytes}],
[ets:insert(ETS, {L_K, 0}) || {L_K, _T_K} <- EtsKeys],
F = fun({copy, {Offset, Size, CSum, MySrc}, MyDsts}, Acc2) ->
SrcP = orddict:fetch(MySrc, ProxiesDict),
case ets:lookup_element(ETS, in_chunks, 2) rem 100 of
0 -> ?VERB(".", []);
_ -> ok
end,
_T1 = os:timestamp(),
{ok, Chunk} = machi_proxy_flu1_client:read_chunk(
SrcP, EpochID, File, Offset, Size,
?SHORT_TIMEOUT),
_T2 = os:timestamp(),
case machi_util:checksum_chunk(Chunk) of
CSum_now when CSum_now == CSum ->
[begin
DstP = orddict:fetch(DstFLU, ProxiesDict),
_T3 = os:timestamp(),
ok = machi_proxy_flu1_client:write_chunk(
DstP, EpochID, File, Offset, Chunk,
?SHORT_TIMEOUT),
_T4 = os:timestamp()
end || DstFLU <- MyDsts],
ets:update_counter(ETS, in_chunks, 1),
ets:update_counter(ETS, in_bytes, Size),
N = length(MyDsts),
ets:update_counter(ETS, out_chunks, N),
ets:update_counter(ETS, out_bytes, N*Size),
Acc2;
CSum_now ->
error_logger:error_msg(
"TODO: Checksum failure: "
"file ~p offset ~p size ~p: "
"expected ~p got ~p\n",
[File, Offset, Size, CSum, CSum_now]),
case ets:update_counter(ETS, t_bad_chunks, 1) of
N when N > 100 ->
throw(todo_wow_so_many_errors_so_verbose);
_ ->
ok
end,
Acc2
end
end,
ok = lists:foldl(F, ok, Cmds),
%% Copy this file's stats to the total counts.
[ets:update_counter(ETS, T_K, ets:lookup_element(ETS, L_K, 2)) ||
{L_K, T_K} <- EtsKeys],
Acc.
mbytes(0) ->
"0.0";
mbytes(Size) ->
lists:flatten(io_lib:format("~.1.0f", [max(0.1, Size / (1024*1024))])).
2015-05-14 05:04:31 +00:00
-ifdef(TEST).
repair_compare_fun_test() ->
F = make_repair_compare_fun(b),
List = [{{1,10,x,b},y},{{50,10,x,a},y},{{50,10,x,b},y},{{50,10,x,c},y},{{90,10,x,d},y}],
2015-05-14 05:04:31 +00:00
Input = lists:reverse(lists:sort(List)),
%% Although the merge func should never have two of the same FLU
%% represented, it doesn't matter for the purposes of this test.
%% 1. Smaller offset (element #1) wins, else...
%% 2. The FLU (element #2) that's the repair source always wins, else...
%% 3. The FLU with smallest name wins.
Expect = [{{1,10,x,b},y},{{50,10,x,b},y},{{50,10,x,a},y},{{50,10,x,c},y},{{90,10,x,d},y}],
2015-05-14 05:04:31 +00:00
Expect = lists:sort(F, Input).
-endif. % TEST