Refactor self-healthcheck

* Trigger GC with lower threshold
* Unconditionally die if queue len is too large
parent 4f30749d
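
Background for the diff below (editorial context, not part of the commit): the check reads its numbers from erlang:process_info/2. A process's `memory` figure does not include the data of off-heap (refc) binaries, so the check adds that back, attributing each shared binary to the process proportionally to its reference count. A minimal sketch of that measurement using only standard BIFs, e.g. pasted into an Erlang shell; variable names are illustrative:

    %% Read the same per-process stats the new check uses (here for self()).
    [{message_queue_len, QLen}, {memory, Mem}, {binary, BinInfo}] =
        erlang:process_info(self(), [message_queue_len, memory, binary]),
    %% Each BinInfo entry is {BinaryId, Size, RefCount}; dividing Size by
    %% RefCount splits the cost of a shared refc binary between its owners.
    RefcBinSize = trunc(lists:sum([Size / RefC || {_, Size, RefC} <- BinInfo])),
    TotalMem = Mem + RefcBinSize.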
@@ -24,9 +24,11 @@
 -define(MAX_SOCK_BUF_SIZE, 1024 * 50). % Decrease if CPU is cheaper than RAM
 -define(MAX_UP_INIT_BUF_SIZE, 1024 * 1024). %1mb
--define(QUEUE_CHECK_INTERVAL, 5000).
--define(QUEUE_CHECK_MAX_LEN, 50).
--define(QUEUE_CHECK_MAX_MEM, 5 * 1024 * 1024). %5mb
+-define(HEALTH_CHECK_INTERVAL, 5000).
+-define(HEALTH_CHECK_MAX_QLEN, 300).
+-define(HEALTH_CHECK_GC, 400 * 1024). %400kb
+-define(HEALTH_CHECK_MAX_MEM, 4 * 1024 * 1024). %4mb
 -define(APP, mtproto_proxy).
@@ -123,7 +125,7 @@ handle_cast({proxy_ans, Down, Data}, #state{down = Down} = S) ->
     %% telegram server -> proxy
     case up_send(Data, S) of
         {ok, S1} ->
-            check_queue_overflow(bump_timer(S1));
+            maybe_check_health(bump_timer(S1));
         {error, Reason} ->
             lager:error("Error sending tunnelled data to in socket: ~p", [Reason]),
             {stop, normal, S}
@@ -148,6 +150,7 @@ handle_info({tcp, Sock, Data}, #state{sock = Sock,
     case handle_upstream_data(Data, S) of
         {ok, S1} ->
             ok = Transport:setopts(Sock, [{active, once}]),
+            %% Consider checking health here as well
             {noreply, bump_timer(S1)};
         {error, Reason} ->
             lager:info("handle_data error ~p", [Reason]),
@@ -313,14 +316,14 @@ handle_upstream_header(DcId, #state{acc = Acc, ad_tag = Tag, addr = Addr} = S) ->
 %% @doc Terminate if message queue is too big
-check_queue_overflow(#state{last_queue_check = LastCheck} = S) ->
+maybe_check_health(#state{last_queue_check = LastCheck} = S) ->
     NowMs = erlang:system_time(millisecond),
     Delta = NowMs - LastCheck,
-    case Delta < ?QUEUE_CHECK_INTERVAL of
+    case Delta < ?HEALTH_CHECK_INTERVAL of
         true ->
             {noreply, S};
         false ->
-            case do_check_queue_overflow(true) of
+            case check_health() of
                 ok ->
                     {noreply, S#state{last_queue_check = NowMs}};
                 overflow ->
@@ -328,33 +331,53 @@ check_queue_overflow(#state{last_queue_check = LastCheck} = S) ->
             end
     end.

-do_check_queue_overflow(Gc) ->
-    [{_, QLen}, {_, Mem}, {_, Bin}] =
-        erlang:process_info(self(), [message_queue_len, memory, binary]),
-    %% BinSum = sum_binary(Bin),
-    %% lager:debug("Process size check: queue_len=~w, total_mem=~w, memory=~w, binary_sum=~w, binary=~w",
-    %%             [QLen, (Mem + BinSum) / 1024, Mem, BinSum, Bin]),
-    case QLen > ?QUEUE_CHECK_MAX_LEN of
-        true ->
-            RefcBinSize = sum_binary(Bin),
-            TotalMem = Mem + RefcBinSize,
-            case TotalMem > ?QUEUE_CHECK_MAX_MEM of
-                true when Gc ->
-                    erlang:garbage_collect(self()),
-                    do_check_queue_overflow(false);
-                true ->
-                    lager:warning(
-                      "Process too large queue_len=~w, memory=~w, binary_sum=~w, binary=~p",
-                      [QLen, Mem, RefcBinSize, Bin]),
-                    overflow;
-                false -> ok
-            end;
-        false ->
-            ok
-    end.
+%% 1. If proc queue > 300 - stop
+%% 2. If proc total memory > 400kb - do GC and go to 3
+%% 3. If proc total memory > 4mb - stop
+check_health() ->
+    do_check_health([qlen, gc, total_mem], calc_health()).
+
+do_check_health([qlen | _], #{message_queue_len := QLen} = Health) when
+      QLen > ?HEALTH_CHECK_MAX_QLEN ->
+    mtp_metric:count_inc([?APP, healthcheck, total], 1,
+                         #{labels => [message_queue_len]}),
+    lager:warning("Upstream too large queue_len=~w, health=~p", [QLen, Health]),
+    overflow;
+do_check_health([gc | Other], #{total_mem := TotalMem}) when
+      TotalMem > ?HEALTH_CHECK_GC ->
+    %% Maybe it doesn't make sense to do GC if the queue len is more than, e.g., 50?
+    %% In that case almost all of the memory will be in the message queue.
+    mtp_metric:count_inc([?APP, healthcheck, total], 1,
+                         #{labels => [force_gc]}),
+    erlang:garbage_collect(self()),
+    do_check_health(Other, calc_health());
+do_check_health([total_mem | _Other], #{total_mem := TotalMem} = Health) when
+      TotalMem > ?HEALTH_CHECK_MAX_MEM ->
+    mtp_metric:count_inc([?APP, healthcheck, total], 1,
+                         #{labels => [total_memory]}),
+    lager:warning("Process too large total_mem=~p, health=~p",
+                  [TotalMem / 1024, Health]),
+    overflow;
+do_check_health([_Ok | Other], Health) ->
+    do_check_health(Other, Health);
+do_check_health([], _) ->
+    ok.

-sum_binary(Bin) ->
-    trunc(lists:sum([Size / RefC || {_, Size, RefC} <- Bin])).
+calc_health() ->
+    [{_, QLen}, {_, Mem}, {_, BinInfo}] =
+        erlang:process_info(self(), [message_queue_len, memory, binary]),
+    RefcBinSize = sum_binary(BinInfo),
+    TotalMem = Mem + RefcBinSize,
+    #{message_queue_len => QLen,
+      memory => Mem,
+      refc_bin_size => RefcBinSize,
+      refc_bin_count => length(BinInfo),
+      total_mem => TotalMem}.
+
+sum_binary(BinInfo) ->
+    trunc(lists:foldl(fun({_, Size, RefC}, Sum) ->
+                              Sum + (Size / RefC)
+                      end, 0, BinInfo)).

 hex(Bin) ->
     <<begin
...
@@ -111,6 +111,9 @@ active_metrics() ->
      {count, [?APP, timer_switch, total],
       "Connection timeout mode switches",
       #{labels => [listener, from, to]}},
+     {count, [?APP, healthcheck, total],
+      "Upstream self-healthcheck triggered some action",
+      #{labels => [action]}},
      {count, [?APP, received, bytes],
       "Bytes transmitted from upstream/downstream socket",
...
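
A follow-up illustration of why a forced GC at the lower 400kb threshold is worth doing before applying the 4mb kill limit (editorial sketch, not part of the commit; module and function names are invented): refc binaries that a process no longer references are only released once that process garbage-collects, so total_mem can drop sharply right after the forced GC.

    -module(health_gc_demo).
    -export([run/0]).

    %% Illustration only: allocate a large refc binary, drop the reference,
    %% then compare the refc-binary footprint before and after an explicit GC.
    run() ->
        make_garbage(),
        Before = refc_bin_bytes(),
        true = erlang:garbage_collect(self()),
        After = refc_bin_bytes(),
        {Before, After}.   %% After is normally much smaller than Before

    refc_bin_bytes() ->
        {binary, BinInfo} = erlang:process_info(self(), binary),
        trunc(lists:sum([Size / RefC || {_, Size, RefC} <- BinInfo])).

    make_garbage() ->
        _Big = binary:copy(<<0>>, 500 * 1024),  %% >64 bytes, so stored off-heap
        ok.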