Skip to content

Commit

Permalink
Move temp_gc timing to settings.py
Browse files Browse the repository at this point in the history
Allow the temp_gc interval to be changed through settings (a restart is required). On larger,
more heavily used clusters, enough temporary files are sometimes created to fill the disks before
the original 2-day timeout of the temp_gc loop elapses.

Defaults to 12 hours.
  • Loading branch information
oldmantaiter committed Jan 15, 2015
1 parent 87a755c commit 9614bee
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 5 deletions.
6 changes: 6 additions & 0 deletions lib/disco/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@
This adds -noshell to the erlang process. It provides compatibility for running
disco using a non-forking process type in the service definition.
.. envvar:: DATA_GC_INTERVAL
How long to wait, in hours, between garbage collection passes over purged job data.
Default is ``12`` (hours).
.. envvar:: DISCO_WORKER_MAX_MEM
How much memory can be used by worker in total. Worker calls `resource.setrlimit(RLIMIT_AS, limit) <http://docs.python.org/library/resource.html#resource.setrlimit>`_ to set the limit when it starts. Can be either a percentage of total available memory or an exact number of bytes. Note that ``setrlimit`` behaves differently on Linux and Mac OS X, see *man setrlimit* for more information. Default is ``80%`` i.e. 80% of the total available memory.
Expand Down Expand Up @@ -320,6 +325,7 @@ class DiscoSettings(Settings):
'DISCO_USER': "os.getenv('LOGNAME')",
'DISCO_JOB_OWNER': "job_owner()",
'DISCO_WWW_ROOT': "os.path.join(DISCO_MASTER_HOME, 'www')",
'DATA_GC_INTERVAL': "12",
# GC
'DISCO_GC_AFTER': "100 * 365 * 24 * 60 * 60",
#'PROFILE'
Expand Down
9 changes: 4 additions & 5 deletions master/src/temp_gc.erl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

-export([start_link/2]).

-define(GC_INTERVAL, 2 * ?DAY).

-spec start_link(node(), path()) -> no_return().
start_link(Master, DataRoot) ->
try register(temp_gc, self())
Expand All @@ -20,6 +18,7 @@ start_link(Master, DataRoot) ->

-spec loop(path()) -> no_return().
loop(DataRoot) ->
DataGCInterval = list_to_integer(disco:get_setting("DATA_GC_INTERVAL")) * ?HOUR,
try
{{ok, Purged}, {ok, Jobs}} = {get_purged(), event_server:get_jobs(get(master))},
case prim_file:list_dir(DataRoot) of
Expand All @@ -28,18 +27,18 @@ loop(DataRoot) ->
[Name || {Name, active, _Start} <- Jobs]),
process_dir(DataRoot, Dirs, gb_sets:from_ordset(Purged), Active);
E ->
% fresh install, try again after GC_INTERVAL
% fresh install, try again after DATA_GC_INTERVAL from settings
error_logger:info_msg("Tempgc: error listing ~p: ~p",
[DataRoot, E]),
ok
end
catch K:V ->
error_logger:info_msg("Tempgc: error contacting master from ~p: ~p:~p",
[node(), K,V])
% master busy, try again after GC_INTERVAL
% master busy, try again after DATA_GC_INTERVAL from settings
end,
error_logger:info_msg("Tempgc: one pass completed on ~p", [node()]),
timer:sleep(?GC_INTERVAL),
timer:sleep(DataGCInterval),
flush(),
loop(DataRoot).

Expand Down

0 comments on commit 9614bee

Please sign in to comment.