Move temp_gc timing to settings.py

Allow the temp_gc interval to be changed through settings (a restart is required for the change to take effect). On larger, more heavily used clusters, enough temporary files are sometimes created to fill the disks faster than the original 2-day interval of the temp_gc loop. The interval now defaults to 12 hours.
discoproject · Jan 15, 2015 · 9614bee · 9614bee
1 parent 87a755c
commit 9614bee
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 5 deletions.
diff --git a/lib/disco/settings.py b/lib/disco/settings.py
@@ -146,6 +146,11 @@
                 This adds -noshell to the erlang process. It provides compatibility for running
                 disco using a non-forking process type in the service definition.
 
+        .. envvar:: DATA_GC_INTERVAL
+
+                How long to wait before garbage collecting purged job data.
+                Default is ``12`` (hours).
+
         .. envvar:: DISCO_WORKER_MAX_MEM
 
                 How much memory can be used by worker in total. Worker calls `resource.setrlimit(RLIMIT_AS, limit) <http://docs.python.org/library/resource.html#resource.setrlimit>`_ to set the limit when it starts. Can be either a percentage of total available memory or an exact number of bytes. Note that ``setrlimit`` behaves differently on Linux and Mac OS X, see *man setrlimit* for more information. Default is ``80%`` i.e. 80% of the total available memory.
@@ -320,6 +325,7 @@ class DiscoSettings(Settings):
         'DISCO_USER':            "os.getenv('LOGNAME')",
         'DISCO_JOB_OWNER':       "job_owner()",
         'DISCO_WWW_ROOT':        "os.path.join(DISCO_MASTER_HOME, 'www')",
+        'DATA_GC_INTERVAL':      "12",
 # GC
         'DISCO_GC_AFTER':        "100 * 365 * 24 * 60 * 60",
 #'PROFILE'

diff --git a/master/src/temp_gc.erl b/master/src/temp_gc.erl
@@ -8,8 +8,6 @@
 
 -export([start_link/2]).
 
--define(GC_INTERVAL, 2 * ?DAY).
-
 -spec start_link(node(), path()) -> no_return().
 start_link(Master, DataRoot) ->
     try register(temp_gc, self())
@@ -20,6 +18,7 @@ start_link(Master, DataRoot) ->
 
 -spec loop(path()) -> no_return().
 loop(DataRoot) ->
+    DataGCInterval = list_to_integer(disco:get_setting("DATA_GC_INTERVAL")) * ?HOUR,
     try
         {{ok, Purged}, {ok, Jobs}} = {get_purged(), event_server:get_jobs(get(master))},
         case prim_file:list_dir(DataRoot) of
@@ -28,18 +27,18 @@ loop(DataRoot) ->
                            [Name || {Name, active, _Start} <- Jobs]),
                 process_dir(DataRoot, Dirs, gb_sets:from_ordset(Purged), Active);
             E ->
-                % fresh install, try again after GC_INTERVAL
+                % fresh install, try again after DATA_GC_INTERVAL from settings
                 error_logger:info_msg("Tempgc: error listing ~p: ~p",
                                       [DataRoot, E]),
                 ok
         end
     catch K:V ->
             error_logger:info_msg("Tempgc: error contacting master from ~p: ~p:~p",
                                   [node(), K,V])
-            % master busy, try again after GC_INTERVAL
+            % master busy, try again after DATA_GC_INTERVAL from settings
     end,
     error_logger:info_msg("Tempgc: one pass completed on ~p", [node()]),
-    timer:sleep(?GC_INTERVAL),
+    timer:sleep(DataGCInterval),
     flush(),
     loop(DataRoot).