Evaluate a model on multiple sets of as_of_dates
This commit addresses #663, #378, #223 by allowing a model to be
evaluated multiple times, letting users see whether the performance
of a single trained model degrades over time following training.

Users must now set a timechop parameter, `test_evaluation_frequency`,
which adds multiple test matrices to each time split. A model is
tested once on each matrix in its list. Matrices are added until they
reach the label time limit, so every model is tested on the final
test period (assuming model_update_frequency is evenly divisible by
test_evaluation_frequency).
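
For example (a rough sketch: the parameter names follow the
constructor in this diff, while the import path and the specific
date values are illustrative assumptions):

    from triage.component.timechop import Timechop

    chopper = Timechop(
        feature_start_time="2010-01-01",
        feature_end_time="2017-07-01",
        label_start_time="2012-01-01",
        label_end_time="2017-07-01",
        training_as_of_date_frequencies=["1day"],
        max_training_histories=["2year"],
        training_label_timespans=["6month"],
        test_as_of_date_frequencies=["1month"],
        test_durations=["3month"],
        test_label_timespans=["6month"],
        # new parameter: how often to re-test a trained model
        test_evaluation_frequency="3month",
        # 1year is evenly divisible by 3month, so every model also
        # gets tested on the final test period
        model_update_frequency="1year",
    )
    splits = chopper.chop_time()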

This initial commit only makes changes to timechop proper. Remaining
work includes:

- Write tests for the new behavior
- Make timechop plotting work with new behavior

New issues that I do not plan to address in the forthcoming PR:

- Incorporate multiple evaluation times into audition and/or
  postmodeling
- Maybe users should be able to set a maximum evaluation horizon so that
  early models are not tested for, say, 100 time periods
- Evaluation time-splitting could (or should) eventually be done not
  with pre-made matrices but on the fly at evaluation time
ecsalomon committed Apr 24, 2019
1 parent 067cd59 commit f89256f
154 changes: 96 additions & 58 deletions src/triage/component/timechop/timechop.py
@@ -16,16 +16,17 @@
# label_start_time: '2012-01-01',
# label_end_time: '2017-07-01',
#
# model_update_frequency: '1year',
#
# training_label_timespans: ['6month'],
# test_label_timespans: ['6month'],
#
# max_training_histories: ['2year'],
# test_durations: ['3month'],
#
# training_as_of_date_frequencies='1day',
# test_as_of_date_frequencies='1month'
# test_as_of_date_frequencies='1month',
#
# test_evaluation_frequency='1month'
# model_update_frequency: '1year',
#
# }

@@ -37,13 +38,14 @@ def __init__(
feature_end_time,
label_start_time,
label_end_time,
model_update_frequency,
training_as_of_date_frequencies,
max_training_histories,
training_label_timespans,
test_as_of_date_frequencies,
test_durations,
test_label_timespans,
test_evaluation_frequency,
model_update_frequency,
):
self.feature_start_time = dt_from_str(
feature_start_time
@@ -63,10 +65,6 @@
if self.label_start_time > self.label_end_time:
raise ValueError("Label start time after label end time.")

# how frequently to retrain models
self.model_update_frequency = convert_str_to_relativedelta(
model_update_frequency
)

# time between rows for same entity in train matrix
self.training_as_of_date_frequencies = utils.convert_to_list(
@@ -90,6 +88,15 @@
# how much time is included in a label in the test matrix
self.test_label_timespans = utils.convert_to_list(test_label_timespans)

# how often should we evaluate the model in test?
self.test_evaluation_frequency = convert_str_to_relativedelta(test_evaluation_frequency)

# how frequently to retrain models
self.model_update_frequency = convert_str_to_relativedelta(
model_update_frequency
)


def chop_time(self):
""" Given the attributes of the object, define all train/test splits
for all combinations of the temporal parameters.
@@ -189,16 +196,16 @@ def calculate_train_test_split_times(
# ensuring we leave enough of a buffer for the test_label_timespan to get a full
# set of labels for our last testing as_of_date
#
# in our example, last_test_label_time = 2017-07-01 - 6month = 2017-01-01
last_test_label_time = self.label_end_time - test_label_timespan
# in our example, self.last_test_label_time = 2017-07-01 - 6month = 2017-01-01
self.last_test_label_time = self.label_end_time - test_label_timespan

# final label must be able to have feature data associated with it
if last_test_label_time > self.feature_end_time:
last_test_label_time = self.feature_end_time
if self.last_test_label_time > self.feature_end_time:
self.last_test_label_time = self.feature_end_time
raise ValueError(
"Final test label date cannot be after end of feature time."
)
logging.info("Final label as of date: {}".format(last_test_label_time))
logging.info("Final label as of date: {}".format(self.last_test_label_time))

# all split times have to allow at least one training label before them
# e.g., earliest_possible_split_time = max(1995-01-01, 2012-01-01) + 6month = 2012-01-01
@@ -218,7 +225,7 @@
#
# e.g., last_split_time = 2017-01-01 - 3month = 2016-10-01
test_delta = convert_str_to_relativedelta(test_duration)
last_split_time = last_test_label_time - test_delta
last_split_time = self.last_test_label_time - test_delta
logging.info("Final split time: {}".format(last_split_time))
if last_split_time < earliest_possible_split_time:
raise ValueError("No valid train/test split times in temporal config.")
@@ -484,57 +491,88 @@ def define_test_matrices(
# test_duration = 3month
# test_label_timespan = 6month

# the as_of_time_limit is simply the split time plus the test_duration and we
# can avoid checking here for any issues with the label_end_time or
# feature_end_time since we've guaranteed that those limits would be
# satisfied when we calculated the train_test_split_times initially
# triage will keep testing a model until the end of your modeling time
# at a frequency defined by test_evaluation_frequency. this allows you
# to see how a model's performance degrades over time, so we will build
# multiple test matrices starting at the train_test_split_time and
# moving the matrix's start time forward until the matrix's final
# as_of_date exceeds self.last_test_label_time.
#
# for the example, as_of_time_limit = 2016-10-01 + 3month = 2017-01-01
# (note as well that this will be treated as an _exclusive_ limit)
# recall that self.last_test_label_time was computed above; for the
# walkthrough below, suppose a config where
# self.last_test_label_time = 2017-10-01 and
# self.test_evaluation_frequency = '3month' (these differ from the
# header example, which would give 2017-01-01 and '1month')
logging.info(
"Generating test matrix definitions for train/test split %s",
train_test_split_time
)
test_definitions = []

# we start our first matrix at the train_test_split_time, 2016-10-01
matrix_test_start_time = train_test_split_time

# the matrix_as_of_time_limit is simply the start time of the current
# test matrix plus the test_duration, and we can avoid checking here
# for any issues with the label_end_time or feature_end_time since
# we've guaranteed that those limits would be satisfied for at least
# one test matrix when we calculated the train_test_split_times
# initially
#
# for the first test matrix in our example,
# matrix_as_of_time_limit = 2016-10-01 + 3month = 2017-01-01
# (note as well that this will be treated as an _exclusive_ limit)
test_delta = convert_str_to_relativedelta(test_duration)
as_of_time_limit = train_test_split_time + test_delta
logging.info("All test as of times before %s", as_of_time_limit)
matrix_as_of_time_limit = train_test_split_time + test_delta
logging.info("All test as of times before %s", matrix_as_of_time_limit)

while matrix_as_of_time_limit <= self.last_test_label_time:

# calculate the as_of_times associated with each test data frequency
# for our example, we just have one, 1month
for test_as_of_date_frequency in self.test_as_of_date_frequencies:
logging.info(
"Generating test matrix definitions for test data frequency %s",
test_as_of_date_frequency
)

# calculate the as_of_times associated with each test data frequency
# for our example, we just have one, 1month
for test_as_of_date_frequency in self.test_as_of_date_frequencies:
logging.info(
"Generating test matrix definitions for test data frequency %s",
test_as_of_date_frequency
)
# for test as_of_times we step _forwards_ from the train_test_split_time
# to ensure that we always have a prediction set made immediately after
# training is done (so, the freshest possible predictions) even if the
# frequency doesn't divide the test_duration evenly so there's a gap before
# the matrix_as_of_time_limit
#
# for our example, this will give three as_of_dates:
# [2016-10-01, 2016-11-01, 2016-12-01]
# since we start at the train_test_split_time (2016-10-01) and walk forward by
# the test_as_of_date_frequency (1 month) until we've exhausted the test_duration
# (3 months), exclusive (see comments in the method for details)
test_as_of_times = self.calculate_as_of_times(
as_of_start_limit=matrix_test_start_time,
as_of_end_limit=matrix_as_of_time_limit,
data_frequency=convert_str_to_relativedelta(test_as_of_date_frequency),
forward=True,
)
logging.info("test as of times: %s", test_as_of_times)
test_definition = {
"first_as_of_time": matrix_test_start_time,
"last_as_of_time": max(test_as_of_times),
"matrix_info_end_time": max(test_as_of_times)
+ convert_str_to_relativedelta(test_label_timespan),
"as_of_times": AsOfTimeList(test_as_of_times),
"test_label_timespan": test_label_timespan,
"test_as_of_date_frequency": test_as_of_date_frequency,
"test_duration": test_duration,
}
test_definitions.append(test_definition)

# after we've built our first test matrix starting at 2016-10-01,
# we step forward by our test_evaluation_frequency, and the new
# matrix_test_start_time = 2016-10-01 + 3month = 2017-01-01
matrix_test_start_time = matrix_test_start_time + self.test_evaluation_frequency

# a matrix starting at that time will have a
# matrix_as_of_time_limit = 2017-01-01 + 3month = 2017-04-01
# because this is less than self.last_test_label_time, we will
# continue in the loop and add another test matrix covering
# 2017-01-01 up to (but excluding) 2017-04-01
matrix_as_of_time_limit = matrix_test_start_time + test_delta

# for test as_of_times we step _forwards_ from the train_test_split_time
# to ensure that we always have a prediction set made immediately after
# training is done (so, the freshest possible predictions) even if the
# frequency doesn't divide the test_duration evenly so there's a gap before
# the as_of_time_limit
#
# for our example, this will give three as_of_dates:
# [2016-10-01, 2016-11-01, 2016-12-01]
# since we start at the train_test_split_time (2016-10-01) and walk forward by
# the test_as_of_date_frequency (1 month) until we've exhausted the test_duration
# (3 months), exclusive (see comments in the method for details)
test_as_of_times = self.calculate_as_of_times(
as_of_start_limit=train_test_split_time,
as_of_end_limit=as_of_time_limit,
data_frequency=convert_str_to_relativedelta(test_as_of_date_frequency),
forward=True,
)
logging.info("test as of times: %s", test_as_of_times)
test_definition = {
"first_as_of_time": train_test_split_time,
"last_as_of_time": max(test_as_of_times),
"matrix_info_end_time": max(test_as_of_times)
+ convert_str_to_relativedelta(test_label_timespan),
"as_of_times": AsOfTimeList(test_as_of_times),
"test_label_timespan": test_label_timespan,
"test_as_of_date_frequency": test_as_of_date_frequency,
"test_duration": test_duration,
}
test_definitions.append(test_definition)
return test_definitions
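
To see the effect of the change, count the test matrices attached to
each split. A minimal sketch, assuming chop_time() returns a list of
split dicts whose "test_matrices" key holds the list built by
define_test_matrices above:

    # Reusing `chopper` from the config sketch above.
    for split in chopper.chop_time():
        test_matrices = split["test_matrices"]
        print(len(test_matrices), "test matrices for this split")
        for test in test_matrices:
            # keys taken from the test_definition dicts built above
            print("  evaluates", test["first_as_of_time"],
                  "through", test["last_as_of_time"])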
