diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py
index 8a80502f0..410d2ec07 100644
--- a/neuralmonkey/dataset.py
+++ b/neuralmonkey/dataset.py
@@ -96,7 +96,7 @@ def __init__(self,
 
 
 def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
-    """A default set of length-bucket boundaries."""
+    """Create a default set of length-bucket boundaries."""
     assert length_bucket_step > 1.0
     x = min_length
     boundaries = []
@@ -110,28 +110,25 @@ def get_batching_scheme(batch_size: int,
                         max_length: int = None,
                         min_length_bucket: int = 8,
                         length_bucket_step: float = 1.1,
-                        drop_long_sequences: bool = False,
                         shard_multiplier: int = 1,
                         length_multiplier: int = 1,
                         min_length: int = 0) -> BatchingScheme:
-    """A batching scheme based on model hyperparameters.
+    """Create a batching scheme based on model hyperparameters.
 
+    Every batch contains a number of sequences divisible by `shard_multiplier`.
+
     Args:
       batch_size: int, total number of tokens in a batch.
-      max_length: int, sequences longer than this will be skipped. Defaults to
-        batch_size.
+      max_length: int, sequences longer than this will be skipped. Defaults
+        to batch_size.
       min_length_bucket: int
       length_bucket_step: float greater than 1.0
-      drop_long_sequences: bool, if True, then sequences longer than
-        `max_length` are dropped. This prevents generating batches with
-        more than the usual number of tokens, which can cause out-of-memory
-        errors.
-      shard_multiplier: an integer increasing the batch_size to suit splitting
-        across datashards.
+      shard_multiplier: an integer increasing the batch_size to suit
+        splitting across datashards.
      length_multiplier: an integer multiplier that is used to increase the
        batch sizes and sequence length tolerance.
      min_length: int, sequences shorter than this will be skipped.
-    Returns:
+    Return:
      A dictionary with parameters that can be passed to input_pipeline:
        * boundaries: list of bucket boundaries
        * batch_sizes: list of batch sizes for each length bucket
@@ -149,40 +146,33 @@ def get_batching_scheme(batch_size: int,
 
     max_length *= length_multiplier
     batch_sizes = [
-      max(1, batch_size // length) for length in boundaries + [max_length]
+        max(1, batch_size // length) for length in boundaries + [max_length]
     ]
     max_batch_size = max(batch_sizes)
     # Since the Datasets API only allows a single constant for window_size,
     # and it needs divide all bucket_batch_sizes, we pick a highly-composite
-    # window size and then round down all batch sizes to divisors of that window
-    # size, so that a window can always be divided evenly into batches.
-    # TODO(noam): remove this when Dataset API improves.
+    # window size and then round down all batch sizes to divisors of that
+    # window size, so that a window can always be divided evenly into batches.
     highly_composite_numbers = [
-      1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680,
-      2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440,
-      83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
-      720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
-      7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
-      36756720, 43243200, 61261200, 73513440, 110270160
+        1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260,
+        1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360,
+        50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960,
+        554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600,
+        4324320, 6486480, 7207200, 8648640, 10810800, 14414400, 17297280,
+        21621600, 32432400, 36756720, 43243200, 61261200, 73513440, 110270160
     ]
     window_size = max(
-      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+        [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
     divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
     batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
     window_size *= shard_multiplier
     batch_sizes = [bs * shard_multiplier for bs in batch_sizes]
 
-    # The Datasets API splits one window into multiple batches, which
-    # produces runs of many consecutive batches of the same size. This
-    # is bad for training. To solve this, we will shuffle the batches
-    # using a queue which must be several times as large as the maximum
-    # number of batches per window.
-    max_batches_per_window = window_size // min(batch_sizes)
-    shuffle_queue_size = max_batches_per_window * 3
 
     ret = BatchingScheme(bucket_boundaries=boundaries,
                          bucket_batch_sizes=batch_sizes)
     return ret
+
 
 # The protected functions below are designed to convert the ambiguous spec
 # structures to a normalized form.
diff --git a/neuralmonkey/readers/string_vector_reader.py b/neuralmonkey/readers/string_vector_reader.py
index d6545b2a3..439a23838 100644
--- a/neuralmonkey/readers/string_vector_reader.py
+++ b/neuralmonkey/readers/string_vector_reader.py
@@ -13,7 +13,7 @@ def process_line(line: str, lineno: int, path: str) -> np.ndarray:
 
         return np.array(numbers, dtype=dtype)
 
-    def reader(files: List[str])-> Iterable[List[np.ndarray]]:
+    def reader(files: List[str]) -> Iterable[List[np.ndarray]]:
         for path in files:
             current_line = 0
 
diff --git a/tests/hier-multiattention.ini b/tests/hier-multiattention.ini
index f4a4b5c68..f203ab665 100644
--- a/tests/hier-multiattention.ini
+++ b/tests/hier-multiattention.ini
@@ -4,6 +4,7 @@ tf_manager=
 output="tests/outputs/hier-multiattention"
 overwrite_output_dir=True
 epochs=1
+batch_size=1
 train_dataset=
 val_dataset=
 trainer=
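
Note on the batching arithmetic touched by this patch: the sketch below illustrates, outside the diff, how geometric length buckets and per-bucket batch sizes relate to a fixed token budget, and how batch sizes are rounded down to divisors of a common window size. It assumes the boundary-growing loop matches the Tensor2Tensor code this module is adapted from; sketch_batching_scheme, round_to_divisors and token_budget are illustrative names and do not exist in NeuralMonkey.

# Illustrative sketch only -- not part of the patch and not NeuralMonkey API.
from typing import List, Tuple


def bucket_boundaries(max_length: int,
                      min_length: int = 8,
                      length_bucket_step: float = 1.1) -> List[int]:
    """Grow bucket boundaries geometrically (assumed to mirror
    _bucket_boundaries as adapted from Tensor2Tensor)."""
    assert length_bucket_step > 1.0
    x = min_length
    boundaries = []
    while x < max_length:
        boundaries.append(x)
        x = max(x + 1, int(x * length_bucket_step))
    return boundaries


def round_to_divisors(batch_sizes: List[int], window_size: int) -> List[int]:
    """Round each batch size down to a divisor of window_size, so a window
    always splits evenly into whole batches."""
    divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
    return [max(d for d in divisors if d <= bs) for bs in batch_sizes]


def sketch_batching_scheme(token_budget: int,
                           max_length: int) -> Tuple[List[int], List[int]]:
    """Hypothetical stand-in for the core arithmetic of get_batching_scheme."""
    boundaries = bucket_boundaries(max_length)
    # Longer buckets get fewer sequences per batch, so every batch carries
    # roughly `token_budget` tokens regardless of sequence length.
    batch_sizes = [max(1, token_budget // length)
                   for length in boundaries + [max_length]]
    # 720 is just an example of a highly composite window size.
    batch_sizes = round_to_divisors(batch_sizes, window_size=720)
    return boundaries, batch_sizes


if __name__ == "__main__":
    bounds, sizes = sketch_batching_scheme(token_budget=4096, max_length=50)
    print(bounds)  # [8, 9, 10, ..., 42, 46]
    print(sizes)   # e.g. 4096 // 8 = 512, rounded down to 360 (divides 720)

The divisor rounding is also why the patched code picks the largest highly composite number not exceeding 3 * max_batch_size as the window size: a number with many divisors loses very little batch size when each bucket's size is rounded down to one of its divisors.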