fixing failed travis tests
varisd committed Jan 30, 2019
1 parent 7a62312 commit a97affc
Showing 2 changed files with 21 additions and 31 deletions.
50 changes: 20 additions & 30 deletions neuralmonkey/dataset.py
@@ -96,7 +96,7 @@ def __init__(self,


def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
"""A default set of length-bucket boundaries."""
"""Create a default set of length-bucket boundaries."""
assert length_bucket_step > 1.0
x = min_length
boundaries = []
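The loop body of `_bucket_boundaries` is collapsed in this hunk. For context, a minimal sketch of the usual geometric-growth scheme is shown below; the update rule `x = max(x + 1, int(x * length_bucket_step))` is an assumption based on the surrounding signature and is not visible in the diff:

```python
def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
    """Create a default set of length-bucket boundaries (sketch, loop body assumed)."""
    assert length_bucket_step > 1.0
    x = min_length
    boundaries = []
    while x < max_length:
        boundaries.append(x)
        # Grow the next boundary geometrically, but always by at least one token.
        x = max(x + 1, int(x * length_bucket_step))
    return boundaries

# _bucket_boundaries(30) -> [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26, 28]
```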
@@ -110,28 +110,25 @@ def get_batching_scheme(batch_size: int,
max_length: int = None,
min_length_bucket: int = 8,
length_bucket_step: float = 1.1,
drop_long_sequences: bool = False,
shard_multiplier: int = 1,
length_multiplier: int = 1,
min_length: int = 0) -> BatchingScheme:
"""A batching scheme based on model hyperparameters.
"""Create a batching scheme based on model hyperparameters.
Every batch contains a number of sequences divisible by `shard_multiplier`.
Args:
batch_size: int, total number of tokens in a batch.
max_length: int, sequences longer than this will be skipped. Defaults to
batch_size.
max_length: int, sequences longer than this will be skipped. Defaults
to batch_size.
min_length_bucket: int
length_bucket_step: float greater than 1.0
drop_long_sequences: bool, if True, then sequences longer than
`max_length` are dropped. This prevents generating batches with
more than the usual number of tokens, which can cause out-of-memory
errors.
shard_multiplier: an integer increasing the batch_size to suit splitting
across datashards.
shard_multiplier: an integer increasing the batch_size to suit
splitting across datashards.
length_multiplier: an integer multiplier that is used to increase the
batch sizes and sequence length tolerance.
min_length: int, sequences shorter than this will be skipped.
Returns:
Return:
A dictionary with parameters that can be passed to input_pipeline:
* boundaries: list of bucket boundaries
* batch_sizes: list of batch sizes for each length bucket
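As a quick orientation before the next hunk, a hypothetical call (argument values chosen purely for illustration) would look like this, returning one batch size per length bucket:

```python
# Hypothetical usage, assuming the signature and BatchingScheme fields shown in this diff.
scheme = get_batching_scheme(batch_size=4096, max_length=256)
# scheme.bucket_boundaries: increasing sentence-length boundaries, e.g. [8, 9, 10, ...]
# scheme.bucket_batch_sizes: one batch size per bucket, roughly 4096 // boundary,
#     rounded down so that every bucket batch size divides a common window size.
```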
@@ -149,40 +146,33 @@ def get_batching_scheme(batch_size: int,
max_length *= length_multiplier

batch_sizes = [
max(1, batch_size // length) for length in boundaries + [max_length]
max(1, batch_size // length) for length in boundaries + [max_length]
]
max_batch_size = max(batch_sizes)
# Since the Datasets API only allows a single constant for window_size,
# and it needs to divide all bucket_batch_sizes, we pick a highly-composite
# window size and then round down all batch sizes to divisors of that window
# size, so that a window can always be divided evenly into batches.
# TODO(noam): remove this when Dataset API improves.
# window size and then round down all batch sizes to divisors of that
# window size, so that a window can always be divided evenly into batches.
highly_composite_numbers = [
1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680,
2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440,
83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
36756720, 43243200, 61261200, 73513440, 110270160
1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260,
1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360,
50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960,
554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600,
4324320, 6486480, 7207200, 8648640, 10810800, 14414400, 17297280,
21621600, 32432400, 36756720, 43243200, 61261200, 73513440, 110270160
]
window_size = max(
[i for i in highly_composite_numbers if i <= 3 * max_batch_size])
[i for i in highly_composite_numbers if i <= 3 * max_batch_size])
divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
window_size *= shard_multiplier
batch_sizes = [bs * shard_multiplier for bs in batch_sizes]
# The Datasets API splits one window into multiple batches, which
# produces runs of many consecutive batches of the same size. This
# is bad for training. To solve this, we will shuffle the batches
# using a queue which must be several times as large as the maximum
# number of batches per window.
max_batches_per_window = window_size // min(batch_sizes)
shuffle_queue_size = max_batches_per_window * 3

ret = BatchingScheme(bucket_boundaries=boundaries,
bucket_batch_sizes=batch_sizes)
return ret
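
The window-size rounding above is easiest to follow with concrete numbers. The sketch below (not part of the commit; all values chosen for illustration) mirrors the logic with a truncated highly-composite list:

```python
# Illustrative walk-through of the divisor rounding above.
batch_size = 4096
boundaries = [8, 16, 32, 64]
max_length = 128
highly_composite_numbers = [1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240,
                            360, 720, 840, 1260, 1680]  # truncated for brevity

batch_sizes = [max(1, batch_size // length) for length in boundaries + [max_length]]
# -> [512, 256, 128, 64, 32]
max_batch_size = max(batch_sizes)  # 512
window_size = max(i for i in highly_composite_numbers if i <= 3 * max_batch_size)
# -> 1260, the largest highly composite number not exceeding 1536
divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
batch_sizes = [max(d for d in divisors if d <= bs) for bs in batch_sizes]
# -> [420, 252, 126, 63, 30]; each bucket batch size now divides the 1260-item window
```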


# The protected functions below are designed to convert the ambiguous spec
# structures to a normalized form.

2 changes: 1 addition & 1 deletion neuralmonkey/readers/string_vector_reader.py
@@ -13,7 +13,7 @@ def process_line(line: str, lineno: int, path: str) -> np.ndarray:

return np.array(numbers, dtype=dtype)

def reader(files: List[str])-> Iterable[List[np.ndarray]]:
def reader(files: List[str]) -> Iterable[List[np.ndarray]]:
for path in files:
current_line = 0

