From 4dba5a30811e4cef2627e72af5d2c2df8c9b3bde Mon Sep 17 00:00:00 2001
From: Michael Denkowski
Date: Sun, 11 Dec 2022 05:11:32 -0600
Subject: [PATCH] Code cleanup: refactoring, type checking, and formatting (#1076)

---
 CHANGELOG.md                       |  8 ++++
 sockeye/__init__.py                |  2 +-
 sockeye/arguments.py               | 62 ++++++++++++++++--------------
 sockeye/beam_search.py             |  1 -
 sockeye/constants.py               |  7 +---
 sockeye/convert_deepspeed.py       |  2 +-
 sockeye/data_io.py                 |  7 ++--
 sockeye/device.py                  | 25 ------------
 sockeye/encoder.py                 |  3 +-
 sockeye/evaluate.py                | 12 +++---
 sockeye/generate_decoder_states.py | 12 +++---
 sockeye/inference.py               |  1 -
 sockeye/knn.py                     | 14 ++++---
 sockeye/score.py                   |  5 +--
 sockeye/train.py                   |  3 +-
 sockeye/translate.py               |  6 +--
 sockeye/utils.py                   | 34 +++++++++++++++-
 typechecked-files                  | 10 ++++-
 18 files changed, 117 insertions(+), 97 deletions(-)
 delete mode 100644 sockeye/device.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f3d73856b..76ccadf97 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,14 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [3.1.29]
+
+### Changed
+
+- Running `sockeye-evaluate` no longer applies text tokenization for TER (same behavior as other metrics).
+- Turned on type checking for all `sockeye` modules except `test_utils` and addressed resulting type issues.
+- Refactored code in various modules without changing user-level behavior.
+
 ## [3.1.28]
 
 ### Added
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
index 78bb654be..537052cce 100644
--- a/sockeye/__init__.py
+++ b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '3.1.28'
+__version__ = '3.1.29'
diff --git a/sockeye/arguments.py b/sockeye/arguments.py
index 6a613ea08..13403e30f 100644
--- a/sockeye/arguments.py
+++ b/sockeye/arguments.py
@@ -138,7 +138,8 @@ def check_regular_directory(value_to_check):
 
 def int_greater_or_equal(threshold: int) -> Callable:
     """
-    Returns a method that can be used in argument parsing to check that the int argument is greater or equal to `threshold`.
+    Returns a method that can be used in argument parsing to check that the int argument is greater or equal to
+    `threshold`.
 
     :param threshold: The threshold that we assume the cli argument value is greater or equal to.
     :return: A method that can be used as a type in argparse.
@@ -155,7 +156,8 @@ def check_greater_equal(value: str):
 
 def float_greater_or_equal(threshold: float) -> Callable:
     """
-    Returns a method that can be used in argument parsing to check that the float argument is greater or equal to `threshold`.
+    Returns a method that can be used in argument parsing to check that the float argument is greater or equal to
+    `threshold`.
 
     :param threshold: The threshold that we assume the cli argument value is greater or equal to.
     :return: A method that can be used as a type in argparse.
@@ -571,7 +573,8 @@ def add_device_args(params):
     device_params.add_argument('--tf32',
                                type=bool_str(),
                                default=True,
-                               help='Globally enable transparent tf32 acceleration of float32 at the cost of reducing precision to 10 bits')
+                               help='Globally enable transparent tf32 acceleration of float32 at the cost of reducing '
+                                    'precision to 10 bits. Default: %(default)s.')
 
 
 def add_vocab_args(params):
@@ -829,22 +832,20 @@ def add_batch_args(params, default_batch_size=4096, default_batch_type=C.BATCH_T
 
 
 def add_nvs_train_parameters(params):
-    params.add_argument(
-        '--bow-task-weight',
-        type=float_greater_or_equal(0.0),
-        default=1.0,
-        help=
-        'The weight of the auxiliary Bag-of-word (BOW) loss when --neural-vocab-selection is enabled. Default %(default)s.'
-    )
-
-    params.add_argument(
-        '--bow-task-pos-weight',
-        type=float_greater_or_equal(0.0),
-        default=10,
-        help='The weight of the positive class (the set of words present on the target side) for the BOW loss '
-        'when --neural-vocab-selection is set as x * num_negative_class / num_positive_class where x is the '
-        '--bow-task-pos-weight. Higher values will bias more towards recall, resulting in larger vocabularies '
-        'at test time trading off larger vocabularies for higher translation quality. Default %(default)s.')
+    params.add_argument('--bow-task-weight',
+                        type=float_greater_or_equal(0.0),
+                        default=1.0,
+                        help='The weight of the auxiliary Bag-of-word (BOW) loss when --neural-vocab-selection is '
+                             'enabled. Default %(default)s.')
+
+    params.add_argument('--bow-task-pos-weight',
+                        type=float_greater_or_equal(0.0),
+                        default=10,
+                        help='The weight of the positive class (the set of words present on the target side) for the '
+                             'BOW loss when --neural-vocab-selection is set as x * num_negative_class / '
+                             'num_positive_class where x is the --bow-task-pos-weight. Higher values will bias more '
+                             'towards recall, resulting in larger vocabularies at test time trading off larger '
+                             'vocabularies for higher translation quality. Default %(default)s.')
 
 
 def add_training_args(params):
@@ -866,8 +867,9 @@ def add_training_args(params):
                               type=str,
                               default=None,
                               choices=[C.LENGTH_TASK_RATIO, C.LENGTH_TASK_LENGTH],
-                              help='If specified, adds an auxiliary task during training to predict source/target length ratios '
-                                   '(mean squared error loss), or absolute lengths (Poisson) loss. Default %(default)s.')
+                              help='If specified, adds an auxiliary task during training to predict source/target '
+                                   'length ratios (mean squared error loss), or absolute lengths (Poisson) loss. '
+                                   'Default %(default)s.')
     train_params.add_argument('--length-task-weight',
                               type=float_greater_or_equal(0.0),
                               default=1.0,
@@ -875,7 +877,8 @@ def add_training_args(params):
     train_params.add_argument('--length-task-layers',
                               type=int_greater_or_equal(1),
                               default=1,
-                              help='Number of fully-connected layers for predicting the length ratio. Default %(default)s.')
+                              help='Number of fully-connected layers for predicting the length ratio. '
+                                   'Default %(default)s.')
 
     add_nvs_train_parameters(train_params)
 
@@ -1088,7 +1091,8 @@ def add_training_args(params):
 
     train_params.add_argument('--keep-initializations',
                               action="store_true",
-                              help='In addition to keeping the last n params files, also keep params from checkpoint 0.')
+                              help='In addition to keeping the last n params files, also keep params from checkpoint '
+                                   '0.')
 
     train_params.add_argument('--cache-last-best-params',
                               required=False,
@@ -1349,7 +1353,8 @@ def add_inference_args(params):
 
     decode_params.add_argument('--skip-nvs',
                                action='store_true',
-                               help='Manually turn off Neural Vocabulary Selection (NVS) to do a softmax over the full target vocabulary.',
+                               help='Manually turn off Neural Vocabulary Selection (NVS) to do a softmax over the full '
+                                    'target vocabulary.',
                                default=False)
 
     decode_params.add_argument('--nvs-thresh',
@@ -1406,13 +1411,14 @@ def add_brevity_penalty_args(params):
     params.add_argument('--brevity-penalty-weight',
                         default=1.0,
                         type=float_greater_or_equal(0.0),
-                        help='Scaler for the brevity penalty in beam search: weight * log(BP) + score. Default: %(default)s')
+                        help='Scaler for the brevity penalty in beam search: weight * log(BP) + score. '
+                             'Default: %(default)s')
     params.add_argument('--brevity-penalty-constant-length-ratio',
                         default=0.0,
                         type=float_greater_or_equal(0.0),
-                        help='Has effect if --brevity-penalty-type is set to \'constant\'. If positive, overrides the length '
-                             'ratio, used for brevity penalty calculation, for all inputs. If zero, uses the average of length '
-                             'ratios from the training data over all models. Default: %(default)s.')
+                        help='Has effect if --brevity-penalty-type is set to \'constant\'. If positive, overrides the '
+                             'length ratio, used for brevity penalty calculation, for all inputs. If zero, uses the '
+                             'average of length ratios from the training data over all models. Default: %(default)s.')
 
 
 def add_clamp_to_dtype_arg(params):
diff --git a/sockeye/beam_search.py b/sockeye/beam_search.py
index a82ef83c4..6d2bfd77d 100644
--- a/sockeye/beam_search.py
+++ b/sockeye/beam_search.py
@@ -678,7 +678,6 @@ def __init__(self,
         self.output_vocab_size = inference.model_output_vocab_size
         self.output_factor_vocab_size = inference.model_output_factor_vocab_size
         self._inference = inference
-        self.global_avoid_trie = None
         assert inference._skip_softmax, "skipping softmax must be enabled for GreedySearch"
         self.work_block = GreedyTop1()
 
diff --git a/sockeye/constants.py b/sockeye/constants.py
index 670c24b48..552378f77 100644
--- a/sockeye/constants.py
+++ b/sockeye/constants.py
@@ -138,7 +138,6 @@
 JSON_RESTRICT_LEXICON_KEY = "restrict_lexicon"
 JSON_CONSTRAINTS_KEY = "constraints"
 JSON_AVOID_KEY = "avoid"
-JSON_ENCODING = "utf-8"
 
 VERSION_NAME = "version"
 CONFIG_NAME = "config"
@@ -285,7 +284,6 @@
 DTYPE_BF16 = 'bfloat16'
 DTYPE_FP16 = 'float16'
 DTYPE_FP32 = 'float32'
-DTYPE_TF32 = 'tf32'
 DTYPE_INT8 = 'int8'
 DTYPE_INT16 = 'int16'
 DTYPE_INT32 = 'int32'
@@ -364,7 +362,6 @@
 # sequence length count types
 SEQ_LEN_IN_CHARACTERS = "char"
 SEQ_LEN_IN_TOKENS = "token"
-SEQ_LEN_IN_WORDS = "word"  # use case: merge sub-words to original word before counting
 
 # scoring
 SCORING_TYPE_NEGLOGPROB = 'neglogprob'
@@ -383,7 +380,7 @@
 BREVITY_PENALTY_LEARNED = 'learned'
 BREVITY_PENALTY_NONE = 'none'
 
-# k-nn 
+# k-nn
 KNN_STATE_DATA_STORE_NAME = "keys.npy"
 KNN_WORD_DATA_STORE_NAME = "vals.npy"
 KNN_WORD_DATA_STORE_DTYPE = DTYPE_INT32
@@ -391,4 +388,4 @@
 KNN_INDEX_NAME = "key_index"
 KNN_EPSILON = 1e-6
 DEFAULT_DATA_STORE_BLOCK_SIZE = 1024 * 1024
-DEFAULT_KNN_LAMBDA = 0.8
\ No newline at end of file
+DEFAULT_KNN_LAMBDA = 0.8
diff --git a/sockeye/convert_deepspeed.py b/sockeye/convert_deepspeed.py
index 211ea26bb..7642ce23d 100644
--- a/sockeye/convert_deepspeed.py
+++ b/sockeye/convert_deepspeed.py
@@ -67,7 +67,7 @@ def convert_checkpoint_to_params(model_config_fname: str, checkpoint_dirname: st
         model_config = model.SockeyeModel.load_config(model_config_fname)
         sockeye_model = model.SockeyeModel(model_config)
         # Gather the float32 params on CPU
-        state_dict = get_fp32_state_dict_from_zero1_checkpoint(checkpoint_dirname)
+        state_dict = dict(get_fp32_state_dict_from_zero1_checkpoint(checkpoint_dirname))
         # Strip the first prefix from each param name to match the SockeyeModel
         # Ex: 'model.encoder.layers...' -> 'encoder.layers...'
         state_dict = {name[name.find('.') + 1:]: param for (name, param) in state_dict.items()}
diff --git a/sockeye/data_io.py b/sockeye/data_io.py
index db2917ac1..e0724fd2c 100644
--- a/sockeye/data_io.py
+++ b/sockeye/data_io.py
@@ -363,8 +363,8 @@ def create_shards(source_fnames: List[str],
     :param target_fnames: The path to the target text (and optional token-parallel factor files).
     :param num_shards: The total number of shards.
     :param output_prefix: The prefix under which the shard files will be created.
-    :return: List of tuples of source (and source factor) file names and target (and target factor) file names for each shard
-        and a flag of whether the returned file names are temporary and can be deleted.
+    :return: List of tuples of source (and source factor) file names and target (and target factor) file names for each
+        shard and a flag of whether the returned file names are temporary and can be deleted.
""" if num_shards == 1: return [(tuple(source_fnames), tuple(target_fnames))], True @@ -595,7 +595,8 @@ def prepare_data(source_fnames: List[str], pool: multiprocessing.pool.Pool = None, shards: List[Tuple[Tuple[str, ...], Tuple[str, ...]]] = None): """ - :param shards: List of num_shards shards of parallel source and target tuples which in turn contain tuples to shard data factor file paths. + :param shards: List of num_shards shards of parallel source and target tuples which in turn contain tuples to shard + data factor file paths. """ logger.info("Preparing data.") # write vocabularies to data folder diff --git a/sockeye/device.py b/sockeye/device.py deleted file mode 100644 index 976b70cb2..000000000 --- a/sockeye/device.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -import argparse -from typing import Optional - -def init_device(args: argparse.Namespace, logger=None, local_rank : Optional[int] = None): - """ - return requested torch device, optionally enabling tf32 - - :param args "Device Parameters". args.use_cpu will be set if cuda is not available - :param logger optional logger.info(msg) - :param local_rank optional int LOCAL_RANK env for multiple GPU training - """ - if not torch.cuda.is_available(): - if logger is not None: - logger.info("CUDA not available, using cpu") - args.use_cpu = True - device = torch.device('cpu') if args.use_cpu else torch.device('cuda', args.device_id if local_rank is None else local_rank) - if not args.use_cpu: - # Ensure that GPU operations use the correct device by default - torch.cuda.set_device(device) - if args.tf32: - if logger is not None: - logger.info("CUDA: allow tf32 (float32 but with 10 bits precision)") - torch.backends.cuda.matmul.allow_tf32 = True - return device diff --git a/sockeye/encoder.py b/sockeye/encoder.py index 4f4630743..86838fb03 100644 --- a/sockeye/encoder.py +++ b/sockeye/encoder.py @@ -208,7 +208,8 @@ def forward(self, data: pt.Tensor, valid_length: pt.Tensor) -> Tuple[pt.Tensor, _, max_len, __ = data.size() # length_mask for source attention masking. Shape: (batch_size, max_len) - single_head_att_mask = layers.prepare_source_length_mask(valid_length, self.config.attention_heads, max_length=max_len, expand=False) + single_head_att_mask = layers.prepare_source_length_mask(valid_length, self.config.attention_heads, + max_length=max_len, expand=False) # Shape: (batch_size, max_len) -> (batch_size * heads, 1, max_len) att_mask = single_head_att_mask.unsqueeze(1).expand(-1, self.config.attention_heads, -1).reshape((-1, max_len)).unsqueeze(1) att_mask = att_mask.expand(-1, max_len, -1) diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py index ecf5a3783..5e5c18ce2 100644 --- a/sockeye/evaluate.py +++ b/sockeye/evaluate.py @@ -47,7 +47,7 @@ def raw_corpus_bleu(hypotheses: Iterable[str], references: Iterable[str], :param offset: Smoothing constant. :return: BLEU score as float between 0 and 1. """ - return sacrebleu.raw_corpus_bleu(hypotheses, [references], smooth_value=offset).score / 100.0 + return sacrebleu.raw_corpus_bleu(hypotheses, [references], smooth_value=offset).score / 100.0 # type: ignore def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> float: @@ -58,7 +58,7 @@ def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> flo :param references: Reference stream. :return: chrF score as float between 0 and 1. 
""" - return sacrebleu.corpus_chrf(hypotheses, [references]).score + return sacrebleu.corpus_chrf(hypotheses, [references]).score # type: ignore def raw_corpus_ter(hypotheses: Iterable[str], references: Iterable[str]) -> float: @@ -69,8 +69,8 @@ def raw_corpus_ter(hypotheses: Iterable[str], references: Iterable[str]) -> floa :param references: Reference stream. :return: TER score as float between 0 and 1. """ - ter = sacrebleu.metrics.TER(argparse.Namespace()) - return ter.corpus_score(hypotheses, [references]).score + ter = sacrebleu.metrics.TER() + return ter.corpus_score(hypotheses, [references]).score # type: ignore def raw_corpus_rouge1(hypotheses: Iterable[str], references: Iterable[str]) -> float: @@ -186,8 +186,8 @@ def _print_mean_std_score(metrics: List[Tuple[str, Callable]], scores: Dict[str, scores_mean_std = [] # type: List[str] for name, _ in metrics: if len(scores[name]) > 1: - score_mean = np.item(np.mean(scores[name])) - score_std = np.item(np.std(scores[name], ddof=1)) + score_mean = np.mean(scores[name]).item() + score_std = np.std(scores[name], ddof=1).item() scores_mean_std.append("%.3f\t%.3f" % (score_mean, score_std)) else: score = scores[name][0] diff --git a/sockeye/generate_decoder_states.py b/sockeye/generate_decoder_states.py index 8df008f77..2c636767a 100644 --- a/sockeye/generate_decoder_states.py +++ b/sockeye/generate_decoder_states.py @@ -14,7 +14,7 @@ import argparse import logging import os -from typing import Dict, List +from typing import List, Optional import numpy as np import torch as pt @@ -50,7 +50,7 @@ def __init__(self, self.num_dim = num_dim # dimension of a single entry self.dtype = dtype self.block_size = -1 - self.mmap = None + self.mmap = None # type: Optional[np.memmap] self.tail_idx = 0 # where the next entry should be inserted self.size = 0 # size of storage already assigned @@ -120,12 +120,12 @@ def __init__(self, self.max_seq_len_target = max_seq_len_target self.output_dir = output_dir - self.state_store_file = None - self.words_store_file = None + self.state_store_file = None # type: Optional[NumpyMemmapStorage] + self.words_store_file = None # type: Optional[NumpyMemmapStorage] # info for KNNConfig self.num_states = 0 - self.dimension = None + self.dimension = None # type: Optional[int] self.state_data_type = utils.get_numpy_dtype(state_data_type) self.word_data_type = utils.get_numpy_dtype(word_data_type) @@ -186,7 +186,7 @@ def generate_states_and_store(self, trace_inputs = {'get_decoder_states': model_inputs} self.traced_model = pt.jit.trace_module(self.model, trace_inputs, strict=False) # shape: (batch, seq_len, hidden_dim) - decoder_states = self.traced_model.get_decoder_states(*model_inputs) + decoder_states = self.traced_model.get_decoder_states(*model_inputs) # type: ignore # flatten batch and seq_len dimensions, remove pads on the target pad_mask = (batch.target != C.PAD_ID)[:, :, 0] # shape: (batch, seq_len) diff --git a/sockeye/inference.py b/sockeye/inference.py index f1cc993e5..25e6480c2 100644 --- a/sockeye/inference.py +++ b/sockeye/inference.py @@ -110,7 +110,6 @@ def get_max_output_length(input_length: int): return max_input_len, get_max_output_length -BeamHistory = Dict[str, List] Tokens = List[str] TokenIds = List[List[int]] # each token id may contain multiple factors SentenceId = Union[int, str] diff --git a/sockeye/knn.py b/sockeye/knn.py index dbad4a38e..ce345183f 100755 --- a/sockeye/knn.py +++ b/sockeye/knn.py @@ -82,14 +82,14 @@ def init_faiss_index(self, train_sample: Optional[np.memmap] = None): return 
 
-    def add_items(self, index, keys: np.array):
+    def add_items(self, index, keys: np.ndarray):
         """Add items to the index (must call `init_faiss_index` first)."""
         item_count, key_dim = keys.shape
         assert key_dim == self.config.dimension
 
         index.add(keys.astype(np.float32))  # unfortunately, faiss index only supports float32
 
-    def block_add_items(self, index, keys: np.array, block_size: int = C.DEFAULT_DATA_STORE_BLOCK_SIZE):
+    def block_add_items(self, index, keys: np.ndarray, block_size: int = C.DEFAULT_DATA_STORE_BLOCK_SIZE):
         """Add items to the index in blocks -- used for a large number of items (must call `init_faiss_index` first)."""
         item_count, key_dim = keys.shape
         assert key_dim == self.config.dimension
@@ -106,7 +106,7 @@ def block_add_items(self, index, keys: np.array, block_size: int = C.DEFAULT_DAT
             index.add(keys[start:item_count].astype(np.float32))  # unfortunately, faiss index only supports float32
 
     @staticmethod
-    def build_train_sample(keys: np.array, sample_size: int):
+    def build_train_sample(keys: np.ndarray, sample_size: int):
         """Randomly sample `sample_size` keys as training sample."""
         item_count, _ = keys.shape
         assert 0 < sample_size <= item_count
@@ -119,7 +119,7 @@ def build_train_sample(keys: np.array, sample_size: int):
 
         return train_sample
 
-    def build_faiss_index(self, keys: np.array, train_sample: Optional[np.memmap] = None):
+    def build_faiss_index(self, keys: np.ndarray, train_sample: Optional[np.memmap] = None):
         """
         Top-level function of the class to build faiss index for a set of keys, optionally with samples for training.
         """
@@ -149,8 +149,10 @@ def get_config_path(dir):
 
 
 def build_knn_index_package(args):
-    """Top-level function that builds a kNN index package (kNN index and config file)
-    from an existing state and word store."""
+    """
+    Top-level function that builds a kNN index package (kNN index and config file) from an existing state and word
+    store.
+    """
     state_store_filename = get_state_store_path(args.input_dir)
     word_store_filename = get_word_store_path(args.input_dir)
     config_filename = get_config_path(args.input_dir)
diff --git a/sockeye/score.py b/sockeye/score.py
index 552df3b24..12eef0147 100644
--- a/sockeye/score.py
+++ b/sockeye/score.py
@@ -18,14 +18,11 @@
 import logging
 import os
 
-import torch as pt
-
 from . import arguments
 from . import constants as C
 from . import data_io
 from . import utils
 from .beam_search import CandidateScorer
-from .device import init_device
 from .log import setup_main_logger
 from .model import load_model
 from .output_handler import get_output_handler
@@ -51,7 +48,7 @@ def score(args: argparse.Namespace):
 
     utils.log_basic_info(args)
 
-    device = init_device(args, logger)
+    device = utils.init_device(args)
     logger.info(f"Scoring device: {device}")
 
     model, source_vocabs, target_vocabs = load_model(args.model, device=device, dtype=args.dtype)
diff --git a/sockeye/train.py b/sockeye/train.py
index ac9a9fbe6..57100b690 100644
--- a/sockeye/train.py
+++ b/sockeye/train.py
@@ -57,7 +57,6 @@
 from . import utils
 from . import vocab
 from .config import Config
-from .device import init_device
 from .log import setup_main_logger
 from .utils import check_condition
 
@@ -998,7 +997,7 @@ def train(args: argparse.Namespace, custom_metrics_logger: Optional[Callable] = 
     logger.info("Adjusting maximum length to reserve space for a BOS/EOS marker. New maximum length: (%d, %d)",
                 max_seq_len_source, max_seq_len_target)
 
-    device = init_device(args, logger, utils.get_local_rank() if utils.is_distributed() else None)
+    device = utils.init_device(args)
     logger.info(f'Training Device: {device}')
     utils.seed_rngs(args.seed)
diff --git a/sockeye/translate.py b/sockeye/translate.py
index f336c3e72..17989ac49 100644
--- a/sockeye/translate.py
+++ b/sockeye/translate.py
@@ -22,9 +22,6 @@
 from contextlib import ExitStack
 from typing import Dict, Generator, List, Optional, Union
 
-import torch as pt
-
-from .device import init_device
 from sockeye.lexicon import load_restrict_lexicon, RestrictLexicon
 from sockeye.log import setup_main_logger
 from sockeye.model import load_models
@@ -33,6 +30,7 @@
 from . import arguments
 from . import constants as C
 from . import inference
+from . import utils
 
 logger = logging.getLogger(__name__)
 
@@ -67,7 +65,7 @@ def run_translate(args: argparse.Namespace):
 
     output_handler = get_output_handler(args.output_type, args.output)
 
-    device = init_device(args, logger)
+    device = utils.init_device(args)
     logger.info(f"Translate Device: {device}")
 
     models, source_vocabs, target_vocabs = load_models(device=device,
diff --git a/sockeye/utils.py b/sockeye/utils.py
index 1e1e981aa..8d2219746 100644
--- a/sockeye/utils.py
+++ b/sockeye/utils.py
@@ -14,6 +14,7 @@
 """
 A set of utility methods.
 """
+import argparse
 import binascii
 import gzip
 import itertools
@@ -324,7 +325,8 @@ def shift_prefix_factors(prefix_factors: pt.Tensor) -> pt.Tensor:
     :return new prefix_factors_shift (batch size, length + 1, num of factors)
     """
     prefix_factors_sizes = prefix_factors.size()
-    prefix_factors_shift = pt.zeros(prefix_factors_sizes[0], prefix_factors_sizes[1] + 1, prefix_factors_sizes[2], dtype=prefix_factors.dtype, device=prefix_factors.device)
+    prefix_factors_shift = pt.zeros(prefix_factors_sizes[0], prefix_factors_sizes[1] + 1, prefix_factors_sizes[2],
+                                    dtype=prefix_factors.dtype, device=prefix_factors.device)
     prefix_factors_shift[:, 1:] = prefix_factors
     return prefix_factors_shift
 
@@ -785,3 +787,33 @@ def compute_isometric_score(hypothesis: str, hypothesis_score: float, source: st
     isometric_score = pred_sub_score + synchrony_sub_score
 
     return isometric_score
+
+
+def init_device(args: argparse.Namespace) -> pt.device:
+    """
+    Select Torch device based on CLI args:
+    - When CUDA is not available, the device defaults to CPU.
+    - When using CUDA, tf32 is enabled if specified.
+    - When running distributed training, the CUDA device is determined by local
+      rank instead of CLI args.
+
+    :param args: Parsed CLI args including device parameters.
+
+    :return: Torch device.
+ """ + + use_cpu = args.use_cpu + if not use_cpu and not pt.cuda.is_available(): + logger.info('CUDA not available, defaulting to CPU device') + use_cpu = True + if use_cpu: + return pt.device('cpu') + + device = pt.device('cuda', get_local_rank() if is_distributed() else args.device_id) + # Ensure that GPU operations use the correct device by default + pt.cuda.set_device(device) + if args.tf32: + pt.backends.cuda.matmul.allow_tf32 = True + logger.info('CUDA: allow tf32 (float32 but with 10 bits precision)') + + return device diff --git a/typechecked-files b/typechecked-files index 7e6a578a7..e0218c892 100644 --- a/typechecked-files +++ b/typechecked-files @@ -1,24 +1,31 @@ sockeye/__init__.py sockeye/arguments.py sockeye/average.py +sockeye/beam_search.py sockeye/checkpoint_decoder.py sockeye/config.py sockeye/constants.py -sockeye/beam_search.py +sockeye/convert_deepspeed.py sockeye/data_io.py sockeye/decoder.py sockeye/embeddings.py sockeye/encoder.py +sockeye/evaluate.py +sockeye/generate_decoder_states.py sockeye/inference.py +sockeye/initial_setup.py +sockeye/knn.py sockeye/layers.py sockeye/lexicon.py sockeye/log.py sockeye/loss.py sockeye/lr_scheduler.py sockeye/model.py +sockeye/nvs.py sockeye/optimizers.py sockeye/output_handler.py sockeye/prepare_data.py +sockeye/quantize.py sockeye/rerank.py sockeye/score.py sockeye/scoring.py @@ -28,4 +35,3 @@ sockeye/transformer.py sockeye/translate.py sockeye/utils.py sockeye/vocab.py -sockeye/initial_setup.py