Skip to content

Commit

Permalink
Merge pull request #114 from NTMC-Community/dev
Browse files Browse the repository at this point in the history
Version 1.1.1
  • Loading branch information
Chriskuei authored Dec 12, 2019
2 parents 2d27487 + 068d8ac commit 0f08a4f
Show file tree
Hide file tree
Showing 18 changed files with 541 additions and 570 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,13 @@ trainset = mz.dataloader.Dataset(
data_pack=train_processed,
mode='pair',
num_dup=1,
num_neg=4
num_neg=4,
batch_size=32
)
validset = mz.dataloader.Dataset(
data_pack=valid_processed,
mode='point'
mode='point',
batch_size=32
)
```

Expand All @@ -110,13 +112,11 @@ padding_callback = mz.models.ArcI.get_default_padding_callback()

trainloader = mz.dataloader.DataLoader(
dataset=trainset,
batch_size=32,
stage='train',
callback=padding_callback
)
validloader = mz.dataloader.DataLoader(
dataset=validset,
batch_size=32,
stage='dev',
callback=padding_callback
)
Expand All @@ -127,6 +127,8 @@ Initialize the model, fine-tune the hyper-parameters:
```python
model = mz.models.ArcI()
model.params['task'] = ranking_task
model.params['embedding_output_dim'] = 100
model.params['embedding_input_dim'] = preprocessor.context['embedding_input_dim']
model.guess_and_fill_missing_params()
model.build()
```
Expand Down
14 changes: 8 additions & 6 deletions matchzoo/auto/preparer/preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,14 +159,20 @@ def _build_matrix(self, preprocessor, embedding):
return np.random.uniform(-0.2, 0.2, matrix_shape)

def _build_dataset_builder(self, model, embedding_matrix, preprocessor):
builder_kwargs = dict(callbacks=[])
builder_kwargs = dict(
callbacks=[],
batch_size=self._config['batch_size'],
shuffle=self._config['shuffle'],
sort=self._config['sort']
)

if isinstance(self._task.losses[0], (mz.losses.RankHingeLoss,
mz.losses.RankCrossEntropyLoss)):
builder_kwargs.update(dict(
mode='pair',
num_dup=self._config['num_dup'],
num_neg=self._config['num_neg']
num_neg=self._config['num_neg'],
resample=self._config['resample'],
))

if isinstance(model, mz.models.CDSSM):
Expand Down Expand Up @@ -201,11 +207,7 @@ def _build_dataset_builder(self, model, embedding_matrix, preprocessor):

def _build_dataloader_builder(self, model, callback):
    """Return a ``DataLoaderBuilder`` configured for the current stage.

    :param model: Model being prepared (unused here; kept for a uniform
        builder-factory signature).
    :param callback: Batch callback (e.g. padding) wired into the loader.
    """
    return DataLoaderBuilder(
        stage=self._config['stage'],
        callback=callback,
    )
Expand Down
1 change: 0 additions & 1 deletion matchzoo/dataloader/callbacks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .lambda_callback import LambdaCallback
from .dynamic_pooling import DynamicPooling
from .histogram import Histogram
from .ngram import Ngram
from .padding import BasicPadding
Expand Down
92 changes: 0 additions & 92 deletions matchzoo/dataloader/callbacks/dynamic_pooling.py

This file was deleted.

46 changes: 38 additions & 8 deletions matchzoo/dataloader/callbacks/padding.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,35 @@
import typing
from collections import Iterable

import numpy as np

from matchzoo.engine.base_callback import BaseCallback


def _infer_dtype(value):
"""Infer the dtype for the features.
It is required as the input is usually array of objects before padding.
"""
while isinstance(value, (list, tuple)) and len(value) > 0:
value = value[0]

if not isinstance(value, Iterable):
return np.array(value).dtype

if value is not None and len(value) > 0 and np.issubdtype(
np.array(value).dtype, np.generic):
dtype = np.array(value[0]).dtype
else:
dtype = value.dtype

# Single Precision
if dtype == np.double:
dtype = np.float32

return dtype


def _padding_2D(input, output, mode: str = 'pre'):
"""
Pad the input 2D-tensor to the output 2D-tensor.
Expand Down Expand Up @@ -122,24 +147,26 @@ def on_batch_unpacked(self, x: dict, y: np.ndarray):
pad_length_right = self._fixed_length_right

for key, value in x.items():
dtype = _infer_dtype(value)

if key == 'text_left':
padded_value = np.full([batch_size, pad_length_left],
self._pad_word_value, dtype=value.dtype)
self._pad_word_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_word_mode)
elif key == 'text_right':
padded_value = np.full([batch_size, pad_length_right],
self._pad_word_value, dtype=value.dtype)
self._pad_word_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_word_mode)
elif key == 'ngram_left':
padded_value = np.full(
[batch_size, pad_length_left, ngram_length],
self._pad_ngram_value, dtype=value.dtype
self._pad_ngram_value, dtype=dtype
)
_padding_3D(value, padded_value, self._pad_ngram_mode)
elif key == 'ngram_right':
padded_value = np.full(
[batch_size, pad_length_right, ngram_length],
self._pad_ngram_value, dtype=value.dtype
self._pad_ngram_value, dtype=dtype
)
_padding_3D(value, padded_value, self._pad_ngram_mode)
else:
Expand Down Expand Up @@ -193,18 +220,21 @@ def on_batch_unpacked(self, x: dict, y: np.ndarray):
if key != 'text_left' and key != 'text_right' and \
key != 'match_histogram':
continue
elif key == 'text_left':

dtype = _infer_dtype(value)

if key == 'text_left':
padded_value = np.full([batch_size, pad_length_left],
self._pad_value, dtype=value.dtype)
self._pad_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_mode)
elif key == 'text_right':
padded_value = np.full([batch_size, pad_length_right],
self._pad_value, dtype=value.dtype)
self._pad_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_mode)
else: # key == 'match_histogram'
padded_value = np.full(
[batch_size, pad_length_left, bin_size],
self._pad_value, dtype=value.dtype)
self._pad_value, dtype=dtype)
_padding_3D(value, padded_value, self._pad_mode)
x[key] = padded_value

Expand Down
Loading

0 comments on commit 0f08a4f

Please sign in to comment.