Skip to content

Commit

Permalink
Merge pull request #114 from NTMC-Community/dev
Browse files Browse the repository at this point in the history
Version 1.1.1
  • Loading branch information
Chriskuei authored Dec 12, 2019
2 parents 2d27487 + 068d8ac commit 0f08a4f
Show file tree
Hide file tree
Showing 18 changed files with 541 additions and 570 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,13 @@ trainset = mz.dataloader.Dataset(
data_pack=train_processed,
mode='pair',
num_dup=1,
num_neg=4
num_neg=4,
batch_size=32
)
validset = mz.dataloader.Dataset(
data_pack=valid_processed,
mode='point'
mode='point',
batch_size=32
)
```

Expand All @@ -110,13 +112,11 @@ padding_callback = mz.models.ArcI.get_default_padding_callback()

trainloader = mz.dataloader.DataLoader(
dataset=trainset,
batch_size=32,
stage='train',
callback=padding_callback
)
validloader = mz.dataloader.DataLoader(
dataset=validset,
batch_size=32,
stage='dev',
callback=padding_callback
)
Expand All @@ -127,6 +127,8 @@ Initialize the model, fine-tune the hyper-parameters:
```python
model = mz.models.ArcI()
model.params['task'] = ranking_task
model.params['embedding_output_dim'] = 100
model.params['embedding_input_dim'] = preprocessor.context['embedding_input_dim']
model.guess_and_fill_missing_params()
model.build()
```
Expand Down
14 changes: 8 additions & 6 deletions matchzoo/auto/preparer/preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,14 +159,20 @@ def _build_matrix(self, preprocessor, embedding):
return np.random.uniform(-0.2, 0.2, matrix_shape)

def _build_dataset_builder(self, model, embedding_matrix, preprocessor):
builder_kwargs = dict(callbacks=[])
builder_kwargs = dict(
callbacks=[],
batch_size=self._config['batch_size'],
shuffle=self._config['shuffle'],
sort=self._config['sort']
)

if isinstance(self._task.losses[0], (mz.losses.RankHingeLoss,
mz.losses.RankCrossEntropyLoss)):
builder_kwargs.update(dict(
mode='pair',
num_dup=self._config['num_dup'],
num_neg=self._config['num_neg']
num_neg=self._config['num_neg'],
resample=self._config['resample'],
))

if isinstance(model, mz.models.CDSSM):
Expand Down Expand Up @@ -201,11 +207,7 @@ def _build_dataset_builder(self, model, embedding_matrix, preprocessor):

def _build_dataloader_builder(self, model, callback):
    """Return a ``DataLoaderBuilder`` configured for the current stage.

    :param model: Model being prepared (unused here; kept for a uniform
        builder-factory signature).
    :param callback: Batch callback (e.g. padding) wired into the loader.
    """
    return DataLoaderBuilder(
        stage=self._config['stage'],
        callback=callback,
    )
Expand Down
1 change: 0 additions & 1 deletion matchzoo/dataloader/callbacks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .lambda_callback import LambdaCallback
from .dynamic_pooling import DynamicPooling
from .histogram import Histogram
from .ngram import Ngram
from .padding import BasicPadding
Expand Down
92 changes: 0 additions & 92 deletions matchzoo/dataloader/callbacks/dynamic_pooling.py

This file was deleted.

46 changes: 38 additions & 8 deletions matchzoo/dataloader/callbacks/padding.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,35 @@
import typing
from collections import Iterable

import numpy as np

from matchzoo.engine.base_callback import BaseCallback


def _infer_dtype(value):
"""Infer the dtype for the features.
It is required as the input is usually array of objects before padding.
"""
while isinstance(value, (list, tuple)) and len(value) > 0:
value = value[0]

if not isinstance(value, Iterable):
return np.array(value).dtype

if value is not None and len(value) > 0 and np.issubdtype(
np.array(value).dtype, np.generic):
dtype = np.array(value[0]).dtype
else:
dtype = value.dtype

# Single Precision
if dtype == np.double:
dtype = np.float32

return dtype


def _padding_2D(input, output, mode: str = 'pre'):
"""
Pad the input 2D-tensor to the output 2D-tensor.
Expand Down Expand Up @@ -122,24 +147,26 @@ def on_batch_unpacked(self, x: dict, y: np.ndarray):
pad_length_right = self._fixed_length_right

for key, value in x.items():
dtype = _infer_dtype(value)

if key == 'text_left':
padded_value = np.full([batch_size, pad_length_left],
self._pad_word_value, dtype=value.dtype)
self._pad_word_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_word_mode)
elif key == 'text_right':
padded_value = np.full([batch_size, pad_length_right],
self._pad_word_value, dtype=value.dtype)
self._pad_word_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_word_mode)
elif key == 'ngram_left':
padded_value = np.full(
[batch_size, pad_length_left, ngram_length],
self._pad_ngram_value, dtype=value.dtype
self._pad_ngram_value, dtype=dtype
)
_padding_3D(value, padded_value, self._pad_ngram_mode)
elif key == 'ngram_right':
padded_value = np.full(
[batch_size, pad_length_right, ngram_length],
self._pad_ngram_value, dtype=value.dtype
self._pad_ngram_value, dtype=dtype
)
_padding_3D(value, padded_value, self._pad_ngram_mode)
else:
Expand Down Expand Up @@ -193,18 +220,21 @@ def on_batch_unpacked(self, x: dict, y: np.ndarray):
if key != 'text_left' and key != 'text_right' and \
key != 'match_histogram':
continue
elif key == 'text_left':

dtype = _infer_dtype(value)

if key == 'text_left':
padded_value = np.full([batch_size, pad_length_left],
self._pad_value, dtype=value.dtype)
self._pad_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_mode)
elif key == 'text_right':
padded_value = np.full([batch_size, pad_length_right],
self._pad_value, dtype=value.dtype)
self._pad_value, dtype=dtype)
_padding_2D(value, padded_value, self._pad_mode)
else: # key == 'match_histogram'
padded_value = np.full(
[batch_size, pad_length_left, bin_size],
self._pad_value, dtype=value.dtype)
self._pad_value, dtype=dtype)
_padding_3D(value, padded_value, self._pad_mode)
x[key] = padded_value

Expand Down
Loading

0 comments on commit 0f08a4f

Please sign in to comment.