From dcf28c4f0ed45d71f0cc1bb43cb209ff4b82dec7 Mon Sep 17 00:00:00 2001 From: Haifeng Wu Date: Wed, 20 Nov 2024 22:16:51 +0800 Subject: [PATCH] fix var len layer --- deeptables/models/layers.py | 19 +----- .../tests/models/var_len_categorical_test.py | 60 +++++++++---------- .../models/zdask_var_len_categorical_test.py | 28 ++++----- 3 files changed, 46 insertions(+), 61 deletions(-) diff --git a/deeptables/models/layers.py b/deeptables/models/layers.py index 2538a77..d235c93 100644 --- a/deeptables/models/layers.py +++ b/deeptables/models/layers.py @@ -931,20 +931,11 @@ def __init__(self, pooling_strategy='max', dropout_rate=0., **kwargs): super(VarLenColumnEmbedding, self).__init__(**kwargs) def build(self, input_shape): - import keras super(VarLenColumnEmbedding, self).build(input_shape) - - height = input_shape[1] - if self.pooling_strategy == "mean": - self._pooling_layer = keras.layers.AveragePooling2D(pool_size=(height, 1)) - else: - self._pooling_layer = keras.layers.MaxPooling2D(pool_size=(height, 1)) - if self.dropout_rate > 0: self._dropout = SpatialDropout1D(self.dropout_rate) else: self._dropout = None - self.built = True def call(self, inputs): @@ -957,14 +948,8 @@ def call(self, inputs): else: dropout_output = embedding_output - # 3. expand dim for polling - inputs_4d = tf.expand_dims(dropout_output, 3) # add channels dim - - # 4. polling - tensor_pooling = self._pooling_layer(inputs_4d) - - # 5. format output - return tf.squeeze(tensor_pooling, 3) + # 3. format output + return dropout_output def compute_mask(self, inputs, mask): return None diff --git a/deeptables/tests/models/var_len_categorical_test.py b/deeptables/tests/models/var_len_categorical_test.py index 407936f..04e26c3 100644 --- a/deeptables/tests/models/var_len_categorical_test.py +++ b/deeptables/tests/models/var_len_categorical_test.py @@ -5,33 +5,33 @@ from hypernets.tabular import get_tool_box -# class TestVarLenCategoricalFeature: -# -# def setup_class(cls): -# cls.df = dsutils.load_movielens().drop(['timestamp', "title"], axis=1) -# -# def test_var_categorical_feature(self): -# X = self.df.copy() -# y = X.pop('rating').values.astype('float32') -# -# conf = deeptable.ModelConfig(nets=['dnn_nets'], -# task=consts.TASK_REGRESSION, -# categorical_columns=["movie_id", "user_id", "gender", "occupation", "zip", "title", -# "age"], -# metrics=['mse'], -# fixed_embedding_dim=True, -# embeddings_output_dim=4, -# apply_gbm_features=False, -# apply_class_weight=True, -# earlystopping_patience=5, -# var_len_categorical_columns=[('genres', "|", "max")] -# ) -# -# dt = deeptable.DeepTable(config=conf) -# -# X_train, X_validation, y_train, y_validation = get_tool_box(X).train_test_split(X, y, test_size=0.2) -# -# model, history = dt.fit(X_train, y_train, validation_data=(X_validation, y_validation), -# epochs=10, batch_size=32) -# -# assert 'genres' in model.model.input_names +class TestVarLenCategoricalFeature: + + def setup_class(cls): + cls.df = dsutils.load_movielens().drop(['timestamp', "title"], axis=1) + + def test_var_categorical_feature(self): + X = self.df.copy() + y = X.pop('rating').values.astype('float32') + + conf = deeptable.ModelConfig(nets=['dnn_nets'], + task=consts.TASK_REGRESSION, + categorical_columns=["movie_id", "user_id", "gender", "occupation", "zip", "title", + "age"], + metrics=['mse'], + fixed_embedding_dim=True, + embeddings_output_dim=4, + apply_gbm_features=False, + apply_class_weight=True, + earlystopping_patience=5, + var_len_categorical_columns=[('genres', "|", "max")] + ) + + dt = deeptable.DeepTable(config=conf) + + X_train, X_validation, y_train, y_validation = get_tool_box(X).train_test_split(X, y, test_size=0.2) + + model, history = dt.fit(X_train, y_train, validation_data=(X_validation, y_validation), + epochs=10, batch_size=32) + names = [_.name for _ in model.model.inputs] + assert 'genres' in names diff --git a/deeptables/tests/models/zdask_var_len_categorical_test.py b/deeptables/tests/models/zdask_var_len_categorical_test.py index 5887558..3d30713 100644 --- a/deeptables/tests/models/zdask_var_len_categorical_test.py +++ b/deeptables/tests/models/zdask_var_len_categorical_test.py @@ -1,16 +1,16 @@ # -*- encoding: utf-8 -*- -# from hypernets.tests.tabular.tb_dask import is_dask_installed, if_dask_ready, setup_dask -# from .var_len_categorical_test import TestVarLenCategoricalFeature -# -# if is_dask_installed: -# import dask.dataframe as dd -# +from hypernets.tests.tabular.tb_dask import is_dask_installed, if_dask_ready, setup_dask +from .var_len_categorical_test import TestVarLenCategoricalFeature -# @if_dask_ready -# class TestVarLenCategoricalFeatureByDask(TestVarLenCategoricalFeature): -# -# def setup_class(self): -# TestVarLenCategoricalFeature.setup_class(self) -# -# setup_dask(self) -# self.df = dd.from_pandas(self.df, npartitions=2) +if is_dask_installed: + import dask.dataframe as dd + + +@if_dask_ready +class TestVarLenCategoricalFeatureByDask(TestVarLenCategoricalFeature): + + def setup_class(self): + TestVarLenCategoricalFeature.setup_class(self) + + setup_dask(self) + self.df = dd.from_pandas(self.df, npartitions=2)