Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for numpy 2.0.0 #2269

Merged
merged 14 commits into from
Oct 30, 2024
15 changes: 9 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ dependencies = [
'botocore>=1.31,<2.0.0',
'cloudpickle>=2.1.0',
'graphviz>=0.13.2',
"numpy>=1.21.0,<2.0.0;python_version<'3.10'",
"numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0,<2.0.0;python_version>='3.12'",
"numpy>=1.21.0;python_version<'3.10'",
"numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0;python_version>='3.12'",
"pandas>=1.4.0;python_version<'3.11'",
"pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
"pandas>=2.1.1;python_version>='3.12'",
'tqdm>=4.29',
'copulas>=0.11.0',
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'ctgan>=0.10.2',
'deepecho>=0.6.1',
'rdt>=1.12.3',
'sdmetrics>=0.16.0',
'platformdirs>=4.0',
Expand Down Expand Up @@ -207,7 +207,10 @@ select = [
# print statements
"T201",
# pandas-vet
"PD"
"PD",
# numpy 2.0
"NPY201"

]
ignore = [
# pydocstyle
Expand Down
1 change: 1 addition & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _is_datetime_type(value):
bool(_get_datetime_format([value]))
or isinstance(value, pd.Timestamp)
or isinstance(value, datetime)
or isinstance(value, pd.Period)
or (isinstance(value, str) and pd.notna(pd.to_datetime(value, errors='coerce')))
):
return False
Expand Down
8 changes: 8 additions & 0 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,14 @@ def _transform(self, table_data):
pandas.DataFrame:
Transformed data.
"""
# To make the NaN to None mapping work for pd.Categorical data, we need to convert
# the columns to object before replacing NaNs with None.
table_data[self._columns] = table_data[self._columns].astype({
col: object
for col in self._columns
if pd.api.types.is_categorical_dtype(table_data[col])
})
Comment on lines +339 to +343
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on this discussion, should we just use fillna instead of replace? Then we don't have to convert

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@R-Palazzo I forgot that fillna cannot be used with None which is probably why we have this line of code in the first place. This solution is good


table_data[self._columns] = table_data[self._columns].replace({np.nan: None})
combinations = table_data[self._columns].itertuples(index=False, name=None)
uuids = map(self._combinations_to_uuids.get, combinations)
Expand Down
23 changes: 17 additions & 6 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
)
from sdv.data_processing.datetime_formatter import DatetimeFormatter
from sdv.data_processing.errors import InvalidConstraintsError, NotFittedError
from sdv.data_processing.numerical_formatter import NumericalFormatter
from sdv.data_processing.numerical_formatter import INTEGER_BOUNDS, NumericalFormatter
from sdv.data_processing.utils import load_module_from_path
from sdv.errors import SynthesizerInputError, log_exc_stacktrace
from sdv.metadata.single_table import SingleTableMetadata

LOGGER = logging.getLogger(__name__)
INTEGER_BOUNDS = {str(key).lower(): value for key, value in INTEGER_BOUNDS.items()}


class DataProcessor:
Expand Down Expand Up @@ -561,26 +562,36 @@ def _create_config(self, data, columns_created_by_constraints):
)

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
function_name = 'bothify'
column_dtype = data[column].dtype
is_numeric = pd.api.types.is_numeric_dtype(column_dtype)
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column, sdtype, column_metadata, is_numeric
)
sdtypes[column] = 'text'

else:
bothify_format = 'sdv-id-??????'
if is_numeric:
bothify_format = '#########'
function_name = 'random_int'
column_dtype = str(column_dtype).lower()
function_kwargs = {'min': 0, 'max': 9999999}
for key in INTEGER_BOUNDS:
if key in column_dtype:
_, max_value = INTEGER_BOUNDS[key]
function_kwargs = {'min': 0, 'max': max_value}

else:
function_kwargs = {'text': 'sdv-id-??????'}

cardinality_rule = None
if column in self._keys:
cardinality_rule = 'unique'

transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': bothify_format},
function_name=function_name,
function_kwargs=function_kwargs,
cardinality_rule=cardinality_rule,
)

Expand Down
3 changes: 3 additions & 0 deletions tests/benchmark/excluded_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
('numerical', 'np.string', 'Positive'),
('numerical', 'np.string', 'Negative'),
('numerical', 'np.string', 'ScalarInequality'),
('numerical', 'np.bytes', 'Positive'),
('numerical', 'np.bytes', 'Negative'),
('numerical', 'np.bytes', 'ScalarInequality'),
('numerical', 'np.unicode', 'Positive'),
('numerical', 'np.unicode', 'Negative'),
('numerical', 'np.unicode', 'ScalarInequality'),
Expand Down
11 changes: 7 additions & 4 deletions tests/benchmark/numpy_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,17 @@
}),
'np.string': pd.DataFrame({
'np.string': pd.Series([
np.string_('string1'),
np.string_('string2'),
np.string_('string3'),
np.str_('string1'),
np.str_('string2'),
np.str_('string3'),
])
}),
'np.bytes': pd.DataFrame({
'np.bytes': pd.Series([np.bytes_('bytes1'), np.bytes_('bytes2'), np.bytes_('bytes3')])
}),
'np.unicode': pd.DataFrame({
'np.unicode': pd.Series(
[np.unicode_('unicode1'), np.unicode_('unicode2'), np.unicode_('unicode3')],
[np.str_('unicode1'), np.str_('unicode2'), np.str_('unicode3')],
dtype='string',
)
}),
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmark/supported_dtypes_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
'np.object': 'categorical',
'np.bool': 'categorical',
'np.string': 'categorical',
'np.bytes': 'categorical',
'np.unicode': 'categorical',
# PyArrow
'pa.int8': 'numerical',
Expand Down Expand Up @@ -378,7 +379,6 @@ def _create_single_column_constraint_and_data(constraint, data, dtype, sdtype):


def _create_multi_column_constraint_data_and_metadata(constraint, data, dtype, sdtype, metadata):
_dtype = data.dtypes[dtype]
constraint_class = constraint.get('constraint_class')
constraints = []
if constraint_class == 'FixedCombinations':
Expand Down
50 changes: 25 additions & 25 deletions tests/integration/single_table/test_copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,31 +347,31 @@ def test_numerical_columns_gets_pii():

# Assert
expected_sampled = pd.DataFrame({
'id': {
0: 807994768,
1: 746439230,
2: 201363792,
3: 364823003,
4: 726973888,
5: 693331380,
6: 795819284,
7: 607278621,
8: 783746695,
9: 162118876,
},
'city': {
0: 'Danielfort',
1: 'Glendaside',
2: 'Port Jenniferchester',
3: 'Port Susan',
4: 'West Michellemouth',
5: 'West Jason',
6: 'Ryanfort',
7: 'West Stephenland',
8: 'Davidland',
9: 'Port Christopher',
},
'numerical': {0: 22, 1: 24, 2: 22, 3: 23, 4: 22, 5: 24, 6: 23, 7: 24, 8: 24, 9: 24},
'id': [
1089619006166876142,
8373046707753416652,
9070705361670139280,
7227045982112645011,
3461931576753619633,
1005734164466301683,
3312031189447929384,
82456842876428117,
1819741328868365520,
8019169766233150107,
],
'city': [
'Danielfort',
'Glendaside',
'Port Jenniferchester',
'Port Susan',
'West Michellemouth',
'West Jason',
'Ryanfort',
'West Stephenland',
'Davidland',
'Port Christopher',
],
'numerical': [22, 24, 22, 23, 22, 24, 23, 24, 24, 24],
})
pd.testing.assert_frame_equal(expected_sampled, sampled)

Expand Down
22 changes: 22 additions & 0 deletions tests/unit/constraints/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,28 @@ def test_transform_non_string(self):
expected_out_a = pd.Series(['a', 'b', 'c'], name='a')
pd.testing.assert_series_equal(expected_out_a, out['a'])

def test_transform_categorical_dtype(self):
    """Ensure ``transform`` handles ``pd.Categorical`` columns containing missing values."""
    # Setup: one plain object column plus two categorical columns with None/NaN.
    data = pd.DataFrame({
        'a': ['a', 'b', 'c'],
        'b': pd.Categorical(['d', None, 'f']),
        'c': pd.Categorical(['g', 'h', np.nan]),
    })
    constraint = FixedCombinations(column_names=['b', 'c'])
    constraint.fit(data)

    # Run
    transformed = constraint.transform(data)

    # Assert: the combination column carries no NaNs and the uuid mappings were built.
    assert not transformed['b#c'].isna().any()
    assert constraint._combinations_to_uuids is not None
    assert constraint._uuids_to_combinations is not None
    pd.testing.assert_series_equal(pd.Series(['a', 'b', 'c'], name='a'), transformed['a'])

def test_transform_not_all_columns_provided(self):
"""Test the ``FixedCombinations.transform`` method.

Expand Down
39 changes: 29 additions & 10 deletions tests/unit/data_processing/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,7 +1137,9 @@ def test__create_config(self):
'first_name': ['John', 'Doe', 'Johanna'],
'id': ['ID_001', 'ID_002', 'ID_003'],
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'],
'id_numeric': [0, 1, 2],
'id_numeric_int8': pd.Series([1, 2, 3], dtype='Int8'),
'id_numeric_int16': pd.Series([1, 2, 3], dtype='Int16'),
'id_numeric_int32': pd.Series([1, 2, 3], dtype='Int32'),
'id_column': ['ID_999', 'ID_999', 'ID_007'],
'date': ['2021-02-01', '2022-03-05', '2023-01-31'],
'unknown': ['a', 'b', 'c'],
Expand All @@ -1151,9 +1153,9 @@ def test__create_config(self):
dp.create_anonymized_transformer.return_value = 'AnonymizedFaker'
dp.create_regex_generator.return_value = 'RegexGenerator'
dp.metadata.primary_key = 'id'
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric']
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric_int8']
dp._primary_key = 'id'
dp._keys = ['id', 'id_no_regex', 'id_numeric']
dp._keys = ['id', 'id_no_regex', 'id_numeric_int8']
dp.metadata.columns = {
'int': {'sdtype': 'numerical'},
'float': {'sdtype': 'numerical'},
Expand All @@ -1163,7 +1165,9 @@ def test__create_config(self):
'first_name': {'sdtype': 'first_name'},
'id': {'sdtype': 'id', 'regex_format': 'ID_\\d{3}[0-9]'},
'id_no_regex': {'sdtype': 'id'},
'id_numeric': {'sdtype': 'id'},
'id_numeric_int8': {'sdtype': 'id'},
'id_numeric_int16': {'sdtype': 'id'},
'id_numeric_int32': {'sdtype': 'id'},
'id_column': {'sdtype': 'id'},
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
'unknown': {'sdtype': 'unknown'},
Expand All @@ -1188,7 +1192,9 @@ def test__create_config(self):
'first_name': 'pii',
'id': 'text',
'id_no_regex': 'text',
'id_numeric': 'text',
'id_numeric_int8': 'text',
'id_numeric_int16': 'text',
'id_numeric_int32': 'text',
'id_column': 'text',
'date': 'datetime',
'unknown': 'pii',
Expand Down Expand Up @@ -1236,11 +1242,24 @@ def test__create_config(self):
assert id_no_regex_transformer.function_kwargs == {'text': 'sdv-id-??????'}
assert id_no_regex_transformer.cardinality_rule == 'unique'

id_numeric_transformer = config['transformers']['id_numeric']
assert isinstance(id_numeric_transformer, AnonymizedFaker)
assert id_numeric_transformer.function_name == 'bothify'
assert id_numeric_transformer.function_kwargs == {'text': '#########'}
assert id_numeric_transformer.cardinality_rule == 'unique'
id_numeric_int_8_transformer = config['transformers']['id_numeric_int8']
assert isinstance(id_numeric_int_8_transformer, AnonymizedFaker)
assert id_numeric_int_8_transformer.function_name == 'random_int'
assert id_numeric_int_8_transformer.function_kwargs == {'min': 0, 'max': 127}
assert id_numeric_int_8_transformer.cardinality_rule == 'unique'

id_numeric_int_16_transformer = config['transformers']['id_numeric_int16']
assert isinstance(id_numeric_int_16_transformer, AnonymizedFaker)
assert id_numeric_int_16_transformer.function_name == 'random_int'
assert id_numeric_int_16_transformer.function_kwargs == {'min': 0, 'max': 32767}

id_numeric_int_32_transformer = config['transformers']['id_numeric_int32']
assert isinstance(id_numeric_int_32_transformer, AnonymizedFaker)
assert id_numeric_int_32_transformer.function_name == 'random_int'
assert id_numeric_int_32_transformer.function_kwargs == {
'min': 0,
'max': 2147483647,
}

id_column_transformer = config['transformers']['id_column']
assert isinstance(id_column_transformer, AnonymizedFaker)
Expand Down
20 changes: 20 additions & 0 deletions tests/unit/test__utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,26 @@ def test__is_datetime_type_with_datetime_series():
assert is_datetime


def test__is_datetime_type_with_period():
    """``_is_datetime_type`` should report True for a pandas period series.

    Input:
        - A pandas.Series of type `period`
    Output:
        - True
    """
    # Setup: a monthly period range wrapped in a Series.
    period_series = pd.Series(pd.period_range('2023-01', periods=3, freq='M'))

    # Run and Assert
    assert _is_datetime_type(period_series)


def test__is_datetime_type_with_mixed_array():
"""Test the ``_is_datetime_type`` function with a list of mixed datetime types."""
# Setup
Expand Down
Loading