diff --git a/datar/__init__.py b/datar/__init__.py index 46e20290..33e52cbc 100644 --- a/datar/__init__.py +++ b/datar/__init__.py @@ -1,5 +1,6 @@ from .core import operator as _ from .core import f, options_context, options, add_option, get_option, logger +from .core.options import apply_init_callbacks __all__ = ( "f", @@ -11,10 +12,10 @@ "logger", ) -options(enable_pdtypes=True) - __all__ = ("f", "get_versions") -__version__ = "0.7.2" +__version__ = "0.8.0" + +apply_init_callbacks() def get_versions(prnt: bool = True): diff --git a/datar/all.py b/datar/all.py index bbbfd1e8..2460f19e 100644 --- a/datar/all.py +++ b/datar/all.py @@ -1,34 +1,34 @@ """Import all constants, verbs and functions""" + +_locs = locals() + +from . import base as _base +_base_conflict_names = _base._conflict_names +for _key in _base.__all__: + if _key not in _base_conflict_names: + _locs[_key] = getattr(_base, _key) + +from . import dplyr as _dplyr +_dplyr_conflict_names = _dplyr._conflict_names +for _key in _dplyr.__all__: + if _key not in _dplyr_conflict_names: + _locs[_key] = getattr(_dplyr, _key) + from .core.defaults import f -from .base import ( - _no_warn as _, -) # don't override from datar.all import _no_warn -from .base import _builtin_names as _base_builtin_names -from .base import * -from .base import _warn as _ from .forcats import * from .datar import * -from .dplyr import _no_warn as _ -from .dplyr import _builtin_names as _dplyr_builtin_names -from .dplyr import * -from .dplyr import _warn as _ from .tibble import * from .tidyr import * from .base import rank # overwrite dplyr.rank -_builtin_names = _base_builtin_names.copy() -_builtin_names.update(_dplyr_builtin_names) -# builtin names included -__all__ = [var_ for var_ in locals() if not var_.startswith("_")] - -for name in _builtin_names: - # let __getattr__ handles the builtins, otherwise - # from datar.all import filter - # will not warn - del locals()[name] +from .core.import_names_conflict import ( + handle_import_names_conflict as _handle_import_names_conflict, +) -from .core.warn_builtin_names import ( - warn_builtin_names as _warn_builtin_names, +__all__, _getattr = _handle_import_names_conflict( + _locs, + _base_conflict_names | _dplyr_conflict_names, ) -__getattr__ = _warn_builtin_names(**_builtin_names) +if _getattr is not None: + __getattr__ = _getattr diff --git a/datar/base/__init__.py b/datar/base/__init__.py index 397cc0b0..359d073d 100644 --- a/datar/base/__init__.py +++ b/datar/base/__init__.py @@ -67,6 +67,7 @@ rank, outer, ) +from .glimpse import glimpse from .logical import ( FALSE, TRUE, @@ -193,22 +194,13 @@ ) from .which import which, which_max, which_min +from ..core.import_names_conflict import ( + handle_import_names_conflict as _handle_import_names_conflict +) -__all__ = [name for name in locals() if not name.startswith("_")] - -_builtin_names = { - "min": min_, - "max": max_, - "sum": sum_, - "abs": abs_, - "round": round_, - "all": all_, - "any": any_, - "re": re_, -} -__all__.extend(_builtin_names) +_conflict_names = {"min", "max", "sum", "abs", "round", "all", "any", "re"} -# warn when builtin names are imported directly -from ..core.warn_builtin_names import warn_builtin_names +__all__, _getattr = _handle_import_names_conflict(locals(), _conflict_names) -__getattr__ = warn_builtin_names(**_builtin_names) +if _getattr is not None: + __getattr__ = _getattr diff --git a/datar/base/factor.py b/datar/base/factor.py index bcbe3584..c83a4d4d 100644 --- a/datar/base/factor.py +++ b/datar/base/factor.py @@ -7,6 +7,7 @@ 
 from pipda import register_func
 
 from ..core.backends.pandas import Categorical, Series
+from ..core.backends.pandas.core.groupby import SeriesGroupBy
 from ..core.backends.pandas.api.types import is_categorical_dtype, is_scalar
 from ..core.contexts import Context
@@ -71,6 +72,7 @@ def is_ordered(x) -> bool:
     return _ensure_categorical(x).ordered
 
 
+@register_func(None, context=Context.EVAL)
 def factor(x=None, levels=None, exclude=np.nan, ordered=False):
     """encode a vector as a factor (the terms ‘category’ and ‘enumerated type’
     are also used for factors).
@@ -87,6 +89,15 @@ def factor(x=None, levels=None, exclude=np.nan, ordered=False):
         ordered: logical flag to determine if the levels should be regarded
             as ordered (in the order given).
     """
+    if isinstance(x, SeriesGroupBy):
+        out = factor.__origfunc__(
+            x.obj,
+            levels=levels,
+            exclude=exclude,
+            ordered=ordered,
+        )
+        return Series(out, index=x.obj.index).groupby(x.grouper)
+
     if x is None:
         x = []
diff --git a/datar/base/glimpse.py b/datar/base/glimpse.py
new file mode 100644
index 00000000..6d37e74c
--- /dev/null
+++ b/datar/base/glimpse.py
@@ -0,0 +1,183 @@
+"""Provides glimpse"""
+import textwrap
+import html
+from functools import singledispatch
+from shutil import get_terminal_size
+
+from pipda import register_verb
+
+from ..core.tibble import TibbleGrouped, TibbleRowwise
+from ..core.backends.pandas import DataFrame
+from ..core.backends.pandas.core.groupby import SeriesGroupBy
+
+
+@singledispatch
+def formatter(x):
+    """Formatter passed to glimpse to format a single element of a dataframe."""
+    return str(x)
+
+
+@formatter.register(DataFrame)
+def _dataframe_formatter(x):
+    """Format a dataframe element."""
+    return f"<DF {x.shape[0]}x{x.shape[1]}>"
+
+
+@formatter.register(str)
+def _str_formatter(x):
+    """Format a string"""
+    return repr(x)
+
+
+def _is_notebook() -> bool:  # pragma: no cover
+    """Check if the current environment is notebook"""
+    try:
+        from IPython import get_ipython
+        shell = get_ipython().__class__.__name__
+        if shell == "ZMQInteractiveShell":
+            return True  # Jupyter notebook or qtconsole
+        elif shell == "TerminalInteractiveShell":
+            return False  # Terminal running IPython
+        else:
+            return False  # Other type (?)
+    except (ImportError, NameError):
+        return False  # Probably standard Python interpreter
+
+
+class Glimpse:
+    """Glimpse class
+
+    Args:
+        x: The data to be glimpsed
+        width: The width of the output
+        formatter: The formatter to use to format data elements
+    """
+    def __init__(self, x, width, formatter) -> None:
+        self.x = x
+        self.width = width or get_terminal_size((100, 20)).columns
+        self.formatter = formatter
+        self.colwidths = (0, 0)
+
+    def __repr__(self) -> str:
+        return f"<Glimpse: {self.x.shape[0]} x {self.x.shape[1]}>"
+
+    def __str__(self) -> str:
+        self._calculate_output_widths()
+        return "\n".join(
+            (
+                "\n".join(self._general()),
+                "\n".join(self._variables()),
+            )
+        )
+
+    def _repr_html_(self):
+        out = []
+        for gen in self._general():
+            out.append(f"<div><i>{gen}</i></div>")
+        out.append("<table>")
+        out.extend(self._variables(fmt="html"))
+        out.append("</table>")
+        return "\n".join(out)
+
+    def _general(self):
+        if isinstance(self.x, TibbleGrouped):
+            groups = ", ".join((str(name) for name in self.x.group_vars))
+            group_title = (
+                "Rowwise" if isinstance(self.x, TibbleRowwise) else "Groups"
+            )
+            return (
+                f"Rows: {self.x.shape[0]}",
+                f"Columns: {self.x.shape[1]}",
+                f"{group_title}: {groups} "
+                f"[{self.x._datar['grouped'].grouper.ngroups}]",
+            )
+
+        return (
+            f"Rows: {self.x.shape[0]}",
+            f"Columns: {self.x.shape[1]}",
+        )
+
+    def _calculate_output_widths(self):
+        colname_width = max(len(str(colname)) for colname in self.x.columns)
+        dtype_width = max(len(str(dtype)) for dtype in self.x.dtypes) + 2
+        self.colwidths = (colname_width, dtype_width)
+
+    def _variables(self, fmt="str"):
+        for col in self.x:
+            yield self._format_variable(
+                col,
+                self.x[col].dtype,
+                self.x[col].obj.values
+                if isinstance(self.x[col], SeriesGroupBy)
+                else self.x[col].values,
+                fmt=fmt,
+            )
+
+    def _format_variable(self, col, dtype, data, fmt="str"):
+        if fmt == "str":
+            return self._format_variable_str(col, dtype, data)
+
+        return self._format_variable_html(col, dtype, data)
+
+    def _format_data(self, data):
+        """Format the data for the glimpse view
+
+        Elements are formatted in batches of 10, since for a long dataframe
+        we only need to format the first few of them, until the line
+        (terminal width or the provided width) overflows.
+        """
+        out = ""
+        placeholder = "…"
+        i = 0
+        chunk_size = 10
+        while not out.endswith(placeholder) and i < data.size:
+            if out:
+                out += ", "
+            out += ", ".join(
+                self.formatter(d) for d in data[i:i + chunk_size]
+            )
+            i += chunk_size
+            out = textwrap.shorten(
+                out,
+                break_long_words=True,
+                break_on_hyphens=True,
+                width=self.width - 4 - sum(self.colwidths),
+                placeholder=placeholder,
+            )
+        return out
+
+    def _format_variable_str(self, col, dtype, data):
+        name_col = col.ljust(self.colwidths[0])
+        dtype_col = f'<{dtype}>'.ljust(self.colwidths[1])
+        data_col = self._format_data(data)
+        return f". {name_col} {dtype_col} {data_col}"
+
+    def _format_variable_html(self, col, dtype, data):
+        name_col = f". {col}"
+        dtype_col = f"<{dtype}>"
+        data_col = html.escape(self._format_data(data))
+        return (
+            f"<tr><td>{name_col}</td>"
+            f"<td>{dtype_col}</td>"
+            f"<td>{data_col}</td></tr>"
+        )
+
+    def show(self):
+        """Show the glimpse view"""
+        if _is_notebook():  # pragma: no cover
+            from IPython.display import display, HTML
+            display(HTML(self._repr_html_()))
+        else:
+            print(self.__str__())
+
+
+@register_verb(DataFrame)
+def glimpse(x, width=None, formatter=formatter):
+    """Get a glimpse of your data
+
+    Args:
+        x: An object to glimpse at.
+        width: Width of output, defaults to the width of the console.
+        formatter: A single-dispatch function to format a single element.
+ """ + Glimpse(x, width=width, formatter=formatter).show() diff --git a/datar/core/factory.py b/datar/core/factory.py index 8518460d..6c2b5077 100644 --- a/datar/core/factory.py +++ b/datar/core/factory.py @@ -359,6 +359,10 @@ def _pipda_func(__x, *args, **kwargs): _pipda_func.__name__ = funcname _pipda_func.__qualname__ = qualname _pipda_func.__doc__ = doc or func.__doc__ + try: + _pipda_func.__module__ = func.__module__ + except AttributeError: + pass _pipda_func.dispatched = dispatched _pipda_func.register = _register_factory(dispatched, func) _pipda_func.__raw__ = func diff --git a/datar/core/import_names_conflict.py b/datar/core/import_names_conflict.py new file mode 100644 index 00000000..2a0347cc --- /dev/null +++ b/datar/core/import_names_conflict.py @@ -0,0 +1,65 @@ +"""Provides handle_import_names_conflict""" +from .options import get_option + +WARNED = set() + + +def handle_import_names_conflict(imports, conflict_names): + """Handle the import names conflict. + + Args: + imports: The `locals()` of the importing module. + conflict_names: The names that conflict with builtin names. + There are always underscore-suffixed names existing in imports + For example, `sum_` for `sum` + When `import_names_conflict` is `'underscore_suffixed'`, we are + always using `sum_` for `sum`. Directly importing `sum` will raise + `ImportError` + When `import_names_conflict` is `'warn'`, a warning will be shown + when names from `conflict_names` are imported + When `import_names_conflict` is `'silent'`, do nothing when names + from `conflict_names` are imported + + Returns: + All names for the module, defining what will be imported when doing + `from module import *`. + Getattr function for the module, that is used to transform + `sum` to `sum_`. + """ + _import_names_conflict = get_option("import_names_conflict") + if _import_names_conflict == "underscore_suffixed": + return [name for name in imports if not name.startswith("_")], None + + import sys + from executing import Source + from .utils import logger + + def _getattr(name: str): + # Using get_option("import_names_conflict") to get the value + # instead of `import_names_conflict` + # OPTIONS changed in lifetime + opt_maybe_changed = get_option("import_names_conflict") + if ( + name == "__path__" + or name not in conflict_names + or opt_maybe_changed == "underscore_suffixed" + ): + raise AttributeError + + # from ... import xxx + if ( + name not in WARNED + and opt_maybe_changed == "warn" + and not Source.executing(sys._getframe(1)).node + ): + WARNED.add(name) + logger.warning( + 'Builtin name "%s" has been masked by datar.', + name, + ) + + return imports[f"{name}_"] + + _all = [name for name in imports if not name.startswith("_")] + _all.extend(conflict_names) + return _all, _getattr diff --git a/datar/core/options.py b/datar/core/options.py index ccf90461..a53d0b03 100644 --- a/datar/core/options.py +++ b/datar/core/options.py @@ -1,39 +1,19 @@ """Provide options""" +from os import access, R_OK from typing import Any, Generator, Mapping, Union, Callable from contextlib import contextmanager +from pathlib import Path +import toml from diot import Diot from pipda import options as pipda_options - _key_transform = lambda key: key.replace("_", ".") _dict_transform_back = lambda dic: { key.replace(".", "_"): val for key, val in dic.items() } -OPTIONS = Diot( - # Whether use 0-based numbers when index is involved, acts similar like R - dplyr_summarise_inform=True, - # whether warn about importing functions that override builtin ones. 
- warn_builtin_names=True, - # - enable_pdtypes=False, - # add_option=True, - # allow 'a.b' to access 'a_b' - diot_transform=_key_transform, - # Warn about failure to get ast node - warn_astnode_failure=True, - # All piping mode: - # - Assume all verbs are using PIPING_VERB env - # - Assume all data functions are using PIPING env - # - Assume all non-data functions are using PIPING verbs - # This is useful when source code is not available. - assume_all_piping=False, - # The backend for datar - backend="pandas", # or "modin" -) - def enable_pdtypes_callback(enable: bool) -> None: # pragma: no cover from .utils import logger @@ -64,6 +44,54 @@ def enable_pdtypes_callback(enable: bool) -> None: # pragma: no cover ) +def _read_options(path: Path) -> dict: + """Read options from a file""" + if not path.is_file() or not access(path, R_OK): + return {} + + with path.open("r") as fopt: + return toml.load(fopt) + + +OPTION_FILE_HOME = Path("~/.datar.toml").expanduser() +OPTION_FILE_CWD = Path("./.datar.toml").resolve() + + +OPTIONS = Diot( + # Whether use 0-based numbers when index is involved, acts similar like R + dplyr_summarise_inform=True, + # What to do when there are conflicts importing names + # - `warn`: show warnings + # - `silent`: ignore the conflicts + # - `underscore_suffixed`: add suffix `_` to the conflicting names + # (and don't do any warnings) + import_names_conflict="warn", + # Enable pdtypes + enable_pdtypes=True, + # add_option=True, + # allow 'a.b' to access 'a_b' + diot_transform=_key_transform, + # Warn about failure to get ast node + warn_astnode_failure=True, + # All piping mode: + # - Assume all verbs are using PIPING_VERB env + # - Assume all data functions are using PIPING env + # - Assume all non-data functions are using PIPING verbs + # This is useful when source code is not available. + assume_all_piping=False, + # The backend for datar + backend="pandas", # or "modin" +) +OPTIONS.update(_read_options(OPTION_FILE_HOME)) +OPTIONS.update(_read_options(OPTION_FILE_CWD)) + + +def apply_init_callbacks(): + """Apply the callbacks when options are initialized""" + for key in OPTION_CALLBACKS: + OPTION_CALLBACKS[key](OPTIONS[key]) + + def options( *args: Union[str, Mapping[str, Any]], _return: bool = None, diff --git a/datar/core/warn_builtin_names.py b/datar/core/warn_builtin_names.py deleted file mode 100644 index c4e2d628..00000000 --- a/datar/core/warn_builtin_names.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Let datar warn when builtin names are tried to be imported""" -import sys -from typing import Callable, Any -from executing import Source - -WARNED = set() - - -def warn_builtin_names(**names: Callable) -> Callable[[str], Any]: - """Generate __getattr__ function to warn the builtin names""" - from .utils import logger - from .options import get_option - - # Enables tempoarory warn on or off - warn = True - - def _getattr(name: str): - """A route to let us check if the function is imported by - >>> from module import func - But not - >>> import module - >>> module.func - - If func is a python built-in function, then it gets overriden by datar. - - The side effects: - 1. This executes every time when `module.func` is called - 2. When the source is not avaiable for `module.func`, it will be a - false alarm treat this as `from module import func` - 3. 
It warns even you do `from module import func as alias`
-
-        Instead, you can do `from module import func_` to access function `func`
-
-        Args:
-            **names: The name-module pairs
-
-        Returns:
-            A function that can be used as `__getattr__` for a module
-        """
-        nonlocal warn
-        if name == "_warn":
-            warn = True
-            return None
-        if name == "_no_warn":
-            warn = False
-            return None
-
-        if name == "__path__" or name not in names:
-            raise AttributeError
-
-        if (
-            warn
-            and name not in WARNED
-            and get_option("warn.builtin.names", True)
-        ):
-            node = Source.executing(sys._getframe(1)).node
-            if not node:
-                WARNED.add(name)
-                logger.warning(
-                    'Builtin name "%s" has been overriden by datar.', name
-                )
-        return names[name]
-
-    return _getattr
diff --git a/datar/dplyr/__init__.py b/datar/dplyr/__init__.py
index f6c5ec5c..48503e05 100644
--- a/datar/dplyr/__init__.py
+++ b/datar/dplyr/__init__.py
@@ -93,17 +93,13 @@
     where,
 )
 
-# make sure builtin names are included when
-# from datar.dplyr import *
-_builtin_names = {
-    "filter": filter_,
-    "slice": slice_,
-}
+from ..core.import_names_conflict import (
+    handle_import_names_conflict as _handle_import_names_conflict
+)
 
-__all__ = [var_ for var_ in locals() if not var_.startswith("_")]
-__all__.extend(_builtin_names)
+_conflict_names = {"filter", "slice"}
 
-# warn when builtin names are imported directly
-from ..core.warn_builtin_names import warn_builtin_names
+__all__, _getattr = _handle_import_names_conflict(locals(), _conflict_names)
 
-__getattr__ = warn_builtin_names(**_builtin_names)
+if _getattr is not None:
+    __getattr__ = _getattr
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 7e194e61..460a5dbf 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.8.0
+
+- ✨ Support `base.glimpse()` (#107, machow/siuba#409)
+- 🐛 Register `base.factor()` and accept grouped data (#108)
+- ✨ Allow configuration file to save default options
+- 💥 Replace option `warn_builtin_names` with `import_names_conflict` (#73)
+- 🩹 Attach original `__module__` to `func_factory` registered functions
+- ⬆️ Bump `pipda` to `0.5.9`
+
 ## 0.7.2
 
 - ✨ Allow tidyr.unite() to unite multiple columns into a list, instead of join them (#105)
diff --git a/docs/import.md b/docs/import.md
index f2d73409..4cbe193f 100644
--- a/docs/import.md
+++ b/docs/import.md
@@ -48,25 +48,25 @@ You don't have to worry about other datasets to be imported and take up the memo
 
 See also [datasets](../datasets) for details about available datasets.
 
-## Warn about python reserved names to be overriden by `datar`
+## Warn about python reserved names to be masked by `datar`
 
-Sometimes it will be confusing especially when python builtin functions are overriden by `datar`. There are a couple of datar (`r-base`, `dplyr`) functions with the same name as python builtin functions do. For example: `filter`.
+Sometimes it can be confusing, especially when python builtin functions are masked by `datar`. There are a couple of datar (`r-base`, `dplyr`) functions with the same names as python builtin functions. For example: `filter`.
 
 To make you aware of the loss of access to builtin functions or other python preserved names, warnings will be reported when those names are imported directly:
 
 ```python
 >>> from datar.dplyr import filter
-[2021-06-18 17:58:23][datar][WARNING] Builtin name "filter" has been overriden by datar.
+[2021-06-18 17:58:23][datar][WARNING] Builtin name "filter" has been masked by datar.
>>> from datar.all import * -2021-06-18 17:58:48][datar][WARNING] Builtin name "min" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "max" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "sum" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "abs" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "round" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "all" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "any" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "re" has been overriden by datar. -[2021-06-18 17:58:48][datar][WARNING] Builtin name "slice" has been overriden by datar. +2021-06-18 17:58:48][datar][WARNING] Builtin name "min" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "max" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "sum" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "abs" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "round" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "all" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "any" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "re" has been masked by datar. +[2021-06-18 17:58:48][datar][WARNING] Builtin name "slice" has been masked by datar. ``` However, when they are not imported directly, no warnings will show: @@ -76,64 +76,66 @@ from datar import dplyr dplyr.filter # ok ``` -There are a couple of ways to disable: +There are a couple of other options for this behavior: -1. Use option: `warn.builtin.names` +1. Use option: `import_names_conflict` or `import.names.conflict` -```python ->>> from datar import options ->>> options(warn_builtin_names=False) ->>> from datar.all import * # ok ->>> options(warn_builtin_names=True) ->>> from datar.all import * -[2021-06-18 18:02:35][datar][WARNING] Builtin name "min" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "max" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "sum" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "abs" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "round" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "all" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "any" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "re" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "filter" has been overriden by datar. -[2021-06-18 18:02:35][datar][WARNING] Builtin name "slice" has been overriden by datar. -``` + ```python + >>> from datar import options + >>> options(import_names_conflict="silent") + >>> from datar.all import * # ok + >>> options(import_names_conflict="warn") + >>> from datar.all import * + [2021-06-18 18:02:35][datar][WARNING] Builtin name "min" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "max" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "sum" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "abs" has been masked by datar. 
+ [2021-06-18 18:02:35][datar][WARNING] Builtin name "round" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "all" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "any" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "re" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "filter" has been masked by datar. + [2021-06-18 18:02:35][datar][WARNING] Builtin name "slice" has been masked by datar. + ``` -2. Import `_no_warn` + If you don't want to use the conflict names at all: + ```python + >>> from datar import options + >>> options(import_names_conflict="underscore_suffixed") + >>> from datar.all import * + >>> filter + >>> # # builtin filter + >>> filter_ + >>> # + ``` -```python ->>> from datar.dplyr import _no_warn ->>> from datar.dplyr import * # ok ->>> from datar.dplyr import _warn ->>> from datar.dplyr import * # warn again -[2021-06-18 18:03:54][datar][WARNING] Builtin name "filter" has been overriden by datar. -[2021-06-18 18:03:54][datar][WARNING] Builtin name "slice" has been overriden by datar. -``` + You can also change the default options and save them in the configuration file if you don't want to change the options every time you use `datar`. See [options](../options/#configuration-files) for details. -3. Let the `logger` hide the message +2. Let the `logger` hide the message -```python ->>> from datar.core import logger ->>> logger.setLevel(40) ->>> from datar.dplyr import * # ok -``` + ```python + >>> from datar.core import logger + >>> logger.setLevel(40) + >>> from datar.dplyr import * # ok + ``` -4. Use aliases instead +3. Use aliases instead -```python ->>> from datar.all import filter -[2021-06-24 10:06:19][datar][WARNING] Builtin name "filter" has been overriden by datar. -``` + ```python + >>> from datar.all import filter + [2021-06-24 10:06:19][datar][WARNING] Builtin name "filter" has been masked by datar. 
+   ```
 
-But this is Okay:
-```python
->>> from datar.all import filter_
->>> filter_
-<function filter_ at 0x...>
-```
+   But this is Okay:
+   ```python
+   >>> from datar.all import filter_
+   >>> filter_
+   <function filter_ at 0x...>
+   ```
 
-Or you could even aliase it to `filter` by yourself:
-```python
->>> from datar.all import filter_ as filter # no warnings
->>> filter
-<function filter_ at 0x...>
-```
+   Or you could even alias it to `filter` yourself:
+   ```python
+   >>> from datar.all import filter_ as filter # no warnings
+   >>> filter
+   <function filter_ at 0x...>
+   ```
diff --git a/docs/notebooks/nb_helpers.py b/docs/notebooks/nb_helpers.py
index 0963e6cf..2ead087a 100644
--- a/docs/notebooks/nb_helpers.py
+++ b/docs/notebooks/nb_helpers.py
@@ -7,7 +7,7 @@
 from varname.helpers import debug  # noqa
 
 from datar import options
 
-options(warn_builtin_names=False)
+options(import_names_conflict="silent")
 
 InteractiveShell.ast_node_interactivity = "all"
diff --git a/docs/notebooks/nest.ipynb b/docs/notebooks/nest.ipynb
index 4348be36..e6e778fb 100644
--- a/docs/notebooks/nest.ipynb
+++ b/docs/notebooks/nest.ipynb
@@ -377,7 +377,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.142333Z",
@@ -831,7 +831,7 @@
       "49 5.0 3.3 1.4 0.2"
      ]
     },
-     "execution_count": 6,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -843,7 +843,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.186712Z",
@@ -911,7 +911,7 @@
       "2 virginica "
      ]
     },
-     "execution_count": 7,
+     "execution_count": 6,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -923,7 +923,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.207533Z",
@@ -996,7 +996,7 @@
       "2 virginica "
      ]
     },
-     "execution_count": 8,
+     "execution_count": 7,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1007,7 +1007,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.265853Z",
@@ -1080,7 +1080,7 @@
       "2 virginica "
      ]
     },
-     "execution_count": 9,
+     "execution_count": 8,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1091,7 +1091,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.333173Z",
@@ -1257,7 +1257,7 @@
       "[TibbleGrouped: fish (n=19)]"
      ]
     },
-     "execution_count": 10,
+     "execution_count": 9,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1268,7 +1268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.401063Z",
@@ -1343,7 +1343,7 @@
       "[TibbleGrouped: cyl (n=3)]"
      ]
     },
-     "execution_count": 11,
+     "execution_count": 10,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1363,7 +1363,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.451681Z",
@@ -1443,7 +1443,7 @@
       "3 3 3 2"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 11,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1462,7 +1462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.535876Z",
@@ -1549,7 +1549,7 @@
       "4 3 3.0 2.0"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 12,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1560,7 +1560,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.550880Z",
@@ -1633,7 +1633,7 @@
       "2 c 3 22"
      ]
     },
-     "execution_count": 14,
+     "execution_count": 13,
    "metadata": {},
    "output_type": "execute_result"
    }
@@ -1649,7 +1649,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2021-07-16T22:28:27.614822Z",
@@ -1736,7 +1736,7 @@
       "4 c 3 22"
      ]
     },
-     "execution_count": 15,
+     "execution_count": 14,
    "metadata": {},
    "output_type": "execute_result"
    }
diff --git a/docs/options.md b/docs/options.md
new file mode 100644
index 00000000..60438d82
--- /dev/null
+++ b/docs/options.md
@@ -0,0 +1,62 @@
+Options are used to change some behaviors in `datar`.
+
+## Available options
+
+### dplyr_summarise_inform
+
+Default: `True`
+
+With `dplyr.summarise()`, when `_groups` is not specified, a message is printed to inform the choice (`drop_last` or `keep`), based on the number of rows in the results.
+
+See [https://dplyr.tidyverse.org/reference/summarise.html](https://dplyr.tidyverse.org/reference/summarise.html)
+
+### import_names_conflict
+
+What to do when imported names conflict with python builtin names:
+
+- `warn` (default): show warnings
+- `silent`: ignore the conflicts
+- `underscore_suffixed`: add suffix `_` to the conflicting names
+  (and don't do any warnings)
+
+See also [Import datar / Warn about python reserved names](../import/#warn-about-python-reserved-names-to-be-masked-by-datar)
+
+### enable_pdtypes
+
+Default: `True`
+
+Whether to enable `pdtypes`, a package that shows data types right beneath the column names when a data frame is printed as a string, rendered as HTML, or displayed in a Jupyter notebook. See:
+
+[https://github.com/pwwang/pdtypes](https://github.com/pwwang/pdtypes)
+
+### warn_astnode_failure
+
+Default: `True`
+
+Whether to warn about failures to fetch the AST node; this updates `pipda.options.warn_astnode_failure`.
+
+### assume_all_piping
+
+Default: `False`
+
+When `True`, enables all-piping mode:
+
+- Assume all verbs are using `PIPING_VERB` env
+- Assume all data functions are using `PIPING` env
+- Assume all non-data functions are using `PIPING` verbs
+
+This is useful when source code is not available.
+
+### backend
+
+The backend for datar: `pandas` (default) or `modin`.
+
+
+## Configuration files
+
+You can change the default behavior of datar with a `.datar.toml` configuration file in your home directory. For example, to always use underscore-suffixed names for conflicting names, add the following to your `~/.datar.toml` file:
+
+```toml
+import_names_conflict = "underscore_suffixed"
+```
+
+You can also have a project/directory-based configuration file (`./.datar.toml`) in your current working directory, which has higher priority than the home-directory configuration file.
diff --git a/docs/reference-maps/base.md b/docs/reference-maps/base.md index 6ebf5f8b..56e4f5d9 100644 --- a/docs/reference-maps/base.md +++ b/docs/reference-maps/base.md @@ -285,6 +285,7 @@ See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for API |API|Description|Notebook example| |---|---|---:| +|[`glimpse()`][166]|Get a glimpse of your data|| |[`cut()`][113]|Convert Numeric to Factor|[:material-notebook:][163]| |[`diff()`][164]|Returns suitably lagged and iterated differences.|[:material-notebook:][163]| |[`identity()`][114]|Identity Function|[:material-notebook:][163]| @@ -464,3 +465,4 @@ See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for API [163]: ../../notebooks/base-funs [164]: ../../api/datar.base.funs/#datar.base.funs.diff [165]: ../../api/datar.base.funs/#datar.base.funs.outer +[166]: ../../api/datar.base.glimpse/#datar.base.glimpse.glimpse diff --git a/mkdocs.yml b/mkdocs.yml index bd85db22..2264086d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,6 +60,7 @@ nav: - 'Porting rules': 'porting_rules.md' - 'Import datar': 'import.md' - 'Backends': 'backends.md' + - 'Options': 'options.md' - 'The f-expression': 'f.md' - 'Caveats': 'Indexing/Selection': 'caveats/indexing.md' diff --git a/poetry.lock b/poetry.lock index 3af8fd3b..810dafd4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -188,7 +188,7 @@ pandas = ">=1.2,<2.0" [[package]] name = "pipda" -version = "0.5.8" +version = "0.5.9" description = "A framework for data piping in python" category = "main" optional = false @@ -236,14 +236,14 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "pyparsing" -version = "3.0.7" -description = "Python parsing module" +version = "3.0.8" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.6.8" [package.extras] -diagrams = ["jinja2", "railroad-diagrams"] +diagrams = ["railroad-diagrams", "jinja2"] [[package]] name = "pytest" @@ -419,15 +419,15 @@ python-versions = "*" [[package]] name = "zipp" -version = "3.7.0" +version = "3.8.0" description = "Backport of pathlib-compatible object wrapper for zip files" category = "dev" optional = false python-versions = ">=3.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] [extras] modin = ["modin"] @@ -590,8 +590,8 @@ pdtypes = [ {file = "pdtypes-0.0.4.tar.gz", hash = "sha256:4f76fac05a785d558b809aa59d2ec13ee22e105267f2bde025007242fe8f4983"}, ] pipda = [ - {file = "pipda-0.5.8-py3-none-any.whl", hash = "sha256:7c0cfc4e10dade109f9bfab2c8ae2c7c005c21353c24eca6a769573edc2822a2"}, - {file = "pipda-0.5.8.tar.gz", hash = "sha256:16faf22f860b9017b8a801343e426c7dcbbe7e3aa0962d2e0d29b84b85dff455"}, + {file = "pipda-0.5.9-py3-none-any.whl", hash = "sha256:fa8dfb2926430704aa6d9e198a7514ffa2e72435c3f05bb52e7f4ef13ac374a3"}, + {file = "pipda-0.5.9.tar.gz", hash = "sha256:0127bd2351bb2b565259106b0bbb90d32f64f9703c8356873c9c946531661572"}, ] pluggy = [ {file = 
"pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, @@ -606,8 +606,8 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyparsing = [ - {file = "pyparsing-3.0.7-py3-none-any.whl", hash = "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"}, - {file = "pyparsing-3.0.7.tar.gz", hash = "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea"}, + {file = "pyparsing-3.0.8-py3-none-any.whl", hash = "sha256:ef7b523f6356f763771559412c0d7134753f037822dad1b16945b7b846f7ad06"}, + {file = "pyparsing-3.0.8.tar.gz", hash = "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954"}, ] pytest = [ {file = "pytest-7.1.1-py3-none-any.whl", hash = "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea"}, @@ -687,6 +687,6 @@ wcwidth = [ {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, ] zipp = [ - {file = "zipp-3.7.0-py3-none-any.whl", hash = "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"}, - {file = "zipp-3.7.0.tar.gz", hash = "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d"}, + {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"}, + {file = "zipp-3.8.0.tar.gz", hash = "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"}, ] diff --git a/pyproject.toml b/pyproject.toml index f5552ea1..a7d0502d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "datar" -version = "0.7.2" +version = "0.8.0" description = "Port of dplyr and other related R packages in python, using pipda." 
authors = ["pwwang "] readme = "README.md" @@ -10,7 +10,7 @@ repository = "https://github.com/pwwang/datar" [tool.poetry.dependencies] python = "^3.7.1" # align with pandas -pipda = "^0.5.7" +pipda = "^0.5.9" diot = "^0.1.1" # pipda already requires the following # executing = "*" diff --git a/tests/base/test_factor.py b/tests/base/test_factor.py index f5f4723e..5c455a58 100644 --- a/tests/base/test_factor.py +++ b/tests/base/test_factor.py @@ -2,6 +2,7 @@ import numpy as np from datar.core.backends.pandas import Series +from datar.core.backends.pandas.core.groupby import SeriesGroupBy from datar.base.factor import ( droplevels, factor, @@ -13,7 +14,7 @@ ordered, # is_categorical, ) -from ..conftest import assert_iterable_equal +from ..conftest import assert_factor_equal, assert_iterable_equal def test_droplevels(): @@ -45,6 +46,13 @@ def test_factor(): assert_iterable_equal(levels(out), [2, 3]) +def test_factor_sgb(): + x = Series([1, 2, 3]).groupby([1, 1, 3]) + out = factor(x) + assert isinstance(out, SeriesGroupBy) + assert_factor_equal(out.obj.values, factor([1, 2, 3])) + + def test_as_facotr(): out = as_factor([1, 2, 3]) assert_iterable_equal(out, [1, 2, 3]) diff --git a/tests/base/test_glimpse.py b/tests/base/test_glimpse.py new file mode 100644 index 00000000..14339a0d --- /dev/null +++ b/tests/base/test_glimpse.py @@ -0,0 +1,43 @@ +import pytest + +from datar.base.glimpse import Glimpse, formatter +from datar.all import ( + f, + group_by, + glimpse, + tibble, + nest, +) + + +def test_glimpse_str_df(capsys): + df = tibble(x=f[:10], y=[str(i) for i in range(10)]) + glimpse(df) + out = capsys.readouterr().out + assert "Rows: 10" in out + assert "Columns: 2" in out + assert "0, 1, 2" in out + + +def test_glimpse_str_nest_df(capsys): + df = tibble(x=f[:10], y=f[10:20]) >> nest(data=~f.x) + glimpse(df) + out = capsys.readouterr().out + assert "Rows: 10" in out + assert "Columns: 2" in out + assert ", " in out + + +def test_glimpse_str_gf(capsys): + df = tibble(x=f[:10], y=[str(i) for i in range(10)]) >> group_by(f.y) + glimpse(df) + assert "Groups: y [10]" in capsys.readouterr().out + + +def test_glimpse_html_df(): + df = tibble(x=f[:20], y=[str(i) for i in range(20)]) + g = Glimpse(df, 100, formatter) + assert repr(g).startswith("" in out diff --git a/tests/conftest.py b/tests/conftest.py index 98d2a267..024fd74a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,13 +2,15 @@ import pytest +from datar.core import import_names_conflict + @pytest.fixture(scope="function", autouse=True) def no_astnode_warn(): warnings.filterwarnings( - action='ignore', + action="ignore", category=UserWarning, - message=r'Failed to fetch the node.+', + message=r"Failed to fetch the node.+", ) @@ -19,10 +21,12 @@ def pytest_addoption(parser): def pytest_sessionstart(session): backend = session.config.getoption("backend") from datar import options + options(backend=backend) from datar.base import set_seed - options(warn_astnode_failure=False, warn_builtin_names=False) + + options(warn_astnode_failure=False, import_names_conflict="silent") set_seed(8888) @@ -31,6 +35,7 @@ def pytest_sessionstart(session): def assert_iterable_equal(x, y, na=SENTINEL, approx=False): import pandas as pd + x = [na if pd.isnull(elt) else elt for elt in x] y = [na if pd.isnull(elt) else elt for elt in y] if approx is True: diff --git a/tests/core/test_import_name_conflict.py b/tests/core/test_import_name_conflict.py new file mode 100644 index 00000000..fb268e8c --- /dev/null +++ b/tests/core/test_import_name_conflict.py @@ 
-0,0 +1,101 @@ +import pytest +import os +import inspect +import builtins +import importlib +from pathlib import Path +from contextlib import contextmanager + +import toml +from datar.core import options + + +@pytest.fixture(autouse=True) +def clear_warns(): + from datar.core import import_names_conflict + + oldopt = options("warn.builtin.names") + options(import_names_conflict="warn") + import_names_conflict.WARNED.clear() + yield + options(import_names_conflict=oldopt) + + +def write_options(optfile, **opts): + with optfile.open("w") as f: + toml.dump(opts, f) + + +@contextmanager +def reload_dplyr(tmpdir, **opts): + oldcwd = Path.cwd() + newcwd = tmpdir / "cwd" + newcwd.mkdir() + os.chdir(newcwd) + + configfile = newcwd / ".datar.toml" + write_options(configfile, **opts) + opt_module = inspect.getmodule(options) + from datar import dplyr + # from datar.core import import_names_conflict + + importlib.reload(opt_module) + # importlib.reload(import_names_conflict) + importlib.reload(dplyr) + try: + yield + finally: + os.chdir(oldcwd) + importlib.reload(opt_module) + importlib.reload(dplyr) + + +def test_nonexist_names_donot_exist(): + with pytest.raises(ImportError): + from datar.dplyr import x + + with pytest.raises(ImportError): + from datar.all import x + + +def test_trailing_underscore_bypass_warn(caplog): + from datar.dplyr import filter_ + + assert callable(filter_) + assert filter_ is not builtins.filter + assert caplog.text == "" + + +def test_direct_import_gets_warning(caplog): + from datar.dplyr import filter + + assert '"filter" has been masked' in caplog.text + caplog.clear() + + # second import won't warn + from datar.dplyr import filter + + assert caplog.text == "" + + +def test_direct_import_from_all_warns_once(caplog): + from datar.all import slice + + assert caplog.text.count('"slice"') == 1 + + +def test_config_file_controls_silent(caplog, tmp_path): + + with reload_dplyr(tmp_path, import_names_conflict="silent"): + from datar.dplyr import slice + + assert caplog.text == "" + + +def test_config_file_controls_underscore_suffixed(tmp_path): + with reload_dplyr(tmp_path, import_names_conflict="underscore_suffixed"): + + # from datar.dplyr import slice_ + + with pytest.raises(ImportError): + from datar.dplyr import slice diff --git a/tests/core/test_options.py b/tests/core/test_options.py index 983c3d10..c9f4029a 100644 --- a/tests/core/test_options.py +++ b/tests/core/test_options.py @@ -4,7 +4,6 @@ options_context, add_option, get_option, - OPTIONS, ) @@ -16,6 +15,7 @@ def reset_options(): def test_options_empty_args_returns_full_options(): + from datar.core.options import OPTIONS out = options() assert out == OPTIONS diff --git a/tests/core/test_warn_builtin_names.py b/tests/core/test_warn_builtin_names.py deleted file mode 100644 index 6dfaec8e..00000000 --- a/tests/core/test_warn_builtin_names.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest -import builtins -from datar.core.options import options - -@pytest.fixture(autouse=True) -def clear_warns(): - from datar.core import warn_builtin_names - oldopt = options('warn.builtin.names') - options(warn_builtin_names=True) - warn_builtin_names.WARNED.clear() - yield - options(warn_builtin_names=oldopt) - -def test_nonexist_names_donot_exist(): - with pytest.raises(ImportError): - from datar.dplyr import x - - with pytest.raises(ImportError): - from datar.all import x - -def test_trailing_underscore_bypass_warn(caplog): - from datar.dplyr import filter_ - assert callable(filter_) - assert filter_ is not builtins.filter - assert 
caplog.text == '' - -def test_direct_import_gets_warning(caplog): - from datar.dplyr import filter - assert '"filter" has been overriden' in caplog.text - caplog.clear() - - # second import won't warn - from datar.dplyr import filter - assert caplog.text == '' - -def test_direct_import_from_all_warns_once(caplog): - from datar.all import slice - assert caplog.text.count('"slice"') == 1