diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2513008f..e8d34970 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index e075e17d..d19f5519 100644 --- a/.gitignore +++ b/.gitignore @@ -108,6 +108,8 @@ poetry.lock docs/index.md docs/logo.png +docs/example.png +docs/example2.png docs/api/ # vscode's local history extension diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f0f2cedd..3d8e3c6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,4 +42,11 @@ repos: language: system args: [tests/] pass_filenames: false - files: ^tests/.+$|^pipda/.+$ + files: ^tests/.+$|^datar/.+$ + - id: mkdocs + name: Compile docs + entry: mkdocs + language: system + args: [build] + pass_filenames: false + files: ^datar/.+$ diff --git a/README.md b/README.md index 05113e97..482fb6b2 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Port of R data packages ([tidyr][1], [dplyr][2], [tibble][4], etc) in python, us -[API][5] +[Documentation][5] ## Installation @@ -81,128 +81,16 @@ df = tibble(x=numpy.linspace(0, 2*pi, 500)) # for example: klib import klib from pipda import register_verb +from datar.core.contexts import Context from datar.datasets import iris from datar.dplyr import pull -dist_plot = register_verb(func=klib.dist_plot) +dist_plot = register_verb(context=Context.EVAL)(klib.dist_plot) iris >> pull(f.Sepal_Length) >> dist_plot() ``` ![example](./example2.png) -## Examples - -### dplyr - One table verbs - -- [x] [`arrange()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Farrange.ipynb): Arrange rows by column values -- [x] [`count()`, `tally()`, `add_count()`, `add_tally()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fcount.ipynb): Count observations by group -- [x] [`distinct()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fdistinct.ipynb): Subset distinct/unique rows -- [x] [`filter()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Ffilter.ipynb): Subset rows using column values -- [x] [`mutate()`, `transmute()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fmutate.ipynb): Create, modify, and delete columns -- [x] [`pull()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fpull.ipynb): Extract a single column -- [x] [`relocate()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Frelocate.ipynb): Change column order -- [x] [`rename()`, `rename_with()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Frename.ipynb): Rename columns -- [x] [`select()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fselect.ipynb): Subset columns using their names and types -- [x] [`summarise()`, `summarize()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fsummarise.ipynb): Summarise each group to fewer rows -- [x] [`slice()`, `slice_head()`, `slice_tail()`, `slice_min()`, `slice_max()`, `slice_sample()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fslice.ipynb): Subset rows using their positions (TODO: implement groupby-aware `slice`) - -### dplyr - Two table verbs -- [x] [`bind_rows()`, `bind_cols()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fbind.ipynb): Efficiently bind multiple data frames by row and column
-- [x] [`inner_join()`, `left_join()`, `right_join()`, `full_join()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fmutate-joins.ipynb): Mutating joins -- [x] [`nest_join()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fnest_join.ipynb): Nest join (TODO: check API consistency) -- [x] [`semi_join()`, `anti_join()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Ffilter-joins.ipynb): Filtering joins (TODO: warning when `by` is not specified) - -### dplyr - Grouping -- [x] [`group_by()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fgroup_by) [`ungroup()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fgroup_by): Group by one or more variables -- [x] `group_cols()`: Select grouping variables -- [x] [`rowwise()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Frowwise): Group input by rows - -### dplyr - Vector functions -- [x] [`across()`, `c_across()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Facross.ipynb): Apply a function (or a set of functions) to a set of columns -- [x] `between()`: No need, use `a < x < b` in python instead -- [x] [`case_when()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fcase_when.ipynb): A general vectorised if -- [x] [`coalesce()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fcoalesce.ipynb): Find first non-missing element -- [x] [`cumall()`, `cumany()`, `cummean()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fcumall.ipynb): Cumulative versions of any, all, and mean -- [x] [`desc()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fdesc.ipynb): Descending order -- [x] [`if_else()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fif_else.ipynb): Vectorised if -- [x] [`lag()`, `lead()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Flead-lag.ipynb): Compute lagged or leading values -- [x] `order_by()`: A helper function for ordering window function output (will not be implemented). The behavior is implemented in the functions themselves (i.e., `lead` and `lag`). -- [x] [`n()`, `cur_data()`, `cur_data_all()`, `cur_group()`, `cur_group_id()`, `cur_group_rows()`, `cur_column()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fcontext.ipynb): Context dependent expressions -- [x] [`n_distinct()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fn_distinct.ipynb): Efficiently count the number of unique values in a set of vectors -- [x] [`na_if()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fna_if.ipynb): Convert values to NA -- [x] [`near()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fnear.ipynb): Compare two numeric vectors -- [x] [`nth()`, `first()`, `last()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fnth.ipynb): Extract the first, last or nth value from a vector -- [x] [`row_number()`, `ntile()`, `min_rank()`, `dense_rank()`, `percent_rank()`, `cume_dist()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Franking.ipynb): Windowed rank functions. -- [x] [`recode()`, `recode_factor()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Frecode.ipynb): Recode values - -### dplyr - Data -- [x] `band_members`, `band_instruments`, `band_instruments2`: Band membership -- [x] `starwars`: Starwars characters -- [x] `storms`: Storm tracks data - -### dplyr - Experimental - Experimental functions in dplyr.
- -- [x] [`group_map()`, `group_modify()`, `group_walk()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fgroup_map.ipynb): Apply a function to each group -- [x] [`group_trim()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fgroup_trim.ipynb): Trim grouping structure -- [x] [`group_split()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fgroup_split.ipynb): Split data frame by groups -- [x] [`with_groups()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fwith_groups.ipynb): Perform an operation with temporary groups - - -### tidyr - Pivoting - - -**Pivoting** changes the representation of a rectangular dataset, without changing the data inside of it. - -- [x] [`pivot_longer()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fpivot_longer.ipynb): Pivot data from wide to long -- [x] [`pivot_wider()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fpivot_wider.ipynb): Pivot data from long to wide - - -### tidyr - Character vectors - - -Multiple variables are sometimes pasted together into a single column, and these tools help you separate back out into individual columns. - -- [x] [`extract()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fextract.ipynb): Extract a character column into multiple columns using regular expression groups -- [x] [`separate()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fseparate.ipynb): Separate a character column into multiple columns with a regular expression or numeric locations -- [x] [`separate_rows()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fseparate.ipynb): Separate a collapsed column into multiple rows -- [x] [`unite()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Funite.ipynb): Unite multiple columns into one by pasting strings together - -### tidyr - Missing values - - -Tools for converting between implicit (absent rows) and explicit (`NA`) missing values, and for handling explicit `NA`s. 
- -- [ ] [`complete()`](https://pwwang.github.io/datar/reference/complete.ipynb): Complete a data frame with missing combinations of data -- [x] [`drop_na()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fdrop_na.ipynb): Drop rows containing missing values -- [ ] [`expand()`, `crossing()`, `nesting()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fexpand.ipynb): Expand data frame to include all possible combinations of values -- [x] [`expand_grid()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fexpand_grid.ipynb): Create a tibble from all combinations of inputs -- [x] [`fill()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Ffill.ipynb): Fill in missing values with previous or next value -- [x] [`full_seq()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Ffull_seq.ipynb): Create the full sequence of values in a vector -- [x] [`replace_na()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Freplace_na.ipynb): Replace NAs with specified values - -### tidyr - Miscellanea - -- [x] [`uncount()`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Funcount.ipynb): "Uncount" a data frame - -### tidyr - Data - -- [x] `billboard`: Song rankings for billboard top 100 in the year 2000 -- [x] `construction`: Completed construction in the US in 2018 -- [x] `fish_encounters`: Fish encounters -- [x] `relig_income`: Pew religion and income survey -- [x] `smiths`: Some data about the Smith family -- [x] `table1`, `table3`, `table4a`, `table4b`, `table5`: Example tabular representations -- [x] `us_rent_income`: US rent and income data -- [x] `who`, `population`: World Health Organization TB data -- [x] `world_bank_pop`: Population data from the World Bank - -### datar - Specific verbs and functions - -- [x] [`get`](https://mybinder.org/v2/gh/pwwang/datar/HEAD?filepath=examples%2Fget.ipynb): Get an element, or a subset of a dataframe. -- [x] `flatten`: Flatten the whole dataframe into an array/list.
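The example notebooks listed above now live in the hosted documentation; the README keeps only the klib snippet further up, which uses the new `register_verb(context=...)` call style. A minimal sketch of that registration pattern follows (the function name `describe_col` is hypothetical, and it assumes `f` is importable from `datar.all` as the README example implies):

```python
# Sketch only: registering a plain pandas function as a pipeline verb.
from pandas import Series
from pipda import register_verb

from datar.all import f  # the symbolic reference used in f.Sepal_Length
from datar.core.contexts import Context
from datar.datasets import iris
from datar.dplyr import pull

def describe_col(x: Series) -> Series:
    # plain function: receives the Series that pull() pipes in
    return x.describe()

# register_verb(context=...) returns a decorator, hence the two-step call
describe_col = register_verb(context=Context.EVAL)(describe_col)

iris >> pull(f.Sepal_Length) >> describe_col()
```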
- [1]: https://tidyr.tidyverse.org/index.html [2]: https://dplyr.tidyverse.org/index.html [3]: https://github.com/pwwang/pipda diff --git a/datar/base/__init__.py b/datar/base/__init__.py index b832dc74..df9ef609 100644 --- a/datar/base/__init__.py +++ b/datar/base/__init__.py @@ -7,9 +7,10 @@ as_int, as_integer, as_logical, as_bool, table, as_numeric, c, ceiling, floor, cummax, cummin, cumprod, cumsum, cut, sample, is_categorical, is_character, is_double, is_factor, is_float, - is_int, is_na, is_numeric, sum, mean, median, min, max, as_int64, + is_int, is_int64, is_integer, is_na, is_numeric, sum, mean, median, + min, max, as_int64, unique, Im, Re, is_in, is_element, length, seq_along, seq_len, seq, abs, pmax, pmin, round, sqrt, - droplevels, sin, cos + droplevels, sin, cos, identity, expandgrid ) # plain functions from .funcs import factor, rep, context diff --git a/datar/base/funcs.py b/datar/base/funcs.py index e9ba01bf..516264f9 100644 --- a/datar/base/funcs.py +++ b/datar/base/funcs.py @@ -6,26 +6,29 @@ import builtins import math import datetime +import itertools import functools from typing import Any, Iterable, List, Optional, Type, Union import numpy import pandas from pandas.core.dtypes.common import ( - is_categorical_dtype, is_float_dtype, is_numeric_dtype, is_string_dtype + is_categorical_dtype, is_float_dtype, is_int64_dtype, is_integer_dtype, + is_numeric_dtype, is_string_dtype ) from pandas import Series, Categorical, DataFrame from pandas.core.groupby.generic import SeriesGroupBy from pipda import Context, register_func from .constants import NA -from ..core.utils import categorize, objectize -from ..core.middlewares import Collection, ContextWithData +from ..core.utils import categorize, objectize, logger +from ..core.middlewares import Collection, WithDataEnv from ..core.types import ( BoolOrIter, DataFrameType, DoubleOrIter, IntOrIter, NumericOrIter, - NumericType, SeriesLikeType, StringOrIter, is_int, IntType, + NumericType, SeriesLikeType, StringOrIter, is_scalar_int, IntType, is_iterable, is_series_like, is_scalar ) +from ..core.contexts import Context # pylint: disable=redefined-builtin,invalid-name @@ -50,7 +53,7 @@ def _( tz: Union[int, datetime.timedelta] = 0, origin: Any = None ) -> datetime.date: - if is_int(tz): + if is_scalar_int(tz): tz = datetime.timedelta(hours=int(tz)) return x + tz @@ -64,7 +67,7 @@ def _( tz: Union[IntType, datetime.timedelta] = 0, origin: Any = None ): - if is_int(tz): + if is_scalar_int(tz): tz = datetime.timedelta(hours=int(tz)) return (x + tz).date() @@ -78,7 +81,7 @@ def _( tz: Union[IntType, datetime.timedelta] = 0, origin: Any = None ) -> datetime.date: - if is_int(tz): + if is_scalar_int(tz): tz = datetime.timedelta(hours=int(tz)) try_formats = try_formats or [ @@ -301,6 +304,7 @@ def as_integer(x: Any) -> Union[numpy.int64, Iterable[numpy.int64]]: @register_func(None, context=Context.EVAL) def as_numeric(x: Any) -> NumericOrIter: +    """Make elements numeric""" try: return as_integer(x) except (ValueError, TypeError): @@ -354,6 +358,24 @@ def is_categorical(x: Any) -> bool: is_factor = is_categorical +@register_func(None, context=Context.EVAL) +def is_int64(x: Any) -> BoolOrIter: +    """Check if x is int64 data""" +    x = objectize(x) +    if is_scalar(x): +        return isinstance(x, (int, numpy.int64)) +    return is_int64_dtype(x) + +@register_func(None, context=Context.EVAL) +def is_integer(x: Any) -> BoolOrIter: +    """Check if x is integer data""" +    x = objectize(x) +    if is_scalar(x): +        return is_scalar_int(x) +    return
is_integer_dtype(x) + +is_int = is_integer + @register_func(None, context=Context.EVAL) def is_double(x: Any) -> BoolOrIter: """Check if x is double/float data""" @@ -384,16 +406,25 @@ def c(*elems: Any) -> Collection: """ return Collection(*elems) -@register_func(None) +@register_func(None, context=Context.EVAL) def seq_along(along_with: Iterable[Any]) -> SeriesLikeType: """Generate sequences along an iterable""" return numpy.array(range(len(along_with))) -@register_func(None) -def seq_len(length_out: int) -> SeriesLikeType: +@register_func(None, context=Context.EVAL) +def seq_len(length_out: IntOrIter) -> SeriesLikeType: """Generate sequences with the length""" + if is_scalar(length_out): + return numpy.array(range(int(length_out))) + if len(length_out) > 1: + logger.warning( + "In seq_len(%r) : first element used of 'length_out' argument", + length_out + ) + length_out = int(list(length_out)[0]) return numpy.array(range(length_out)) + @register_func(None, context=Context.EVAL) def seq( from_: IntType = None, @@ -824,47 +855,11 @@ def levels(x: Union[Series, Categorical]) -> Optional[List[Any]]: return None -# --------------------------------- -# Plain functions -# --------------------------------- - -def factor( - x: Iterable[Any], - levels: Optional[Iterable[Any]] = None, - exclude: Any = NA, - ordered: bool = False -) -> Categorical: - """encode a vector as a factor (the terms ‘category’ and ‘enumerated type’ - are also used for factors). - - If argument ordered is TRUE, the factor levels are assumed to be ordered - - Args: - x: a vector of data - levels: an optional vector of the unique values (as character strings) - that x might have taken. - exclude: a vector of values to be excluded when forming the set of - levels. This may be factor with the same level set as x or - should be a character - ordered: logical flag to determine if the levels should be regarded - as ordered (in the order given). - """ - if is_categorical_dtype(x): - x = x.to_numpy() - ret = Categorical( - objectize(x), - categories=levels, - ordered=ordered - ) - if is_scalar(exclude): - exclude = [exclude] - - return ret.remove_categories(exclude) - +@register_func(None, context=Context.EVAL) def rep( x: Any, times: Union[int, Iterable[int]] = 1, - length: Optional[int] = None, + length: Optional[int] = None, # pylint: disable=redefined-outer-name each: int = 1 ) -> Iterable[Any]: """replicates the values in x @@ -892,7 +887,7 @@ def rep( "Unexpected each argument when times is an iterable." ) - if is_int(times): + if is_scalar_int(times): x = [elem for elem in x for _ in range(each)] * int(times) else: x = [elem for n, elem in zip(times, x) for _ in range(n)] @@ -902,6 +897,56 @@ def rep( x = x * repeats return x[:length] +@register_func(None, context=Context.EVAL) +def unique(x: Iterable[Any]) -> numpy.ndarray: + """Get unique elements""" + return numpy.unique(x) + +@register_func(None, context=Context.EVAL) +def length(x: Any) -> int: + """Length of an object""" + if is_scalar(x): + return 1 + return len(x) + +# --------------------------------- +# Plain functions +# --------------------------------- + +def factor( + x: Iterable[Any], + # pylint: disable=redefined-outer-name + levels: Optional[Iterable[Any]] = None, + exclude: Any = NA, + ordered: bool = False +) -> Categorical: + """encode a vector as a factor (the terms ‘category’ and ‘enumerated type’ + are also used for factors). 
+ + If argument ordered is TRUE, the factor levels are assumed to be ordered + + Args: + x: a vector of data + levels: an optional vector of the unique values (as character strings) + that x might have taken. + exclude: a vector of values to be excluded when forming the set of + levels. This may be factor with the same level set as x or + should be a character + ordered: logical flag to determine if the levels should be regarded + as ordered (in the order given). + """ + if is_categorical_dtype(x): + x = x.to_numpy() + ret = Categorical( + objectize(x), + categories=levels, + ordered=ordered + ) + if is_scalar(exclude): + exclude = [exclude] + + return ret.remove_categories(exclude) + def context(data: DataFrameType) -> Any: """Evaluate verbs, functions in the possibly modifying (a copy of) the original data. @@ -920,4 +965,53 @@ def context(data: DataFrameType) -> Any: Returns: The original or modified data """ - return ContextWithData(data) + return WithDataEnv(data) + +@register_func(None, context=None) +def identity(x: Any) -> Any: + """Return whatever passed in + + Expression objects are evaluated using parent context + """ + return x + +@register_func(None) +def expandgrid(*args: Iterable[Any], **kwargs: Iterable[Any]) -> DataFrame: + """Expand all combinations into a dataframe""" + iters = {} + for i, arg in enumerate(args): + iters[f'Var{i}'] = arg + iters.update(kwargs) + + return DataFrame( + list(itertools.product(*iters.values())), + columns=iters.keys() + ) + +@register_func(None) +def Re(numbers: NumericOrIter) -> numpy.ndarray: + """Real part of complex numbers""" + if is_scalar(numbers): + return numbers.real + ret = numpy.real(numbers) + return ret + +@register_func(None) +def Im(numbers: NumericOrIter) -> numpy.ndarray: + """Imaginary part of complex numbers""" + if is_scalar(numbers): + return numbers.imag + return numpy.imag(numbers) + +@register_func(None) +def is_element(elem: Any, elems: Iterable[Any]) -> BoolOrIter: + """Alias for R's is.element. + + We can't do `a %in% b` in python (in behaves differently), so + use this function instead + """ + if is_scalar(elem): + return elem in elems + return numpy.isin(elem, elems) + +is_in = is_element diff --git a/datar/base/verbs.py b/datar/base/verbs.py index 28865fac..149a45f6 100644 --- a/datar/base/verbs.py +++ b/datar/base/verbs.py @@ -4,12 +4,13 @@ import numpy from pandas.core.frame import DataFrame from pandas.core.groupby.generic import DataFrameGroupBy -from pipda import register_verb, Context +from pipda import register_verb from ..core.utils import objectize from ..core.types import IntType, is_scalar, DataFrameType +from ..core.contexts import Context -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def colnames( df: DataFrameType, names: Optional[Iterable[str]] = None, @@ -39,7 +40,7 @@ def colnames( return ret.groupby(grouper, dropna=False) return ret -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def rownames( df: DataFrameType, names: Optional[Iterable[str]] = None, @@ -69,7 +70,7 @@ def rownames( return ret.groupby(grouper, dropna=False) return ret -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def dim(x: DataFrameType) -> Tuple[int]: """Retrieve the dimension of a dataframe. 
@@ -81,7 +82,7 @@     """     return objectize(x).shape  -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def nrow(_data: DataFrameType) -> int:     """Get the number of rows in a dataframe  @@ -93,7 +94,7 @@     """     return dim(_data)[0]  -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def ncol(_data: DataFrameType):     """Get the number of columns in a dataframe  @@ -105,7 +106,7 @@     """     return dim(_data)[1]  -@register_verb +@register_verb(context=Context.EVAL) def diag(     x: Any = 1,     nrow: Optional[IntType] = None, # pylint: disable=redefined-outer-name @@ -163,7 +164,7 @@ def _(     return x  -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def t(_data: DataFrame, copy: bool = False) -> DataFrame:     """Get the transposed dataframe  diff --git a/datar/core/contexts.py b/datar/core/contexts.py index b79a3c5d..42aeebe8 100644 --- a/datar/core/contexts.py +++ b/datar/core/contexts.py @@ -1,21 +1,53 @@ +"""Provides specific contexts for datar""" from collections import defaultdict -from pipda.context import ContextEval, ContextSelect +from enum import Enum +from typing import ClassVar +from pandas.core.frame import DataFrame +from pipda.context import ( +    ContextEval as ContextEvalPipda, +    ContextMixed, +    ContextPending, +    ContextSelect +) +from .utils import copy_flags +from .exceptions import ColumnNotExistingError  -class ContextEvalWithUsedRefs(ContextEval): +class ContextEval(ContextEvalPipda): +    """Evaluation context with used references traced""" +    name: ClassVar[str] = 'eval'      def __init__(self):         self.used_refs = defaultdict(lambda: 0)      -    def getattr(self, data, ref): +    def getitem(self, parent, ref): +        """Interpret f[ref]"""         self.used_refs[ref] += 1 -        return super().getattr(data, ref) +        if isinstance(parent, DataFrame) and ref not in parent: +            cols = [col for col in parent.columns if col.startswith(f'{ref}$')] +            if not cols: +                raise ColumnNotExistingError(ref) +            ret = parent.loc[:, cols] +            ret.columns = [col[(len(ref)+1):] for col in cols] +            copy_flags(ret, parent) +            return ret +        try: +            return super().getitem(parent, ref) +        except KeyError as keyerr: +            raise ColumnNotExistingError(str(keyerr)) from None  -    def getitem(self, data, ref): -        self.used_refs[ref] += 1 -        return super().getitem(data, ref) +    getattr = getitem  # make sure f.size gets f['size']  class ContextSelectSlice(ContextSelect):     """Mark the context to interpret slice      Whether turn f[:3] to first 3 columns or just the slice itself.     """ +    name: ClassVar[str] = 'select-slice' + +class Context(Enum):     """Context enumerator for types of contexts"""     UNSET = None     PENDING = ContextPending()     SELECT = ContextSelect()     EVAL = ContextEval()     MIXED = ContextMixed() diff --git a/datar/core/exceptions.py b/datar/core/exceptions.py index 148343a2..e11bb9b1 100644 --- a/datar/core/exceptions.py +++ b/datar/core/exceptions.py @@ -1,4 +1,6 @@ +"""Exceptions for datar""" class DatarException(Exception): +    """Base exception for datar"""     ...
class ColumnNotExistingError(DatarException): @@ -9,3 +11,6 @@ class ColumnNameInvalidError(DatarException): class DataUnalignableError(DatarException): """When two data cannot be aligned to each other""" + +class NameNonUniqueError(DatarException): + """When check_unique fails""" diff --git a/datar/core/middlewares.py b/datar/core/middlewares.py index 9d9c012a..3b788508 100644 --- a/datar/core/middlewares.py +++ b/datar/core/middlewares.py @@ -1,25 +1,24 @@ +"""Middlewares for datar""" import builtins -from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union +from typing import ( + Any, Callable, Iterable, List, Mapping, Optional, Tuple, Union +) from abc import ABC -from threading import Lock from pandas import DataFrame from pandas.core.series import Series -import pipda from pipda.symbolic import DirectRefAttr -from pipda.context import Context, ContextBase, ContextSelect -from pipda.utils import DataContext +from pipda.context import Context, ContextBase +from pipda.utils import DataEnv, functype from .utils import ( - objectize, expand_collections, list_diff, sanitize_slice, select_columns, - logger + df_assign_item, objectize, expand_collections, list_diff, sanitize_slice, + select_columns, logger, to_df ) from .contexts import ContextSelectSlice from .types import DataFrameType, is_scalar -LOCK = Lock() - class Collection(list): """Mimic the c function in R @@ -80,6 +79,7 @@ def __ne__(self, other: object) -> bool: @property def complements(self): + """Complements of inverted columns""" if isinstance(self.context, ContextSelectSlice): # slice literal not being expanded return self @@ -91,7 +91,7 @@ def __repr__(self) -> str: return f"Inverted({self.elems})" class Negated: - + """Negated object""" def __init__(self, elems: Union[slice, list]) -> None: """In case of -[1,2,3] or -c(1,2,3) or -f[1:3]""" self.elems = [elems] if isinstance(elems, slice) else elems @@ -99,16 +99,17 @@ def __init__(self, elems: Union[slice, list]) -> None: def __repr__(self) -> str: return f"Negated({self.elems})" -class DescSeries(Series): - +class DescSeries(Series): # pylint: disable=too-many-ancestors + """Marking a series as descending""" @property def _constructor(self): return DescSeries class CurColumn: - + """Current column in across""" @classmethod def replace_args(cls, args: Tuple[Any], column: str) -> Tuple[Any]: + """Replace self with the real column in args""" return tuple(column if isinstance(arg, cls) else arg for arg in args) @classmethod @@ -117,14 +118,26 @@ def replace_kwargs( kwargs: Mapping[str, Any], column: str ) -> Mapping[str, Any]: + """Replace self with the real column in kwargs""" return { key: column if isinstance(val, cls) else val for key, val in kwargs.items() } class Across: - - def __init__(self, data, cols, fns, names, args, kwargs): + """Across object""" + def __init__( + self, + data: DataFrameType, + cols: Optional[Iterable[str]] = None, + fns: Optional[Union[ + Callable, + Iterable[Callable], + Mapping[str, Callable] + ]] = None, + names: Optional[str] = None, + kwargs: Optional[Mapping[str, Any]] = None + ) -> None: from ..dplyr.funcs import everything cols = everything(data) if cols is None else cols if not isinstance(cols, (list, tuple)): @@ -136,7 +149,7 @@ def __init__(self, data, cols, fns, names, args, kwargs): fns_list.append({'fn': fns}) elif isinstance(fns, (list, tuple)): fns_list.extend( - {'fn': fn, '_fn': i+1, '_fn0': i} + {'fn': fn, '_fn': i, '_fn1': i+1} for i, fn in enumerate(fns) ) elif isinstance(fns, dict): @@ -144,64 +157,29 
@@ def __init__(self, data, cols, fns, names, args, kwargs): {'fn': value, '_fn': key} for key, value in fns.items() ) - # else: - # todo: check format of fns + elif fns is not None: + raise ValueError( + 'Argument `_fns` of across must be None, a function, ' + 'a formula, or a dict of functions.' + ) self.data = data self.cols = cols self.fns = fns_list self.names = names - self.args = args - self.kwargs = kwargs - self.context = None - - def desc_cols(self) -> Set[str]: - from ..dplyr.funcs import desc - if len(self.fns) != 1: - return set() - if self.fns[0]['fn'] is not desc: - return set() - return set(self.cols) - - def evaluate( - self, - context: Union[Context, ContextBase], - data: Optional[DataFrameType] = None - ) -> Any: - if data is None: - data = self.data - - if isinstance(context, Context): - context = context.value - - if isinstance(context, ContextSelect): - if not self.fns: - return self.cols - fn = self.fns[0]['fn'] - # todo: check # fns - pipda_type = getattr(fn, '__pipda__', None) - return [ - fn(col, *self.args, **self.kwargs) if not pipda_type - else fn( - col, - *CurColumn.replace_args(self.args, col), - **CurColumn.replace_kwargs(self.kwargs, col), - _calling_type='piping' - ).evaluate(data) - for col in self.cols - ] + self.kwargs = kwargs or {} + def evaluate(self, context: Optional[ContextBase] = None) -> DataFrame: + """Evaluate object with context""" if not self.fns: self.fns = [{'fn': lambda x: x}] - ret = {} - # Todo: groupby + ret = None for column in self.cols: for fn_info in self.fns: render_data = fn_info.copy() render_data['_col'] = column fn = render_data.pop('fn') - name_format = self.names if not name_format: name_format = ( @@ -210,141 +188,53 @@ def evaluate( ) name = name_format.format(**render_data) - pipda_type = getattr(fn, '__pipda__', None) - if not pipda_type: - ret[name] = fn( - context.getattr(data, column), - *CurColumn.replace_args(self.args, column), + if functype(fn) == 'plain': + value = fn( + self.data[column], **CurColumn.replace_kwargs(self.kwargs, column) ) else: # use fn's own context - ret[name] = fn( - DirectRefAttr(data, column), - *CurColumn.replace_args(self.args, column), + value = fn( + DirectRefAttr(self.data, column), **CurColumn.replace_kwargs(self.kwargs, column), - _calling_type='piping' - ).evaluate(data) - return ret - -class CAcross(Across): - - def __init__(self, data, cols, fns, names, args, kwargs): - super().__init__(data, cols, fns, names, args, kwargs) - - if not self.fns: - raise ValueError( - "No functions specified for c_across. " - "Note that the usage of c_across is different from R's. " - "You have to specify the function inside c_across, instead of " - "calling it with c_across(...) as arguments." 
- ) - - if len(self.fns) > 1: - raise ValueError("Only a single function is allowed in c_across.") - - self.fn = self.fns[0]['fn'] - - def evaluate( - self, - context: Union[Context, ContextBase], - data: Optional[DataFrame] = None - ) -> Any: - if isinstance(context, Context): - context = context.value - - if data is None: - data = self.data - - if not isinstance(data, RowwiseDataFrame): - return super().evaluate(context, data) - - return { - self.names: data[self.cols].apply( - self.fn, - axis=1, - args=self.args, - **self.kwargs - ) - } + _env='piping' + )(self.data, context) + + # todo: check if it is proper + # group information lost + value = objectize(value) + if ret is None: + ret = to_df(value, name) + else: + df_assign_item(ret, name, value) + return DataFrame() if ret is None else ret class IfCross(Across, ABC): - + """Base class for IfAny and IfAll""" if_type = None - def __init__(self, data, cols, fns, names, args, kwargs): - super().__init__(data, cols, fns, names, args, kwargs) - - func_name = f"if_{self.__class__.if_type}" - if not self.fns: - raise ValueError(f"No functions specified for {func_name!r}.") - - if len(self.fns) > 1: - raise ValueError( - f"Only a single function is allowed in {func_name!r}." - ) - - self.fn = self.fns[0]['fn'] - - def evaluate( - self, - context: Union[Context, ContextBase], - data: Optional[DataFrame] = None - ) -> Any: - if not self.fns: - raise ValueError("No functions specified for if_any.") - - if isinstance(context, Context): - context = context.value - - if data is None: - data = self.data - + def evaluate(self, context: Optional[ContextBase] = None) -> DataFrame: + """Evaluate the object with context""" agg_func = getattr(builtins, self.__class__.if_type) - - pipda_type = getattr(self.fn, '__pipda__', None) - if pipda_type not in (None, 'PlainFunction'): - def transform_fn(*args, **kwargs): - return self.fn(data, *args, **kwargs) - transform_fn = lambda *args, **kwargs: self.fn( - data, *args, **kwargs - ) - else: - transform_fn = self.fn - - def if_fn(_series, *args, **kwargs): - return agg_func(_series.transform(transform_fn, *args, **kwargs)) - - return data[self.cols].apply( - if_fn, - axis=1, - args=self.args, - **self.kwargs - ) + return super().evaluate(context).fillna( + False + ).astype( + 'boolean' + ).apply(agg_func, axis=1) class IfAny(IfCross): - + """For calls from dplyr's if_any""" if_type = 'any' class IfAll(IfCross): - + """For calls from dplyr's if_all""" if_type = 'all' -class RowwiseDataFrame(DataFrame): - - def __init__( - self, - *args: Any, - rowwise: Optional[Iterable[str]] = None, - **kwargs: Any - ) -> None: - super().__init__(*args, **kwargs) - self.flags.rowwise = rowwise or True - -class ContextWithData: - +class WithDataEnv: + """Implements `with data` to mimic R's `with(data, ...)`""" def __init__(self, data: Any) -> None: - self.data = DataContext(data) + self.data = DataEnv(data) def __enter__(self) -> Any: return self.data @@ -353,7 +243,7 @@ def __exit__(self, *exc_info) -> None: self.data.delete() class Nesting: - + """Nesting objects for calls from tidyr.nesting""" def __init__(self, *columns: Any, **kwargs: Any) -> None: self.columns = [] self.names = [] diff --git a/datar/core/names.py b/datar/core/names.py new file mode 100644 index 00000000..2a31533e --- /dev/null +++ b/datar/core/names.py @@ -0,0 +1,151 @@ +"""Name repairing""" +import inspect +import re +import keyword +from typing import Callable, List, Optional, Union, Iterable + +import numpy + +from .exceptions import NameNonUniqueError 
+ +from .types import is_iterable + +def _log_changed_names(changed_names: Iterable[str]) -> None: +    """Log the changed names""" +    if not changed_names: +        return +    from .utils import logger +    logger.warning('New names:') +    for orig_name, new_name in changed_names: +        logger.warning('* %r -> %r', orig_name, new_name) + +def _repair_names_minimal(names: Iterable[str]) -> List[str]: +    """Minimal repairing""" +    return ["" if name in (None, numpy.nan) else str(name) for name in names] + +def _repair_names_unique( +    names: Iterable[str], +    quiet: bool = False, +    sanitizer: Optional[Callable[[str], str]] = None +) -> List[str]: +    """Make sure names are unique""" +    min_names = _repair_names_minimal(names) +    neat_names = [ +        re.sub(r'(?:(?<!_)_{1,2}\d+|(?<!_)__)+$', '', name) +        for name in min_names +    ] +    if sanitizer is not None: +        neat_names = [sanitizer(name) for name in neat_names] + +    new_names = [] +    changed_names = [] +    for i, name in enumerate(neat_names): +        if neat_names.count(name) > 1 or name == '': +            name = f'{name}__{i}' +        if name != names[i]: +            changed_names.append((names[i], name)) +        new_names.append(name) +    if not quiet: +        _log_changed_names(changed_names) +    return new_names + +def _repair_names_universal( +    names: Iterable[str], +    quiet: bool = False +) -> List[str]: +    """Make sure names are safe to be used as variables or attributes""" +    min_names = _repair_names_minimal(names) +    neat_names = [re.sub(r'[^\w]', '_', name) for name in min_names] +    new_names = _repair_names_unique( +        neat_names, +        quiet=True, +        sanitizer=lambda name: ( +            f'_{name}' +            if keyword.iskeyword(name) or (name and name[0].isdigit()) +            else name +        ) +    ) +    if not quiet: +        changed_names = [ +            (orig_name, new_name) +            for orig_name, new_name in zip(names, new_names) +            if orig_name != new_name +        ] +        _log_changed_names(changed_names) +    return new_names + +def _repair_names_check_unique(names: Iterable[str]) -> Iterable[str]: +    """Just check the uniqueness""" +    for name in names: +        if names.count(name) > 1: +            raise NameNonUniqueError(f"Names must be unique: {name}") +        if name == "" or name is numpy.nan: +            raise NameNonUniqueError(f"Names can't be empty: {name}") +        if re.search(r'(?:(?<!_)_{2}\d+|(?<!_)__)+$', str(name)): +            raise NameNonUniqueError( +                f"Names can't be of the form `__{{digits}}`: {name}" +            ) +    return names + +BUILTIN_REPAIR_METHODS = dict( +    minimal=_repair_names_minimal, +    unique=_repair_names_unique, +    universal=_repair_names_universal, +    check_unique=_repair_names_check_unique, +) + +def repair_names( +    names: Iterable[str], +    repair: Union[str, Callable] +) -> List[str]: +    """Repair names based on the method + +    Args: +        names: The names to be repaired +        repair: The method to repair +            - `minimal`: Minimal names are never None or NA. +              When an element doesn't have a name, its minimal name +              is an empty string. +            - `unique`: Unique names are unique. A suffix is appended to +              duplicate names to make them unique. +            - `universal`: Universal names are unique and syntactic, +              meaning that you can safely use the names as variables without +              causing a syntax error (like `f.`). +            - A function that accepts either a list of names or a single name. +              A function that accepts a list of names must annotate its first +              argument with `typing.Iterable` or `typing.Sequence`.
+ +    Examples: +        >>> repair_names([None]*3, repair="minimal") +        >>> # ["", "", ""] +        >>> repair_names(["x", NA], repair="minimal") +        >>> # ["x", ""] +        >>> repair_names(["", "x", "", "y", "x", "_2", "__"], repair="unique") +        >>> # ["__0", "x__1", "__2", "y", "x__4", "__5", "__6"] +        >>> repair_names(["", "x", NA, "x"], repair="universal") +        >>> # ["__0", "x__1", "__2", "x__3"] +        >>> repair_names(["(y)", "_z", ".2fa", "False"], repair="universal") +        >>> # ["_y_", "_z", "_2fa", "_False"] + +    Returns: +        The repaired names + +    Raises: +        ValueError: when repair is not a string or callable +        NameNonUniqueError: when check_unique fails +    """ +    if isinstance(repair, str): +        repair = BUILTIN_REPAIR_METHODS[repair] +    elif is_iterable(repair) and all(isinstance(elem, str) for elem in repair): +        return repair +    elif not callable(repair): +        raise ValueError('Expect a function for name repairing.') + +    parameters = inspect.signature(repair).parameters +    annotation = list(parameters.values())[0].annotation +    if ( +        annotation is inspect._empty or +        annotation._name not in ('Iterable', 'Sequence') +    ): # scalar input +        return [repair(name) for name in names] + +    return repair(names) diff --git a/datar/core/operator.py b/datar/core/operator.py index 972a2600..75fe3e26 100644 --- a/datar/core/operator.py +++ b/datar/core/operator.py @@ -1,43 +1,38 @@ +"""Operators for datar""" from functools import partial import operator -from typing import Any -from pandas.core.frame import DataFrame -from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy -from pandas.core.groupby.groupby import GroupBy -from pandas.core.series import Series -from pipda import register_operator, Operator, Context -from pipda.context import ContextSelect +from typing import Any, Optional +from pipda import register_operator, Operator +from pipda.context import ContextBase  -from .utils import align_value, objectize, list_intersect, list_union -from .middlewares import Collection, Inverted, Negated +from .utils import align_value, list_intersect, list_union +from .middlewares import Inverted, Negated  @register_operator class DatarOperator(Operator): -     +    """Operator class for datar"""     def _arithmetize1(self, operand: Any, op: str) -> Any: +        """Operator for single operand"""         op_func = getattr(operator, op)         operand = align_value(operand, self.data) -        ret = op_func(operand) -        if isinstance(self.data, DataFrameGroupBy): -            return ret.groupby(self.data.grouper, dropna=False) -        return ret +        return op_func(operand)      def _arithmetize2(self, left: Any, right: Any, op: str) -> Any: +        """Operator for paired operands"""         op_func = getattr(operator, op)         left = align_value(left, self.data)         right = align_value(right, self.data) -        ret = op_func(left, right) -        if isinstance(self.data, DataFrameGroupBy): -            return ret.groupby(self.data.grouper, dropna=False) -        return ret +        return op_func(left, right)  -    def invert(self, operand: Any) -> Any: +    def invert(self, operand: Any, _context: Optional[ContextBase]) -> Any: +        """Interpretation for ~x"""         if isinstance(operand, (slice, str, list)): -            return Inverted(operand, self.data, self.context).complements +            return Inverted(operand, self.data, _context).complements         return self._arithmetize1(operand, 'invert')      def neg(self, operand: Any) -> Any: +        """Interpretation for -x"""         if isinstance(operand, (slice, list)):             return Negated(operand)         return self._arithmetize1(operand, 'neg') @@ -74,10 +69,14 @@ def or_(self, left: Any, right: Any) -> Any:             return list_union(left, right)         return self._arithmetize2(left, right,
'or_') - def ne(self, left: Any, right: Any) -> bool: + def ne(self, left: Any, right: Any) -> bool: # pylint: disable=invalid-name + """Interpret for left != right""" return not self.eq(left, right) def __getattr__(self, name: str) -> Any: + """Other operators""" if not hasattr(operator, name): raise AttributeError - return partial(self._arithmetize2, op=name) + attr = partial(self._arithmetize2, op=name) + attr.__qualname__ = self._arithmetize2.__qualname__ + return attr diff --git a/datar/core/types.py b/datar/core/types.py index 508df55e..ddf9d5dd 100644 --- a/datar/core/types.py +++ b/datar/core/types.py @@ -6,11 +6,12 @@ from pandas.core.frame import DataFrame from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy from pandas.core.series import Series +from pipda.function import Function # used for type annotations NumericType = Union[int, float, complex, numpy.number] IntType = Union[int, numpy.integer] -FloatType = Union[float, numpy.float] +FloatType = Union[float, numpy.float64] DataFrameType = Union[DataFrame, DataFrameGroupBy] SeriesType = Union[Series, SeriesGroupBy] SeriesLikeType = Union[Series, SeriesGroupBy, numpy.ndarray] @@ -21,8 +22,9 @@ FloatOrIter = Union[FloatType, Iterable[FloatType]] NumericOrIter = Union[NumericType, Iterable[NumericType]] +NoneType = type(None) # used for type checks -def is_int(x: Any) -> bool: +def is_scalar_int(x: Any) -> bool: """Check if a value is an integer""" return isinstance(x, (int, numpy.integer)) @@ -37,9 +39,14 @@ def is_scalar(x: Any) -> bool: None will be counted as scalar """ - if x is None or isinstance(x, type) or isinstance(x, numpy.dtype): + ret = numpy.isscalar(x) + if ret: + return ret + try: + iter(x) + except TypeError: return True - return numpy.isscalar(x) + return False def is_iterable(x: Any) -> bool: """Check if a value is iterable, which is not a scalar""" diff --git a/datar/core/utils.py b/datar/core/utils.py index 1b222a27..aadb7691 100644 --- a/datar/core/utils.py +++ b/datar/core/utils.py @@ -1,24 +1,23 @@ """Core utilities""" -import sys -import inspect + import logging -from functools import singledispatch, wraps +from functools import singledispatch +from copy import deepcopy from typing import Any, Callable, Iterable, List, Optional, Union import numpy -from pandas import DataFrame -import pandas +from pandas import DataFrame, Categorical +from pandas.core.flags import Flags from pandas.core.series import Series from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy from pandas.core.groupby.ops import BaseGrouper -import pipda -from pipda.context import Context, ContextBase -from pipda.function import register_func -from pipda.symbolic import DirectRefAttr -from pipda.utils import evaluate_args, evaluate_expr, evaluate_kwargs +from pandas.core.dtypes.common import is_categorical_dtype -from .exceptions import ColumnNameInvalidError, ColumnNotExistingError +from .exceptions import ( + ColumnNameInvalidError, ColumnNotExistingError, NameNonUniqueError +) from .types import DataFrameType, StringOrIter, is_scalar +from .names import repair_names # logger logger = logging.getLogger('datar') # pylint: disable=invalid-name @@ -64,7 +63,7 @@ def list_union(list1: Iterable[Any], list2: Iterable[Any]) -> List[Any]: Returns: The elements with elements in either list1 or list2 """ - return list1 + list_diff(list2, list1) + return list1 + list_diff(list1=list2, list2=list1) def check_column(column: Any) -> None: """Check if a column is valid @@ -75,12 +74,12 @@ def check_column(column: Any) -> 
None: Raises: ColumnNameInvalidError: When the column name is invalid """ - from .middlewares import Inverted, Across + from .middlewares import Inverted if not isinstance(column, ( - (int, str, list, set, tuple, Inverted, Across, slice) + (int, str, list, set, tuple, Inverted, slice) )): raise ColumnNameInvalidError( - 'Invalid column, expected int, str, list, tuple, c(), across(), ' + 'Invalid column, expected int, str, list, tuple, c(), ' f'f.column, ~c() or ~f.column, got {type(column)}' ) @@ -133,6 +132,7 @@ def filter_columns( return ret def sanitize_slice(slc: slice, all_columns: List[str]) -> slice: + """Sanitize slice objects""" int_start, int_stop, step = slc.start, slc.stop, slc.step if isinstance(int_start, str): int_start = all_columns.index(int_start) @@ -150,6 +150,7 @@ def _expand_slice_dummy( total: int, from_negated: bool = False ) -> List[int]: + """Expand a dummy slice object""" from .middlewares import Negated, Inverted all_indexes = list(range(total)) if isinstance(elems, int): @@ -181,7 +182,6 @@ def _expand_slice_dummy( [] ) return list_diff(all_indexes, selected_indexes) - raise TypeError(f'Unsupported type for slice expansion: {type(elems)!r}.') def expand_slice( @@ -189,10 +189,7 @@ def expand_slice( total: Union[int, Iterable[int]] ) -> Union[List[int], List[List[int]]]: """Expand the slide in an iterable, in a groupby-aware way""" - from .middlewares import Negated, Inverted, Collection - if isinstance(total, int): - return _expand_slice_dummy(elems, total) - # return _expand_slice_grouped(elems, total) + return _expand_slice_dummy(elems, total) def select_columns( all_columns: Iterable[str], @@ -214,7 +211,7 @@ def select_columns( ColumnNameInvalidError: When the column is invalid to select ColumnNotExistingError: When the column does not exist in the pool """ - from .middlewares import Inverted, Across + from .middlewares import Inverted if not isinstance(all_columns, list): all_columns = list(all_columns) @@ -237,8 +234,6 @@ def select_columns( selected.extend(column.elems) elif isinstance(column, slice): selected.extend(all_columns[sanitize_slice(column, all_columns)]) - elif isinstance(column, Across): - selected.extend(column.evaluate(Context.SELECT)) else: selected.append(column) @@ -257,6 +252,7 @@ def series_expandable( df_or_series: Union[DataFrame, Series], series_or_df: Union[DataFrame, Series] ) -> bool: + """Check if a series is expandable""" if (not isinstance(df_or_series, (Series, DataFrame)) or not isinstance(series_or_df, (Series, DataFrame))): return False @@ -274,6 +270,7 @@ def series_expandable( return series.index.name in df.columns def series_expand(series: Union[DataFrame, Series], df: DataFrame): + """Expand the series to the scale of a dataframe""" if isinstance(series, DataFrame): #assert series.shape[1] == 1 series = series.iloc[:, 0] @@ -285,6 +282,8 @@ def align_value( ) -> Any: """Normalize possible series data to add to the data or compare with other series of the data""" + from ..base.constants import NA + if is_scalar(value): return value @@ -301,45 +300,77 @@ def align_value( if len_series == data.shape[0]: return value + if len_series == 0: + return NA + if data.shape[0] % len_series == 0: nrepeat = data.shape[0] // len_series if isinstance(value, (list, tuple)): return value * nrepeat - return value.append([value] * (nrepeat - 1)) + # numpy.ndarray + return value.repeat(nrepeat) return value -def copy_df( - df: DataFrameType -) -> DataFrameType: - if isinstance(df, DataFrame): - ret = df.copy() - ret.flags.grouper = 
getattr(df.flags, 'grouper', None) - ret.flags.rowwise = getattr(df.flags, 'rowwise', None) - return ret +def update_df(df: DataFrame, df2: DataFrame) -> None: + """Update the dataframe""" + # DataFrame.update ignores nonexisting columns + # and not keeps categories - copied = copy_df(df.obj) - return group_df(copied, df.grouper) + for col in df2.columns: + df[col] = df2[col] + +def copy_flags(df1: DataFrame, flags: Union[DataFrameType, Flags]) -> None: + """Deep copy the flags from one dataframe to another""" + if isinstance(flags, DataFrame): + flags = flags.flags + elif isinstance(flags, DataFrameGroupBy): + flags = flags.obj.flags + + for key in dir(flags): + if key.startswith('_'): + continue + + setattr(df1.flags, key, deepcopy(getattr(flags, key))) def df_assign_item( - df: DataFrameType, + df: DataFrame, item: str, - value: Any + value: Any, + allow_dups: bool = False ) -> None: - if isinstance(df, DataFrameGroupBy): - df = df.obj + """Assign an item to a dataframe""" + value = align_value(value, df) try: value = value.values except AttributeError: ... - df[item] = value + lenval = 1 if is_scalar(value) else len(value) + + if df.shape[0] == 1 and lenval > 1: + if df.shape[1] == 0: # 0-column df + # Otherwise, cannot set a frame with no defined columns + df['__assign_placeholder__'] = 1 + # add rows inplace + for i in range(lenval - 1): + df.loc[i+1] = df.iloc[0, :] + + if '__assign_placeholder__' in df: + df.drop(columns=['__assign_placeholder__'], inplace=True) + + if not allow_dups: + df[item] = value + else: + df.insert(df.shape[1], item, value, allow_duplicates=True) def objectize(data: Any) -> Any: + """Get the object instead of the GroupBy object""" if isinstance(data, (SeriesGroupBy, DataFrameGroupBy)): return data.obj return data def categorize(data: Any) -> Any: + """Get the Categorical object""" try: return data.cat except AttributeError: @@ -347,12 +378,14 @@ def categorize(data: Any) -> Any: @singledispatch def to_df(data: Any, name: Optional[str] = None) -> DataFrame: - try: - df = DataFrame(data, columns=[name]) if name else DataFrame(data) - except ValueError: - df = DataFrame([data], columns=[name]) if name else DataFrame([data]) + """Convert an object to a data frame""" + if is_scalar(data): + data = [data] - return df + if name is None: + return DataFrame(data) + + return DataFrame({name: data}) @to_df.register(numpy.ndarray) def _(data: numpy.ndarray, name: Optional[str] = None) -> DataFrame: @@ -369,7 +402,9 @@ def _(data: numpy.ndarray, name: Optional[str] = None) -> DataFrame: @to_df.register(DataFrame) def _(data: DataFrame, name: Optional[str] = None) -> DataFrame: - return data + if name is None: + return data + return DataFrame({f"{name}${col}": data[col] for col in data.columns}) @to_df.register(Series) def _(data: Series, name: Optional[str] = None) -> DataFrame: @@ -386,126 +421,30 @@ def get_n_from_prop( n: Optional[int] = None, prop: Optional[float] = None ) -> int: + """Get n from a proportion""" if n is None and prop is None: return 1 if prop is not None: return int(float(total) * min(prop, 1.0)) return min(n, total) - -def _register_grouped_col0( - func: Callable, - context: ContextBase -) -> Callable: - """Register a function with argument of no column as groupby aware""" - - @register_func(DataFrame, context=None) - @wraps(func) - def wrapper( - _data: DataFrame, - *args: Any, - **kwargs: Any - ) -> Any: - _column = DirectRefAttr(_data, _data.columns[0]) - series = evaluate_expr(_column, _data, context) - args = evaluate_args(args, _data, 
context.args) - kwargs = evaluate_kwargs(kwargs, _data, context.kwargs) - return func(series, *args, **kwargs) - - @wrapper.register(DataFrameGroupBy) - def _( - _data: DataFrameGroupBy, - *args: Any, - **kwargs: Any - ) -> Any: - _column = DirectRefAttr(_data, _data.obj.columns[0]) - series = evaluate_expr(_column, _data, context) - args = evaluate_args(args, _data, context.args) - kwargs = evaluate_kwargs(kwargs, _data, context.kwargs) - return series.apply(func, *args, **kwargs) - - return wrapper - -def _register_grouped_col1( - func: Callable, - context: ContextBase -) -> Callable: - """Register a function with argument of single column as groupby aware""" - - @register_func(DataFrame, context=None) - @wraps(func) - def wrapper( - # in case this is called directly (not in a piping env) - # we should not have the _data argument - # _data: DataFrame, - # _column: Any, - *args: Any, - **kwargs: Any - ) -> Any: - # Let's if the function is called in a piping env - # If so, the previous frame should be in functools - # Otherwise, it should be pipda.function, where the wrapped - # function should be called directly, instead of generating an - # Expression object - - if inspect.getmodule(sys._getframe(1)) is pipda.function: - # called directly - return func(*args, **kwargs) - _data, _column, *args = args - series = evaluate_expr(_column, _data, context) - args = evaluate_args(args, _data, context.args) - kwargs = evaluate_kwargs(kwargs, _data, context.kwargs) - return func(series, *args, **kwargs) - - @wrapper.register(DataFrameGroupBy) - def _( - _data: DataFrameGroupBy, - _column: Any, - *args: Any, - **kwargs: Any - ) -> Any: - series = evaluate_expr(_column, _data, context) - args = evaluate_args(args, _data, context.args) - kwargs = evaluate_kwargs(kwargs, _data, context.kwargs) - # Todo: check if we have SeriesGroupby in args/kwargs - return series.apply(func, *args, **kwargs) - - return wrapper - -def register_grouped( - func: Optional[Callable] = None, - context: Optional[Union[Context, ContextBase]] = None, - columns: Union[str, int] = 1 -) -> Callable: - """Register a function as a group-by-aware function""" - if func is None: - return lambda fun: register_grouped( - fun, - context=context, - columns=columns - ) - - if isinstance(context, Context): - context = context.value - - if columns == 1: - return _register_grouped_col1(func, context=context) - - if columns == 0: - return _register_grouped_col0(func, context=context) - - raise ValueError("Expect columns to be either 0 or 1.") - def group_df( df: DataFrame, - grouper: Union[BaseGrouper, StringOrIter] + grouper: Union[BaseGrouper, StringOrIter], + drop: Optional[bool] = None ) -> DataFrameGroupBy: + """Group a dataframe""" + from ..dplyr import group_by_drop_default + if drop is None: + drop = group_by_drop_default(df) + return df.groupby( grouper, as_index=False, - sort=False, + sort=True, dropna=False, - group_keys=False + group_keys=False, + observed=drop ) def groupby_apply( @@ -513,14 +452,39 @@ def groupby_apply( func: Callable, groupdata: bool = False ) -> DataFrame: - + """Apply a function to DataFrameGroupBy object""" if groupdata: # df.groupby(group_keys=True).apply does not always add group as index g_keys = df.grouper.names def apply_func(subdf): + if subdf is None or subdf.shape[0] == 0: + return None ret = func(subdf) - ret[g_keys] = df.obj[g_keys] - return ret[list_union(g_keys, ret.columns)] - return df.apply(apply_func).reset_index(drop=True) + for key in g_keys: + if key not in ret: + df_assign_item(ret, key, 
subdf[key].values[0]) + if is_categorical_dtype(subdf[key]): + ret[key] = Categorical( + ret[key], + categories=subdf[key].cat.categories + ) + columns = list_union(g_keys, ret.columns) + # keep the original order + commcols = [col for col in df.obj.columns if col in columns] + # make sure columns are included + columns = list_union(commcols, list_diff(columns, commcols)) + return ret[columns] + + ret = df.apply(apply_func).reset_index(drop=True) + else: + ret = df.apply(func).reset_index(drop=True) + + copy_flags(ret, df) + return ret - return df.apply(func).reset_index(drop=True) +def check_column_uniqueness(df: DataFrame, msg: Optional[str] = None) -> None: + """Check if column names are unique of a dataframe""" + try: + repair_names(df.columns.tolist(), repair="check_unique") + except NameNonUniqueError as error: + raise ValueError(msg or str(error)) from None diff --git a/datar/datar/__init__.py b/datar/datar/__init__.py index f1978b08..a3dd000f 100644 --- a/datar/datar/__init__.py +++ b/datar/datar/__init__.py @@ -1,3 +1,3 @@ """Specific verbs/funcs from this package""" -from .verbs import get, flatten, showme, debug, drop_index +from .verbs import get, flatten, display, debug, drop_index diff --git a/datar/datar/verbs.py b/datar/datar/verbs.py index 7251c2dc..b42dfcbc 100644 --- a/datar/datar/verbs.py +++ b/datar/datar/verbs.py @@ -1,14 +1,14 @@ """Specific verbs from this package""" -from datar.core.middlewares import RowwiseDataFrame import sys from typing import Any, List, Union from pandas import DataFrame from pandas.core.groupby.generic import DataFrameGroupBy -from pipda import register_verb, Context, evaluate_expr +from pipda import register_verb, evaluate_expr from ..core.utils import objectize, logger from ..core.types import DataFrameType +from ..core.contexts import Context from ..dplyr import select, slice # pylint: disable=redefined-builtin @register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) @@ -30,7 +30,8 @@ def get( A single element when both rows and cols are scalar, otherwise a subset of _data """ - _data = objectize(_data) + _data = objectize(_data).reset_index(drop=True) + # getting single element if ( rows is not None and @@ -97,12 +98,15 @@ def print_msg(msg: str, end: str = "\n"): print_msg("## Evaluated") print_msg(evaluate_expr(val, _data, context)) -@register_verb(DataFrame) -def showme(_data: DataFrame) -> DataFrame: +@register_verb(DataFrame, context=Context.EVAL) +def display(_data: DataFrame) -> DataFrame: """Let jupyter notebook show the (grouped) dataframe""" + rowwise_vars = getattr(_data.flags, 'rowwise', False) + if rowwise_vars: + logger.info('# [DataFrame] Rowwise: %s', rowwise_vars) return _data -@showme.register(DataFrameGroupBy) +@display.register(DataFrameGroupBy, context=Context.EVAL) def _(_data: DataFrameGroupBy) -> DataFrame: """Show the groups for grouped dataframe pandas only just shows repr. 
@@ -114,16 +118,7 @@ def _(_data: DataFrameGroupBy) -> DataFrame:
     )
     return _data.obj

-@showme.register(RowwiseDataFrame)
-def _(_data: RowwiseDataFrame) -> DataFrame:
-    """Show the groups for rowwise dataframe
-    """
-    logger.info(
-        '# [RowwiseDataFrame] Rowwise: %s',
-        _data.flags.rowwise,
-    )
-    return _data.obj
-
-@register_verb(DataFrame)
+@register_verb(DataFrame, context=Context.EVAL)
 def drop_index(_data: DataFrame) -> DataFrame:
+    """Drop the index of a dataframe, works as a verb"""
     return _data.reset_index(drop=True)
diff --git a/datar/dplyr/__init__.py b/datar/dplyr/__init__.py
index d1cf692a..f072a3d9 100644
--- a/datar/dplyr/__init__.py
+++ b/datar/dplyr/__init__.py
@@ -19,5 +19,6 @@
     if_else, n_distinct, n, row_number, cur_group_id, cur_group_rows,
     cur_group, cur_data, cur_data_all, cur_column, cummean, cumall, cumany,
     lead, lag, num_range, recode, recode_factor, recode_categorical,
-    coalesce, na_if, near, nth, first, last, between
+    coalesce, na_if, near, nth, first, last, between, group_by_drop_default,
+    n_groups, group_size
 )
diff --git a/datar/dplyr/funcs.py b/datar/dplyr/funcs.py
index 844985f2..2c792b55 100644
--- a/datar/dplyr/funcs.py
+++ b/datar/dplyr/funcs.py
@@ -7,106 +7,75 @@
 from pandas.core.arrays.categorical import Categorical
 from pandas.core.dtypes.common import is_categorical_dtype
 from pandas import DataFrame, Series
-from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy
+from pandas.core.groupby.generic import DataFrameGroupBy

-from pipda import register_func, Context
+from pipda import register_func
+from pipda.context import ContextBase
+from pipda.utils import functype

 from ..core.middlewares import (
-    Across, CAcross, CurColumn, DescSeries, IfAll, IfAny
+    Across, CurColumn, DescSeries, IfAll, IfAny
+)
+from ..core.types import (
+    BoolOrIter, DataFrameType, NumericOrIter, NumericType,
+    is_iterable, is_scalar
 )
-from ..core.types import BoolOrIter, DataFrameType, NumericOrIter, NumericType, is_scalar
 from ..core.exceptions import ColumnNotExistingError
-from ..core.utils import filter_columns, objectize, list_diff
+from ..core.utils import (
+    copy_flags, filter_columns, list_union,
+    objectize, list_diff, select_columns
+)
+from ..core.contexts import Context
 from ..base.constants import NA

 # pylint: disable=redefined-outer-name

-@register_func((DataFrame, DataFrameGroupBy), context=Context.SELECT)
-def desc(
-    _data: DataFrameType,
-    col: str
-) -> Union[DescSeries, SeriesGroupBy]:
-    """Returns a DescSeries object, which can be used in arrange or other
-    environments that need a descending ordered series
+@register_func(None, context=Context.EVAL)
+def desc(x: Iterable[Any]) -> Series:
+    """Transform a vector into a format that will be sorted in descending order
+
+    This is useful within arrange().
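The reworked `desc` (continued just below) negates the values so an ascending sort yields descending order, falling back to negated category codes for non-numeric input. A standalone pandas sketch of the same idea:

```python
import numpy
from pandas import Categorical, Series

def desc_sketch(x):
    """Negate values so that an ascending sort gives descending order."""
    x = Series(x)
    try:
        return -x
    except TypeError:
        # non-numeric input: use negated category codes; code -1 marks NaN
        code = Series(Categorical(x).codes).astype(float)
        code[code == -1.0] = numpy.nan
        return -code

print(desc_sketch([1, 2, 3]).tolist())        # [-1, -2, -3]
print(desc_sketch(["a", "c", "b"]).tolist())  # [-0.0, -2.0, -1.0]
```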
-    Returns:
-        The DescSeries object
-    """
-    if isinstance(_data, DataFrameGroupBy):
-        series = DescSeries(_data.obj[col].values, name=col)
-        return series.groupby(_data.grouper, dropna=False)
-    return DescSeries(_data[col].values, name=col)
-
-@register_func(context=Context.SELECT)
-def across(
-    _data: DataFrameType,
-    _cols: Optional[Iterable[str]] = None,
-    _fns: Optional[Union[
-        Callable,
-        Iterable[Callable],
-        Mapping[str, Callable]
-    ]] = None,
-    _names: Optional[str] = None,
-    **kwargs: Any
-) -> Across:
-    """Apply the same transformation to multiple columns
+    The original API:
+    https://dplyr.tidyverse.org/reference/desc.html

     Args:
-        _data: The dataframe
-        _cols: The columns
-        _fns: Functions to apply to each of the selected columns.
-        _names: A glue specification that describes how to name
-            the output columns. This can use `{_col}` to stand for the
-            selected column name, and `{_fn}` to stand for the name of
-            the function being applied.
-            The default (None) is equivalent to `{_col}` for the
-            single function case and `{_col}_{_fn}` for the case where
-            a list is used for _fns. In such a case, `{_fn}` is 1-based.
-            To use 1-based index, use `{_fn0}`
-        *args, **kwargs: Arguments for the functions
+        x: vector to transform

     Returns:
-        A dataframe with one column for each column in _cols and
-        each function in _fns.
+        The descending order of x
     """
-    return Across(_data, _cols, _fns, _names, (), kwargs)
-
-
-@register_func(context=Context.SELECT)
+    x = Series(x)
+    try:
+        return -x
+    except TypeError:
+        cat = Categorical(x)
+        code = Series(cat.codes).astype(float)
+        code[code == -1.] = NA
+        return -code
+
+@register_func(context=Context.SELECT, verb_arg_only=True)
 def c_across(
     _data: DataFrame,
-    _cols: Optional[Iterable[str]] = None,
-    _fns: Optional[Union[Mapping[str, Callable]]] = None,
-    _names: Optional[str] = None,
-    **kwargs: Any
-) -> CAcross:
+    _cols: Optional[Iterable[str]] = None
+) -> Series:
     """Apply the same transformation to multiple columns row-wise

     Args:
         _data: The dataframe
         _cols: The columns
-        _fns: Functions to apply to each of the selected columns.
-        _names: A glue specification that describes how to name
-            the output columns. This can use `{_col}` to stand for the
-            selected column name, and `{_fn}` to stand for the name of
-            the function being applied.
-            The default (None) is equivalent to `{_col}` for the
-            single function case and `{_col}_{_fn}` for the case where
-            a list is used for _fns. In such a case, `{_fn}` is 1-based.
-            To use 1-based index, use `{_fn0}`
-        *args, **kwargs: Arguments for the functions

     Returns:
-        A dataframe with one column for each column in _cols and
-        each function in _fns.
+        A series
     """
-    return CAcross(_data, _cols, _fns, _names, (), kwargs)
+    if not _cols:
+        _cols = _data.columns
+    _cols = select_columns(_data.columns.tolist(), _cols)
+    series = [_data[col] for col in _cols]
+    return numpy.concatenate(series)

-@register_func
+@register_func(context=Context.SELECT)
 def starts_with(
     _data: DataFrameType,
     match: Union[Iterable[str], str],
@@ -132,7 +101,7 @@
         lambda mat, cname: cname.startswith(mat),
     )

-@register_func
+@register_func(context=Context.SELECT)
 def ends_with(
     _data: DataFrameType,
     match: str,
@@ -159,7 +128,7 @@
     )

-@register_func
+@register_func(context=Context.SELECT)
 def contains(
     _data: DataFrameType,
     match: str,
@@ -185,7 +154,7 @@
         lambda mat, cname: mat in cname,
     )

-@register_func
+@register_func(context=Context.SELECT)
 def matches(
     _data: DataFrameType,
     match: str,
@@ -211,7 +180,7 @@
         re.search,
     )

-@register_func
+@register_func(DataFrame)
 def everything(_data: DataFrame) -> List[str]:
     """Matches all columns.
@@ -224,9 +193,14 @@
     grouper = getattr(_data.flags, 'grouper', None)
     if grouper is not None:
         return list_diff(_data.columns.tolist(), grouper.names)
-    return _data.columns.to_list()
+    return _data.columns.tolist()

-@register_func
+@everything.register(DataFrameGroupBy)
+def _(_data: DataFrameGroupBy) -> List[str]:
+    """All columns for a grouped dataframe"""
+    return list_diff(_data.obj.columns.tolist(), _data.grouper.names)
+
+@register_func(context=Context.SELECT)
 def last_col(
     _data: DataFrameType,
     offset: int = 0,
@@ -247,7 +221,7 @@
     vars = vars or _data.columns
     return vars[-(offset+1)]

-@register_func
+@register_func(context=Context.EVAL)
 def all_of(
     _data: DataFrameType,
     x: Iterable[Union[int, str]]
@@ -278,7 +252,7 @@

     return list(x)

-@register_func
+@register_func(context=Context.SELECT)
 def any_of(
     _data: DataFrameType,
     x: Iterable[Union[int, str]],
@@ -299,7 +273,7 @@
     vars = vars or objectize(_data).columns
     return [elem for elem in x if elem in vars]

-@register_func((DataFrame, DataFrameGroupBy))
+@register_func((DataFrame, DataFrameGroupBy), context=Context.EVAL)
 def where(_data: DataFrameType, fn: Callable) -> List[str]:
     """Selects the variables for which a function returns True.
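As a side note, the reworked `c_across` above reduces to selecting the columns of the current (one-row) frame and concatenating their values. A plain numpy/pandas sketch with made-up column names:

```python
import numpy
from pandas import DataFrame

row = DataFrame({"x": [1], "y": [2], "z": [3]})  # one row, as under rowwise()
cols = ["x", "y", "z"]
# c_across-style: flatten the selected columns of the row into one vector
values = numpy.concatenate([row[col] for col in cols])
print(values)  # [1 2 3]
```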
@@ -312,23 +286,18 @@ def where(_data: DataFrameType, fn: Callable) -> List[str]:
     Returns:
         The matched columns
     """
+    columns = everything(_data)
     _data = objectize(_data)
     retcols = []
-
-    pipda_type = getattr(fn, '__pipda__', None)
-    for col in _data.columns:
-        if not pipda_type:
+    pipda_type = functype(fn)
+    for col in columns:
+        if pipda_type == 'plain':
             conditions = fn(_data[col])
+        elif pipda_type == 'plain-func':
+            conditions = fn(_data[col], _env=_data)
         else:
-            conditions = (
-                fn(_data[col], _calling_type='piping').evaluate(_data)
-                if pipda_type == 'PlainFunction'
-                else fn(
-                    _data,
-                    _data[col],
-                    _calling_type='piping'
-                ).evaluate(_data)
-            )
+            conditions = fn(_data, _data[col], _env=_data)
+
         if isinstance(conditions, bool):
             if conditions:
                 retcols.append(col)
@@ -339,36 +308,88 @@ def where(_data: DataFrameType, fn: Callable) -> List[str]:

     return retcols

-@register_func(context=Context.SELECT)
+@register_func(
+    context=None,
+    extra_contexts={'_cols': Context.SELECT},
+    verb_arg_only=True
+)
+def across(
+    _data: DataFrameType,
+    _cols: Optional[Iterable[str]] = None,
+    _fns: Optional[Union[
+        Callable,
+        Iterable[Callable],
+        Mapping[str, Callable]
+    ]] = None,
+    _names: Optional[str] = None,
+    _context: Optional[ContextBase] = None,
+    **kwargs: Any
+) -> DataFrame:
+    """Apply the same transformation to multiple columns
+
+    The original API:
+    https://dplyr.tidyverse.org/reference/across.html
+
+    Args:
+        _data: The dataframe
+        _cols: The columns
+        _fns: Functions to apply to each of the selected columns.
+        _names: A glue specification that describes how to name
+            the output columns. This can use `{_col}` to stand for the
+            selected column name, and `{_fn}` to stand for the name of
+            the function being applied.
+            The default (None) is equivalent to `{_col}` for the
+            single function case and `{_col}_{_fn}` for the case where
+            a list is used for _fns. In such a case, `{_fn}` is 0-based.
+            To use 1-based index, use `{_fn1}`
+        **kwargs: Arguments for the functions
+
+    Returns:
+        A dataframe with one column for each column in _cols and
+        each function in _fns.
+    """
+    return Across(_data, _cols, _fns, _names, kwargs).evaluate(_context)
+
+@register_func(
+    context=None,
+    extra_contexts={'_cols': Context.SELECT},
+    verb_arg_only=True
+)
 def if_any(
     _data: DataFrame,
     _cols: Optional[Iterable[str]] = None,
     _fns: Optional[Union[Mapping[str, Callable]]] = None,
     _names: Optional[str] = None,
+    _context: Optional[ContextBase] = None,
     **kwargs: Any
-) -> Across:
+) -> Iterable[bool]:
     """Apply the same predicate function to a selection of columns
     and combine the results: True if any element is True.

     See across().
     """
-    return IfAny(_data, _cols, _fns, _names, (), kwargs)
+    return IfAny(_data, _cols, _fns, _names, kwargs).evaluate(_context)

-@register_func(context=Context.SELECT)
+@register_func(
+    context=None,
+    extra_contexts={'_cols': Context.SELECT},
+    verb_arg_only=True
+)
 def if_all(
     _data: DataFrame,
     _cols: Optional[Iterable[str]] = None,
     _fns: Optional[Union[Mapping[str, Callable]]] = None,
     _names: Optional[str] = None,
+    _context: Optional[ContextBase] = None,
     **kwargs: Any
-) -> Across:
+) -> Iterable[bool]:
     """Apply the same predicate function to a selection of columns
     and combine the results: True if all elements are True.

     See across().
     """
-    return IfAll(_data, _cols, _fns, _names, (), kwargs)
+    return IfAll(_data, _cols, _fns, _names, kwargs).evaluate(_context)

 def _ranking(
     data: Iterable[Any],
@@ -441,6 +462,7 @@ def case_when(
     """Vectorise multiple if_else() statements.
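For intuition on what `if_any`/`if_all` now evaluate to (row-wise any/all over the predicate results), here is a rough pandas equivalent, not datar's actual code path:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 0], "b": [0, 0]})
pred = df[["a", "b"]] > 0          # predicate applied across the selection
print(pred.any(axis=1).tolist())   # if_any-style -> [True, False]
print(pred.all(axis=1).tolist())   # if_all-style -> [False, False]
```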
     Args:
+        _data: The data frame.
         *when_cases: An even-size sequence, with the 2n-th elements the
             values to match, and the (2n+1)-th elements the values to replace.
             When a matching value is True, the next value will be the default to
@@ -459,10 +481,31 @@
         if case is True:
             df['x'] = ret
         else:
-            df.loc[case, 'x'] = ret
+            df.loc[case.reset_index(drop=True), 'x'] = ret

     return df.x

+@case_when.register((list, tuple, numpy.ndarray))
+def _(
+    _data: Union[list, tuple, numpy.ndarray],
+    *when_cases: Any
+) -> numpy.ndarray:
+    """case_when on lists/tuples"""
+    if len(when_cases) % 2 != 0:
+        raise ValueError('Number of arguments of case_when should be even.')
+
+    array = numpy.array(_data)
+    nrow = len(array)
+    ret = numpy.array([NA] * nrow)
+    when_cases = reversed(list(zip(when_cases[0::2], when_cases[1::2])))
+    for case, val in when_cases:
+        if case is True:
+            ret[numpy.isnan(ret)] = val
+        else:
+            ret[case] = val
+
+    return ret
+
 @register_func((DataFrame, DataFrameGroupBy), context=Context.EVAL)
 def if_else(
     _data: DataFrameType,
@@ -515,7 +558,7 @@ def n(series: Iterable[Any]) -> int:

 @register_func(context=Context.EVAL)
 def row_number(_data: Iterable[Any]) -> Series:
     """Gives the row number, 0-based."""
-    return Series(range(len(_data)))
+    return Series(range(len(_data)), dtype='int')

 @register_func(DataFrame)
 def cur_group_id(_data: DataFrame) -> int:
@@ -538,10 +581,10 @@

 @register_func(DataFrame)
 def cur_group_rows(_data: DataFrame) -> int:
     """gives the row indices for the current group."""
-    return _data.index
+    return _data.index.tolist()

 @register_func(DataFrame)
-def cur_group(_data: DataFrame) -> Series:
+def cur_group(_data: DataFrame) -> DataFrame:
     """gives the group keys, a tibble with one row and one column
     for each grouping variable."""
     grouper = getattr(_data.flags, 'grouper', None)
@@ -550,8 +593,10 @@
             'To get current group, a dataframe must be grouped '
             'using `datar.dplyr.group_by`'
         )
-
-    return _data[grouper.names]
+    group_id = cur_group_id(_data)
+    levels = grouper.get_group_levels()
+    group = list(zip(*levels))[group_id]
+    return DataFrame(group, columns=grouper.names).reset_index(drop=True)

 @register_func(DataFrame)
 def cur_data(_data: DataFrame) -> int:
@@ -559,18 +604,25 @@
     (excluding grouping variables)."""
     grouper = getattr(_data.flags, 'grouper', None)
     if not grouper:
-        raise ValueError(
-            'To get current group data, a dataframe must be grouped '
-            'using `datar.dplyr.group_by`'
-        )
+        return _data
+
+    copied = _data.copy()[[
+        col for col in _data.columns if col not in grouper.names
+    ]]

-    return _data.drop(columns=grouper.names)
+    copy_flags(copied, _data)
+    return copied.reset_index(drop=True)

 @register_func(DataFrame)
 def cur_data_all(_data: DataFrame) -> int:
     """gives the current data for the current group
     (including grouping variables)"""
-    return _data.copy()
+    level = cur_group(_data)
+    for group in level.columns:
+        _data[group] = level[group].values[0]
+    return _data[
+        list_union(level.columns.tolist(), _data.columns)
+    ].reset_index(drop=True)

 def cur_column() -> CurColumn:
     """Used in the functions of across.
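The list/ndarray registration of `case_when` above applies the case/value pairs in reverse so that earlier conditions win, with a literal `True` acting as the catch-all default. Condensed to its core:

```python
import numpy

x = numpy.array([1, 2, 3, 4])
cases = [(x > 2, 10.0), (True, 0.0)]   # (condition, replacement) pairs
ret = numpy.array([numpy.nan] * len(x))
for case, val in reversed(cases):
    if case is True:
        ret[numpy.isnan(ret)] = val    # default fills what is still NA
    else:
        ret[case] = val                # earlier-listed cases overwrite later ones
print(ret)  # [ 0.  0. 10. 10.]
```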
     So we don't have to register it."""
@@ -787,7 +839,7 @@ def recode_factor(

 recode_categorical = recode_factor # pylint: disable=invalid-name

 @register_func(None, context=Context.EVAL)
-def coalesce(x: Any, replace: Any) -> Any:
+def coalesce(x: Any, *replace: Any) -> Any:
     """Replace missing values

     https://dplyr.tidyverse.org/reference/coalesce.html
@@ -800,21 +852,28 @@
         A vector the same length as the first argument with missing values
         replaced by the first non-missing value.
     """
+    if not replace:
+        return x
+
     x = objectize(x)
-    if isinstance(x, Iterable):
-        if not isinstance(replace, Iterable):
-            replace = [replace] * len(x)
-        elif len(replace) != len(x):
-            raise ValueError(
-                f"Expect length {len(x)} for coalesce replacement, "
-                f"got {len(replace)}"
+    if isinstance(x, DataFrame):
+        y = x.copy()
+        copy_flags(y, x)
+        for repl in replace:
+            x = y.combine_first(repl)
+            copy_flags(x, y)
+            y = x
+        return y
+
+    if is_iterable(x):
+        x = Series(x)
+        for repl in replace:
+            x = x.combine_first(
+                Series(repl if is_iterable(repl) else [repl] * len(x))
             )
-        return [
-            rep if numpy.isnan(elem) else elem
-            for elem, rep in zip(x, replace)
-        ]
+        return x.values

-    return replace if numpy.isnan(x) else x
+    return replace[0] if numpy.isnan(x) else x

 @register_func(None, context=Context.EVAL)
 def na_if(x: Iterable[Any], y: Any) -> Iterable[Any]:
@@ -902,3 +961,26 @@ def last(
         return x[-1]
     except IndexError:
         return default
+
+def group_by_drop_default(data: DataFrameType) -> bool:
+    """Get the groupby _drop attribute of dataframe"""
+    return getattr(objectize(data).flags, 'groupby_drop', True)
+
+def n_groups(data: DataFrameType) -> int:
+    """Get the number of groups"""
+    if isinstance(data, DataFrame):
+        return 1
+
+    # when dropna=False with NAs
+    # https://github.com/pandas-dev/pandas/issues/35202
+    # return len(data)
+    return len(data.size())
+
+def group_size(data: DataFrameType) -> List[int]:
+    """Get the group sizes as a list of integers"""
+    if isinstance(data, DataFrame):
+        return data.shape[0]
+    gsize = data.size()
+    if isinstance(gsize, Series):
+        return gsize.tolist()
+    return gsize['size'].tolist()
diff --git a/datar/dplyr/verbs.py b/datar/dplyr/verbs.py
index af0e5e9f..f393a0b5 100644
--- a/datar/dplyr/verbs.py
+++ b/datar/dplyr/verbs.py
@@ -1,39 +1,50 @@
+# pylint: disable=too-many-lines
 """Verbs ported from R-dplyr"""
-from typing import Any, Callable, Iterable, List, Mapping, Optional, Union
+from typing import (
+    Any, Callable, Iterable, List, Mapping, Optional, Union
+)

 import numpy
 import pandas
-from pandas.core.indexes.multi import MultiIndex
-from pandas import DataFrame, Series
-from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy
+from pandas import DataFrame, Series, RangeIndex, Categorical
+from pandas.api.types import union_categoricals
+from pandas.core.groupby.generic import DataFrameGroupBy

-from pipda import register_verb, Context, evaluate_expr, evaluate_args
+from pipda import register_verb, evaluate_expr
+from pipda.utils import Expression

-from ..core.middlewares import (
-    Across, CAcross, Collection,
-    DescSeries, IfCross, Inverted, RowwiseDataFrame
-)
+from ..core.middlewares import Inverted
 from ..core.types import (
-    DataFrameType, SeriesLikeType, StringOrIter, is_scalar
+    DataFrameType, NoneType, NumericOrIter, SeriesLikeType, StringOrIter,
+    is_scalar
 )
-from ..core.contexts import ContextEvalWithUsedRefs, ContextSelectSlice
-from ..core.exceptions import ColumnNameInvalidError
+from ..core.contexts import ContextEval, ContextSelectSlice
+from ..core.exceptions import ColumnNameInvalidError, ColumnNotExistingError
 from ..core.utils import (
-    align_value, copy_df, df_assign_item, expand_slice, get_n_from_prop,
-    group_df, groupby_apply, list_diff, list_intersect, list_union,
-    objectize, select_columns, to_df, logger
+    align_value, check_column_uniqueness, copy_flags, df_assign_item,
+    expand_slice, get_n_from_prop, group_df, groupby_apply, list_diff,
+    list_intersect, list_union, objectize, select_columns, to_df,
+    logger, update_df
 )
+from ..core.names import repair_names
+from ..core.contexts import Context
+from ..tibble.funcs import tibble
+from ..base.funcs import is_categorical
+from .funcs import group_by_drop_default

-# pylint: disable=redefined-builtin
+# pylint: disable=redefined-builtin,no-value-for-parameter

-@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL)
+@register_verb(DataFrame, context=Context.EVAL)
 def arrange(
-    _data: DataFrameType,
-    *series: Union[Series, SeriesGroupBy, Across],
+    _data: DataFrame,
+    *series: Iterable[Any],
     _by_group: bool = False
-) -> DataFrameType:
+) -> DataFrame:
     """orders the rows of a data frame by the values of selected columns.

+    The original API:
+    https://dplyr.tidyverse.org/reference/arrange.html
+
     Args:
         _data: A data frame
         *series: Variables, or functions of variables.
@@ -50,61 +61,73 @@
         Groups are not modified.
         Data frame attributes are preserved.
     """
-    data = objectize(_data)
-    sorting_df = (
-        data.index.to_frame(name='__index__').drop(columns=['__index__'])
-    )
-    desc_cols = set()
-    acrosses = []
-    kwargs = {}
-    for ser in series:
-        if isinstance(ser, Across):
-            desc_cols |= ser.desc_cols()
-            if ser.fns:
-                acrosses.append(ser)
-            else:
-                for col in ser.cols:
-                    kwargs[col] = data[col].values
-        else:
-            ser = objectize(ser)
-            if isinstance(ser, DescSeries):
-                desc_cols.add(ser.name)
-            kwargs[ser.name] = ser.values
+    if not series:
+        return _data

-    sorting_df = mutate(sorting_df, *acrosses, **kwargs)
+    check_column_uniqueness(
+        _data,
+        "Cannot arrange a data frame with duplicate names."
+    )

-    by = sorting_df.columns.to_list()
-    if isinstance(_data, DataFrameGroupBy):
-        for key in _data.grouper.names:
-            if key not in sorting_df:
-                sorting_df[key] = _data.obj[key].values
-        if _by_group:
-            by = list_union(_data.grouper.names, by)
+    sorting_df = DataFrame(index=_data.index) >> mutate(*series)
+    by = sorting_df.columns.tolist()
+    sorting_df.sort_values(by=by, inplace=True)

-    ascending = [col not in desc_cols for col in by]
-    sorting_df.sort_values(by=by, ascending=ascending, inplace=True)
-    data = data.loc[sorting_df.index, :]
+    ret = _data.loc[sorting_df.index, :]
+    copy_flags(ret, _data)
+    return ret

-    if isinstance(_data, DataFrameGroupBy):
-        return group_df(data, _data.grouper)
+@arrange.register(DataFrameGroupBy, context=Context.PENDING)
+def _(
+    _data: DataFrameGroupBy,
+    *series: Any,
+    _by_group: bool = False
+) -> DataFrameGroupBy:
+    """Arrange grouped dataframe"""
+    if not _by_group:
+        ret = _data.obj >> arrange(*series)
+    else:
+        ret = _data.obj >> arrange(
+            *(_data.obj[col] for col in _data.grouper.names),
+            *series
+        )
+    copy_flags(ret, _data)
+    return group_df(ret, _data.grouper.names)

-    return data
+def _mutate_rowwise(_data: DataFrame, *args: Any, **kwargs: Any) -> DataFrame:
+    """Mutate on rowwise data frame"""
+    if _data.shape[0] > 0:
+        def apply_func(ser):
+            return (ser.to_frame().T >> mutate(*args, **kwargs)).iloc[0, :]
+        applied = _data.apply(
+            apply_func,
+            axis=1
+        ).reset_index(drop=True)
+    else:
+        applied = DataFrame(
+            columns=list_union(_data.columns.tolist(), kwargs.keys())
+        )
+    copy_flags(applied, _data) # rowwise copied
+    return applied

-@register_verb(DataFrame, context=None)
+@register_verb(DataFrame, context=Context.PENDING)
 def mutate(
     _data: DataFrame,
-    *acrosses: Across,
+    *args: Any,
     _keep: str = 'all',
     _before: Optional[str] = None,
     _after: Optional[str] = None,
     **kwargs: Any
 ) -> DataFrame:
-    """adds new variables and preserves existing ones
+    # pylint: disable=too-many-branches
+    """Adds new variables and preserves existing ones
+
+    The original API:
+    https://dplyr.tidyverse.org/reference/mutate.html

     Args:
         _data: A data frame
-        *acrosses: Values from across function
         _keep: allows you to control which columns from _data are retained
             in the output:
             - "all", the default, retains all variables.
@@ -117,7 +140,7 @@
         _before, _after: Optionally, control where new columns should appear
             (the default is to add to the right hand side).
             See relocate() for more details.
-        **kwargs: Name-value pairs. The name gives the name of the column
+        *args, **kwargs: Name-value pairs. The name gives the name of the column
             in the output. The value can be:
             - A vector of length 1, which will be recycled to the correct
                 length.
@@ -126,7 +149,7 @@
             - None to remove the column

     Returns:
-        An object of the same type as .data. The output has the following
+        An object of the same type as _data. The output has the following
         properties:
         - Rows are not affected.
         - Existing columns will be preserved according to the _keep
@@ -138,47 +161,62 @@
         - Groups will be recomputed if a grouping variable is mutated.
         - Data frame attributes are preserved.
     """
-    context = ContextEvalWithUsedRefs()
+    if getattr(_data.flags, 'rowwise', False):
+        return _mutate_rowwise(
+            _data,
+            *args,
+            _keep=_keep,
+            _before=_before,
+            _after=_after,
+            **kwargs
+        )
+
+    context = ContextEval()
+
     if _before is not None:
         _before = evaluate_expr(_before, _data, Context.SELECT)
     if _after is not None:
         _after = evaluate_expr(_after, _data, Context.SELECT)

-    across = {} # no need OrderedDict in python3.7+ anymore
-    for acrs in acrosses:
-        acrs = evaluate_expr(acrs, _data, context)
-        across.update(
-            acrs.evaluate(context)
-            if isinstance(acrs, Across) else acrs
-        )
-
-    across.update(kwargs)
-    kwargs = across
+    data = _data.copy()
+    copy_flags(data, _data)
+
+    serieses = {} # no need OrderedDict in python3.7+ anymore
+    for i, ser in enumerate(args):
+        ser = evaluate_expr(ser, data, context)
+        if isinstance(ser, Series):
+            serieses[ser.name] = ser.values
+        elif isinstance(ser, DataFrame):
+            serieses.update(ser.to_dict('series'))
+        elif isinstance(ser, dict):
+            serieses.update(ser)
+        else:
+            serieses[f'V{i}'] = ser

-    data = copy_df(_data)
-    if isinstance(_data, RowwiseDataFrame):
-        data = RowwiseDataFrame(data, rowwise=_data.flags.rowwise)
+    serieses.update(kwargs)

-    for key, val in kwargs.items():
+    for key, val in serieses.items():
         if val is None:
             data.drop(columns=[key], inplace=True)
             continue

         val = evaluate_expr(val, data, context)
-
-        if isinstance(val, CAcross):
-            val.names = key
-        if isinstance(val, Across):
-            val = DataFrame(val.evaluate(context, data))
-
         value = align_value(val, data)
         if isinstance(value, DataFrame):
-            for col in value.columns:
-                df_assign_item(data, f'{key}${col}', value[col])
+            if value.shape[1] == 1:
+                df_assign_item(
+                    data,
+                    key if isinstance(value.columns, RangeIndex)
+                    else value.columns[0],
+                    value.iloc[:, 0]
+                )
+            else:
+                for col in value.columns:
+                    df_assign_item(data, f'{key}${col}', value[col])
         else:
             df_assign_item(data, key, value)

-    outcols = list(kwargs)
+    outcols = list(serieses)
     # do the relocate first
     if _before is not None or _after is not None:
         # pylint: disable=no-value-for-parameter
@@ -195,41 +233,44 @@
             data = data[outcols]
     # else:
     # raise

-    if (isinstance(_data, RowwiseDataFrame) and
-            not isinstance(data, RowwiseDataFrame)):
-        return RowwiseDataFrame(data, rowwise=_data.flags.rowwise)
-
     return data

-@mutate.register(DataFrameGroupBy, context=None)
+@mutate.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     *args: Any,
     **kwargs: Any
 ) -> DataFrameGroupBy:
     """Mutate on DataFrameGroupBy object"""
-    def apply_func(df):
-        df.flags.grouper = _data.grouper
-        return df >> mutate(*args, **kwargs)
-
-    return group_df(groupby_apply(_data, apply_func), _data.grouper)
+    if _data.obj.shape[0] > 0:
+        def apply_func(df):
+            copy_flags(df, _data)
+            # allow group context to work, such as cur_data()
+            df.flags.grouper = _data.grouper
+            return df >> mutate(*args, **kwargs)
+
+        applied = groupby_apply(_data, apply_func) # index reset
+    else:
+        applied = DataFrame(
+            columns=list_union(_data.obj.columns.tolist(), kwargs.keys())
+        )
+    return group_df(applied, _data.grouper)

 # Forward pipda.Expression for mutate to evaluate
-@register_verb((DataFrame, DataFrameGroupBy), context=None)
+@register_verb((DataFrame, DataFrameGroupBy), context=Context.PENDING)
 def transmutate(
     _data: DataFrameType,
-    *acrosses: Across,
+    *series: Iterable[Any],
     _before: Optional[str] = None,
     _after: Optional[str] = None,
     **kwargs: Any
-) -> DataFrame:
+) -> DataFrameType:
     """Mutate with _keep='none'

     See mutate().
""" - return mutate( - _data, - *acrosses, + return _data >> mutate( + *series, _keep='none', _before=_before, _after=_after, @@ -286,7 +327,7 @@ def relocate( return ret -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) def select( _data: DataFrameType, *columns: Union[StringOrIter, Inverted], @@ -301,29 +342,35 @@ def select( Returns: The dataframe with select columns """ + if isinstance(_data, DataFrameGroupBy): + data = _data.obj + groups = _data.grouper.names + else: + data = _data + groups = [] + selected = select_columns( - _data.columns, + data.columns, *columns, *renamings.values() ) + selected = list_union(groups, selected) # old -> new new_names = {val: key for key, val in renamings.items() if val in selected} - data = objectize(_data)[selected] + data = data[selected] if new_names: data = data.rename(columns=new_names) + copy_flags(data, _data) if isinstance(_data, DataFrameGroupBy): return group_df(data, _data.grouper) return data -@register_verb((DataFrame, DataFrameGroupBy)) -def rowwise(_data: DataFrameType, *columns: str) -> RowwiseDataFrame: +@register_verb(DataFrame, context=Context.SELECT) +def rowwise(_data: DataFrame, *columns: str) -> DataFrame: """Compute on a data frame a row-at-a-time - Note: - If the dataframe is grouped, the group information will be lost - Args: _data: The dataframe *columns: Variables to be preserved when calling summarise(). @@ -331,17 +378,35 @@ def rowwise(_data: DataFrameType, *columns: str) -> RowwiseDataFrame: uniquely identify each row. Returns: - A row-wise data frame with class RowwiseDataFrame + A row-wise data frame """ - _data = objectize(_data) - columns = select_columns(_data.columns, columns) - return RowwiseDataFrame(_data, rowwise=columns) + check_column_uniqueness(_data) + data = _data.copy() + copy_flags(data, _data) + if not columns: + columns = True + else: + columns = select_columns(_data.columns, columns) + data.flags.rowwise = columns + return data -@register_verb(DataFrame, context=None) +@rowwise.register(DataFrameGroupBy, context=Context.SELECT) +def _(_data: DataFrameGroupBy, *columns: str) -> DataFrame: + if columns: + raise ValueError( + "Can't re-group when creating rowwise data." + ) + data = _data.obj.copy() + copy_flags(data, _data) + data.flags.rowwise = _data.grouper.names + return data + +@register_verb(DataFrame, context=Context.PENDING) def group_by( _data: DataFrame, - *columns: str, + *args: Any, _add: bool = False, # not working, since _data is not grouped + _drop: Optional[bool] = None, **kwargs: Any ) -> DataFrameGroupBy: """Takes an existing tbl and converts it into a grouped tbl where @@ -349,28 +414,34 @@ def group_by( Args: _data: The dataframe - *columns: variables or computations to group by. + *args: variables or computations to group by. 
**kwargs: Extra variables to group the dataframe Return: A DataFrameGroupBy object """ - if kwargs: - _data = mutate(_data, **kwargs) - - columns = evaluate_args(columns, _data, Context.SELECT) - columns = select_columns(_data.columns, *columns, *kwargs.keys()) + data = _data.copy() + copy_flags(data, _data) + mutated = _data >> mutate(*args, **kwargs, _keep='none') + update_df(data, mutated) + + data.flags.groupby_drop = ( + group_by_drop_default(_data) + if _drop is None else _drop + ) # requires pandas 1.2+ # eariler versions have bugs with apply/transform # GH35889 - return group_df(_data, columns) + data.reset_index(drop=True, inplace=True) + return group_df(data, mutated.columns.tolist(), drop=_drop) -@group_by.register(DataFrameGroupBy) +@group_by.register(DataFrameGroupBy, context=Context.PENDING) def _( _data: DataFrameGroupBy, - *columns: str, + *args: Any, _add: bool = False, + _drop: Optional[bool] = None, **kwargs: Any ) -> DataFrameGroupBy: """Group by on a DataFrameGroupBy object @@ -382,33 +453,50 @@ def _( override existing groups. To add to the existing groups, use _add = TRUE. """ - if kwargs: - _data = mutate(_data, **kwargs) + mutated = _data >> mutate(*args, **kwargs, _keep='none') + data = _data.obj.copy() + update_df(data, mutated.obj) + copy_flags(data, _data) - columns = evaluate_args(columns, _data, Context.SELECT) - columns = select_columns(_data.obj.columns, *columns, *kwargs.keys()) if _add: - groups = Collection(*_data.grouper.names) + columns - return group_df(_data.obj, groups) - return group_df(_data.obj, columns) + columns = list_union(_data.grouper.names, mutated.obj.columns) + else: + columns = mutated.obj.columns.tolist() + return group_df(data, columns, drop=_drop) -@register_verb(DataFrameGroupBy) -def ungroup(_data: DataFrameGroupBy) -> DataFrame: +@register_verb(DataFrameGroupBy, context=Context.SELECT) +def ungroup(_data: DataFrameGroupBy, *cols: str) -> DataFrameType: """Ungroup a grouped dataframe Args: _data: The grouped dataframe + *cols: Columns to remove from grouping Returns: - The ungrouped dataframe + The ungrouped dataframe or DataFrameGroupBy object with remaining + grouping variables. 
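The extended `ungroup` above can also peel off only some grouping variables; conceptually this is just regrouping by the remaining keys. A small plain-pandas sketch (toy column names):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1], "b": [2, 3], "x": [4, 5]})
gvars = ["a", "b"]                        # current grouping variables
drop = ["b"]                              # ungroup(f.b)-style request
remaining = [g for g in gvars if g not in drop]
regrouped = df.groupby(remaining)         # i.e. group_df(_data, new_vars)
print(regrouped.ngroups)                  # 1
```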
""" - return _data.obj + if not cols: + return _data.obj + + gvars = _data.grouper.names + for col in cols: + if col not in gvars: + raise ValueError(f'Not a grouping variable: {col!r}') + new_vars = list_diff(gvars, cols) + return group_df(_data, new_vars) + +@ungroup.register(DataFrame, context=Context.EVAL) +def _(_data: DataFrame, *cols: str) -> DataFrame: + if cols: + raise ValueError(f'Dataframe is not grouped by {cols}') + return _data # ------------------------------ # group data -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def group_keys( _data: DataFrameType, *cols: str, @@ -420,19 +508,24 @@ def group_keys( group_levels = list(_data.groups.keys()) return DataFrame(group_levels, columns=_data.grouper.names) -@register_verb(DataFrameGroupBy) +@register_verb(DataFrameGroupBy, context=Context.EVAL) def group_rows(_data: DataFrameGroupBy) -> List[str]: """Returns the rows which each group contains""" return _data.grouper.groups -@register_verb(DataFrameGroupBy) +@register_verb(DataFrameGroupBy, context=Context.EVAL) def group_vars(_data: DataFrameGroupBy) -> List[str]: """gives names of grouping variables as character vector""" return _data.grouper.names +@group_vars.register(DataFrame, context=Context.EVAL) +def _(_data: DataFrame) -> List[str]: + """Group vars of DataFrame""" + return getattr(_data.flags, 'rowwise', None) or [] + group_cols = group_vars # pylint: disable=invalid-name -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def group_map( _data: DataFrameType, func: Callable[[DataFrame], Any] @@ -444,7 +537,7 @@ def group_map( func(_data.obj.loc[index]) for index in _data.grouper.groups.values() ] -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def group_modify( _data: DataFrameType, func: Callable[[DataFrame], DataFrame] @@ -454,7 +547,7 @@ def group_modify( return func(_data) return _data.apply(func).reset_index(drop=True, level=0) -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def group_walk( _data: DataFrameType, func: Callable[[DataFrame], None] @@ -464,14 +557,14 @@ def group_walk( func(_data) _data.apply(func) -@register_verb(DataFrameGroupBy) +@register_verb(DataFrameGroupBy, context=Context.EVAL) def group_trim( _data: DataFrameGroupBy ) -> DataFrameGroupBy: """Trim the unused group levels""" return group_df(_data.obj, _data.grouper.names) -@register_verb((DataFrame, DataFrameGroupBy)) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) def group_split( _data: DataFrameType, *cols: str, @@ -479,19 +572,18 @@ def group_split( **kwargs: Any ) -> DataFrameGroupBy: """Get a list of data in each group""" - if isinstance(_data, RowwiseDataFrame): - _data = objectize(_data) - return [_data.iloc[[i], :] for i in range(_data.shape[0])] - if isinstance(_data, DataFrameGroupBy): return [ _data.obj.loc[index] for index in _data.grouper.groups.values() ] + if getattr(_data.flags, 'rowwise', None): + return [_data.iloc[[i], :] for i in range(_data.shape[0])] + _data = group_by(_data, *cols, **kwargs) return group_split(_data) -@register_verb((DataFrame, DataFrameGroupBy), context=Context.UNSET) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.PENDING) def with_groups( _data: DataFrameType, _groups: Optional[StringOrIter], @@ -521,13 +613,78 @@ def with_groups( return 
_func(_data, *args, **kwargs) -@register_verb(DataFrame, context=Context.MIXED) +def _summarise_rowwise( + _data: DataFrame, + *dfs: Union[DataFrame, Mapping[str, Iterable[Any]]], + _groups: Optional[str] = None, + **kwargs: Any +) -> DataFrameType: + """Summarise on rowwise dataframe""" + rowwise_vars = _data.flags.rowwise + + def apply_func(ser): + row = ser.to_frame().T.reset_index(drop=True) + summarised = row >> summarise(*dfs, **kwargs) + summarised.reset_index(drop=True, inplace=True) + if rowwise_vars and rowwise_vars is not True: + ret = row[rowwise_vars].iloc[range(summarised.shape[0]), :] + ret[summarised.columns.tolist()] = summarised + return ret + + return summarised + + if _data.shape[0] == 0: + columns = list(kwargs) + if rowwise_vars and rowwise_vars is not True: + columns = list_union(rowwise_vars, columns) + applied = DataFrame(columns=columns) + else: + applied = pandas.concat( + (apply_func(row[1]) for row in _data.iterrows()), + axis=0 + ) + copy_flags(applied, _data) + applied.flags.rowwise = False + + if rowwise_vars is True: + if _groups == 'rowwise': + applied.flags.rowwise = True + return applied + + if _groups is None and summarise.inform: + logger.info( + '`summarise()` has ungrouped output. ' + 'You can override using the `_groups` argument.' + ) + + return applied + + # rowwise vars set + if _groups == 'rowwise': + applied.flags.rowwise = True + return applied + + if _groups is None and summarise.inform: + logger.info( + '`summarise()` has grouped output by %s. ' + 'You can override using the `_groups` argument.', + rowwise_vars + ) + _groups = 'keep' + + if _groups == 'keep': + return group_df(applied, rowwise_vars) + + return applied + + +@register_verb(DataFrame, context=Context.PENDING) def summarise( _data: DataFrame, - *acrosses: Across, + *dfs: Union[DataFrame, Mapping[str, Iterable[Any]]], _groups: Optional[str] = None, **kwargs: Any -) -> DataFrame: +) -> DataFrameType: """Summarise each group to fewer rows See: https://dplyr.tidyverse.org/reference/summarise.html @@ -538,63 +695,101 @@ def summarise( - "drop": All levels of grouping are dropped. - "keep": Same grouping structure as _data. - "rowwise": Each row is its own group. - *acrosses, **kwargs: Name-value pairs, where value is the summarized + *dfs, **kwargs: Name-value pairs, where value is the summarized data for each group Returns: The summary dataframe. """ - across = {} # no need OrderedDict in python3.7+ anymore - for acrs in acrosses: - across.update( - acrs.evaluate(Context.EVAL, _data) - if isinstance(acrs, Across) - else acrs + check_column_uniqueness( + _data, + "Can't transform a data frame with duplicate names." 
+ ) + if getattr(_data.flags, 'rowwise', False): + return _summarise_rowwise( + _data, + *dfs, + _groups=_groups, + **kwargs ) - across.update(kwargs) - kwargs = across + context = Context.EVAL.value + + serieses = {} + new_names = [] + for i, ser in enumerate(dfs): + if isinstance(ser, Series): + serieses[ser.name] = ser.values + elif isinstance(ser, DataFrame): + serieses.update(ser.to_dict('series')) + elif isinstance(ser, dict): + serieses.update(ser) + else: + serieses[f"V{i}"] = ser + new_names.append(f"V{i}") + + serieses.update(kwargs) + kwargs = serieses ret = None - if isinstance(_data, RowwiseDataFrame) and _data.flags.rowwise is not True: - ret = _data.loc[:, _data.flags.rowwise] for key, val in kwargs.items(): - if isinstance(val, CAcross): - val.names = key - if isinstance(val, Across): - val = DataFrame(val.evaluate(Context.EVAL, _data)) - + if val is None: + continue if ret is None: + val = evaluate_expr(val, _data, context) + if key in new_names and isinstance(val, DataFrame): + key = None ret = to_df(val, key) - else: - ret[key] = align_value(val, ret) + continue + try: + val = evaluate_expr(val, ret, context) + except ColumnNotExistingError: + val = evaluate_expr(val, _data, context) + + value = align_value(val, ret) + df_assign_item(ret, key, value) + + if ret is None: + ret = DataFrame(index=[0]) + + copy_flags(ret, _data) + ret.flags.rowwise = False if _groups == 'rowwise': - return RowwiseDataFrame(ret) + ret.flags.rowwise = True return ret -@summarise.register(DataFrameGroupBy, context=None) +@summarise.register(DataFrameGroupBy, context=Context.PENDING) def _( _data: DataFrameGroupBy, - *acrosses: Across, + *dfs: Union[DataFrame, Mapping[str, Iterable[Any]]], _groups: Optional[str] = None, **kwargs: Any ) -> DataFrameType: - def apply_func(df): - df.flags.grouper = _data.grouper - return df[ - list_diff(df.columns.tolist(), _data.grouper.names) - ] >> summarise(*acrosses, _groups=_groups, **kwargs) - - ret = groupby_apply(_data, apply_func, groupdata=True) + gsizes = [] + if _data.obj.shape[0] > 0: + def apply_func(df): + df.flags.grouper = _data.grouper + ret = df >> summarise(*dfs, _groups=_groups, **kwargs) + gsizes.append(0 if df.shape[0] == 0 else ret.shape[0]) + return ret + + applied = groupby_apply(_data, apply_func, groupdata=True) + else: # 0-row dataframe + # check cols in *dfs + applied = DataFrame( + columns=list_union(_data.grouper.names, kwargs.keys()) + ) g_keys = _data.grouper.names if _groups is None: - gsize = group_df(ret, _data.grouper.names).grouper.size().tolist() - if gsize == [1] * len(gsize): + has_args = len(kwargs) > 0 or len(dfs) > 0 + all_ones = all(gsize <= 1 for gsize in gsizes) + + if applied.shape[0] <= 1 or all_ones or not has_args: _groups = 'drop_last' if len(g_keys) == 1 and summarise.inform: logger.info( @@ -603,33 +798,34 @@ def apply_func(df): ) elif summarise.inform: logger.info( - '`summarise()` regrouping output by ' + '`summarise()` has grouped output by ' '%s (override with `_groups` argument)', g_keys[:-1] ) else: - if gsize != [gsize[0]] * len(gsize): - _groups = 'keep' - if summarise.inform: - logger.info( - '`summarise()` regrouping output by %s. ' - 'You can override using the `.groups` argument.', - g_keys - ) + _groups = 'keep' + if summarise.inform: + logger.info( + '`summarise()` has grouped output by %s. 
' + 'You can override using the `_groups` argument.', + g_keys + ) + + copy_flags(applied, _data) if _groups == 'drop': - return ret + return applied if _groups == 'drop_last': - return group_df(ret, g_keys[:-1]) if g_keys[:-1] else ret + return group_df(applied, g_keys[:-1]) if g_keys[:-1] else applied if _groups == 'keep': # even keep the unexisting levels - return group_df(ret, g_keys) + return group_df(applied, g_keys) # else: # todo: raise - return ret + return applied summarise.inform = True summarize = summarise # pylint: disable=invalid-name @@ -638,7 +834,6 @@ def apply_func(df): @register_verb(DataFrame, context=Context.EVAL) def filter( _data: DataFrame, - condition: Iterable[bool], *conditions: Iterable[bool], _preserve: bool = False ) -> DataFrame: @@ -654,38 +849,40 @@ def filter( Returns: The subset dataframe """ - if isinstance(condition, IfCross): - condition = condition.evaluate(Context.EVAL, _data) - + if _data.shape[0] == 0: + return _data # check condition, conditions + condition = numpy.array([True] * _data.shape[0]) for cond in conditions: - if isinstance(cond, IfCross): - cond = cond.evaluate(Context.EVAL, _data) + if is_scalar(cond): + cond = numpy.array([cond] * _data.shape[0]) condition = condition & cond try: condition = objectize(condition).values.flatten() except AttributeError: ... - ret = objectize(_data)[condition] - if isinstance(_data, DataFrameGroupBy): - grouper = _data.grouper if _preserve else _data.grouper.names - return group_df(ret, grouper) + ret = _data[condition].reset_index(drop=True) + copy_flags(ret, _data) return ret -@filter.register(DataFrameGroupBy, context=None) +@filter.register(DataFrameGroupBy, context=Context.PENDING) def _( _data: DataFrameGroupBy, - condition: Iterable[bool], - *conditions: Iterable[bool], + *conditions: Expression, _preserve: bool = False ) -> DataFrameGroupBy: + """Filter on DataFrameGroupBy object""" + if _data.obj.shape[0] > 0: + def apply_func(df): + df.flags.grouper = _data.grouper + return df >> filter(*conditions) - def apply_func(df): - df.flags.grouper = _data.grouper - return df >> filter(condition, *conditions, _preserve=_preserve) + ret = groupby_apply(_data, apply_func, groupdata=True) + else: + ret = DataFrame(columns=_data.obj.columns) + copy_flags(ret, _data) - ret = groupby_apply(_data, apply_func) if _preserve: return group_df(ret, _data.grouper) return group_df(ret, _data.grouper.names) @@ -693,13 +890,14 @@ def apply_func(df): # ------------------------------ # count -@register_verb((DataFrame, DataFrameGroupBy), context=None) +@register_verb(DataFrame, context=Context.EVAL) def count( - _data: DataFrameType, + _data: DataFrame, *columns: Any, - wt: Optional[str] = None, + wt: Optional[NumericOrIter] = None, sort: bool = False, - name: str = 'n', + name: Optional[str] = None, + _drop: Optional[bool] = None, **mutates: Any ) -> DataFrame: """Count observations by group @@ -718,46 +916,124 @@ def count( Returns: DataFrame object with the count column """ - _data = objectize(_data) - columns = evaluate_args(columns, _data, Context.SELECT) - columns = select_columns(_data.columns, *columns) + if _drop is None: + _drop = group_by_drop_default(_data) - wt = evaluate_expr(wt, _data, Context.SELECT) - _data = mutate(_data, **mutates) + mutated = _data >> mutate(*columns, **mutates, _keep='none') + data = _data.copy() + update_df(data, mutated) + copy_flags(data, _data) - columns = columns + list(mutates) - grouped = group_df(_data, columns) + columns = mutated.columns.tolist() + if not columns: + raise 
ValueError("No columns to count.") + + grouped = group_df(data, columns, drop=_drop) + # check if name in columns + if name is None: + name = 'n' + while name in columns: + name += 'n' + if name != 'n': + logger.warning( + 'Storing counts in `%s`, as `n` already present in input. ' + 'Use `name="new_name"` to pick a new name.', + name + ) + elif isinstance(name, str): + columns = [col for col in columns if col != name] + else: + raise ValueError("`name` must be a single string.") - if not wt: - count_frame = grouped[columns].size().to_frame(name=name) + if wt is None: + count_frame = grouped[columns].grouper.size().to_frame(name=name) else: - count_frame = grouped[wt].sum().to_frame(name=name) + count_frame = Series(wt).groupby( + grouped.grouper + ).sum().to_frame(name=name) ret = count_frame.reset_index(level=columns) if sort: ret = ret.sort_values([name], ascending=[False]) return ret +@count.register(DataFrameGroupBy, context=Context.PENDING) +def _( + _data: DataFrameGroupBy, + *columns: Any, + wt: Optional[NumericOrIter] = None, + sort: bool = False, + name: Optional[str] = None, + _drop: Optional[bool] = None, + **mutates: Any +): + if _drop is None: + _drop = group_by_drop_default(_data) + + gkeys = _data.grouper.names -@register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) + def apply_func(df): + if df.shape[0] == 0: + return None + return df >> count( + df[gkeys], + *columns, + wt=wt, + sort=sort, + name=name, + _drop=True, + **mutates + ) + + applied = groupby_apply(_data, apply_func)# index reset + + if not _drop: + if len(gkeys) > 1 or not is_categorical(_data.obj[gkeys[0]]): + logger.warning( + 'Currently, _drop=False of count on grouped dataframe ' + 'only works when dataframe is grouped by a single ' + 'categorical column.' 
+ ) + else: + applied = applied.set_index(gkeys).reindex( + _data.obj[gkeys[0]].cat.categories, + fill_value=0 + ).reset_index(level=gkeys) + + # not dropping anything + return group_df(applied, gkeys) + + +@register_verb(DataFrameGroupBy, context=Context.PENDING) def tally( - _data: DataFrameType, - wt: str = None, + _data: DataFrameGroupBy, + wt: Optional[NumericOrIter] = None, sort: bool = False, - name: str = 'n' + name: Optional[str] = None ) -> DataFrame: """A ower-level function for count that assumes you've done the grouping See count() """ - if isinstance(_data, DataFrameGroupBy): - return count(_data, *_data.grouper.names, wt=wt, sort=sort, name=name) + ret = _data >> count(wt=wt, sort=sort, name=name) + return ret.obj if isinstance(ret, DataFrameGroupBy) else ret - return DataFrame({ - name: [_data.shape[0] if wt is None else _data[wt].sum()] - }) +@tally.register(DataFrame, context=Context.EVAL) +def _( + _data: DataFrame, + wt: Optional[NumericOrIter] = None, + sort: bool = False, # pylint: disable=unused-argument + name: Optional[str] = None +) -> DataFrame: + """tally for DataFrame object""" + name = name or 'n' + if wt is None: + wt = _data.shape[0] + else: + wt = wt.sum() + return DataFrame({name: [wt]}) -@register_verb((DataFrame, DataFrameGroupBy), context=None) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.PENDING) def add_count( _data: DataFrameType, *columns: Any, @@ -779,14 +1055,15 @@ def add_count( if sort: ret = ret.sort_values([name], ascending=[False]) + copy_flags(ret, _data) if isinstance(_data, DataFrameGroupBy): return group_df(ret, _data.grouper) return ret -@register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) +@register_verb((DataFrame, DataFrameGroupBy), context=Context.PENDING) def add_tally( _data: DataFrameType, - wt: str = None, + wt: Optional[str] = None, sort: bool = False, name: str = 'n' ) -> DataFrameType: @@ -803,20 +1080,24 @@ def add_tally( if sort: ret = ret.sort_values([name], ascending=[False]) + copy_flags(ret, _data) if isinstance(_data, DataFrameGroupBy): return group_df(ret, _data.grouper) return ret -@register_verb((DataFrame, DataFrameGroupBy), context=Context.MIXED) +@register_verb(DataFrame, context=Context.PENDING) def distinct( - _data: DataFrameType, + _data: DataFrame, *columns: Any, _keep_all: bool = False, **mutates: Any -) -> DataFrameType: +) -> DataFrame: """Select only unique/distinct rows from a data frame. 
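The `tally` fallback for plain DataFrames (just above) reduces to a one-row frame holding either the row count or the summed weights; for example:

```python
from pandas import DataFrame, Series

df = DataFrame({"x": [1, 2, 3]})
print(DataFrame({"n": [df.shape[0]]}))               # unweighted: n = 3
print(DataFrame({"n": [Series([2, 2, 1]).sum()]}))   # weighted:   n = 5
```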
+    The original API:
+    https://dplyr.tidyverse.org/reference/distinct.html
+
     Args:
         _data: The dataframe
         *columns, **mutates: Optional variables to use when determining
@@ -826,24 +1107,42 @@
     Returns:
         A dataframe without duplicated rows in _data
     """
-    data = objectize(_data)
+    mutated = _data >> mutate(*columns, **mutates, _keep='none')
+    data = _data.copy()
+    update_df(data, mutated)
+    copy_flags(data, _data)
+
+    columns = (
+        mutated.columns.tolist()
+        if mutated.shape[1] > 0
+        else data.columns.tolist()
+    )
+    # keep the order
+    columns = [col for col in data.columns if col in columns]

-    all_columns = data.columns
-    columns = select_columns(all_columns, *columns)
-    if isinstance(_data, DataFrameGroupBy):
-        columns = list_union(_data.grouper.names, columns)
+    data.drop_duplicates(columns, inplace=True)
+    if not _keep_all:
+        data2 = data[columns]
+        copy_flags(data2, data)
+        data = data2
+
+    return data

-    data = mutate(data, **mutates)
-    columns = columns + list(mutates)
+@distinct.register(DataFrameGroupBy, context=Context.PENDING)
+def _(
+    _data: DataFrameGroupBy,
+    *columns: Any,
+    _keep_all: bool = False,
+    **mutates: Any
+) -> DataFrameGroupBy:

-    if not columns:
-        columns = all_columns
+    def apply_func(df):
+        return df >> distinct(*columns, _keep_all=_keep_all, **mutates)

-    uniq_frame = data.drop_duplicates(columns, ignore_index=True)
-    ret = uniq_frame if _keep_all else uniq_frame[columns]
-    if isinstance(_data, DataFrameGroupBy):
-        return group_df(ret, _data.grouper)
-    return ret
+    return group_df(
+        groupby_apply(_data, apply_func, groupdata=True),
+        _data.grouper.names
+    )

 @register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT)
 def pull(
     _data: DataFrameType,
@@ -852,7 +1151,7 @@
     name: Optional[StringOrIter] = None,
     to: str = 'series'
 ) -> SeriesLikeType:
-    """Pull a series from a dataframe
+    """Pull a series or a dataframe from a dataframe

     Args:
         _data: The dataframe
@@ -860,8 +1159,11 @@
         name: If specified, a zip object will be returned with the name-value
             pairs. It can be a column name or a list of strs with the same
             length as the series
+            Only works when pulling `a` for name `a$b`
         to: Type of data to return.
+            Only works when pulling `a` for name `a$b`
             - series: Return a pandas Series object
+              Group information will be lost
             - array: Return a numpy.ndarray object
             - list: Return a python list
@@ -872,16 +1174,24 @@
     if isinstance(var, int):
         var = _data.columns[var]

-    if name is not None and is_scalar(name):
-        return zip(_data[name].values, _data[var].values)
-    if name is not None:
-        return zip(name, _data[var].values)
+    # check if var is a dataframe
+    if var not in _data:
+        cols = [col for col in _data.columns if col.startswith(f'{var}$')]
+        ret = _data.loc[:, cols]
+        ret.columns = [col[(len(var)+1):] for col in cols]
+        return ret
+
+    value = _data[var]
     if to == 'list':
-        return _data[var].values.tolist()
+        value = value.values.tolist()
     if to == 'array':
-        return _data[var].values
-    return _data[var]
+        value = value.values
+
+    if name is not None and is_scalar(name):
+        return zip(_data[name].values, value)
+    if name is not None:
+        return zip(name, value)
+    return value

 @register_verb(DataFrame, context=Context.SELECT)
 def rename(
@@ -898,7 +1208,17 @@
     Returns:
         The dataframe with new names
     """
-    return _data.rename(columns={val: key for key, val in kwargs.items()})
+    names = {val: key for key, val in kwargs.items()}
+    ret = _data.rename(columns=names)
+    copy_flags(ret, _data)
+    row_wise = getattr(ret.flags, 'rowwise', None)
+    if is_scalar(row_wise):
+        return ret
+
+    for i, var in enumerate(row_wise):
+        if var in names:
+            row_wise[i] = names[var]
+    return ret

 @register_verb(DataFrame, context=Context.SELECT)
 def rename_with(
@@ -942,24 +1262,34 @@
         The sliced dataframe
     """
     rows = expand_slice(rows, _data.shape[0])
-    return _data.iloc[rows, :]
+    try:
+        ret = _data.iloc[rows, :]
+    except IndexError:
+        ret = _data.iloc[[], :]
+
+    copy_flags(ret, _data)
+    return ret

-@slice.register(DataFrameGroupBy)
+@slice.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     rows: Any,
     _preserve: bool = False
 ) -> DataFrameGroupBy:
     """Slice on grouped dataframe"""
+
+    def apply_func(df):
+        ret = df >> slice(rows)
+        return ret
+
     grouper = _data.grouper
-    _data = objectize(_data)
-    rows = expand_slice(rows, _data.shape[0])
-    ret = _data.iloc[rows, :]
     if not _preserve:
-        return group_df(ret, grouper.names)
-    return group_df(ret, grouper)
+        grouper = grouper.names
+
+    applied = groupby_apply(_data, apply_func, groupdata=True)
+    return group_df(applied, grouper)

-@register_verb(DataFrame)
+@register_verb(DataFrame, context=Context.EVAL)
 def slice_head(
     _data: DataFrame,
     n: Optional[int] = None,
@@ -984,27 +1314,20 @@
     rows = list(range(n))
     return _data.iloc[rows, :]

-@slice_head.register(DataFrameGroupBy)
+@slice_head.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     n: Optional[Union[int, Iterable[int]]] = None,
     prop: Optional[Union[float, Iterable[float]]] = None
 ) -> DataFrame:
     """slice_head on grouped dataframe"""
-    # any other better way?
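Earlier in this hunk, `pull` learned to reassemble packed `a$b`-style columns (as produced by `mutate`) into a sub-dataframe; the mechanics in isolation:

```python
from pandas import DataFrame

df = DataFrame({"a$b": [1, 2], "a$c": [3, 4], "z": [5, 6]})
var = "a"  # pull(f.a) where only a$b / a$c exist
cols = [col for col in df.columns if col.startswith(f"{var}$")]
sub = df.loc[:, cols]
sub.columns = [col[len(var) + 1:] for col in cols]  # strip the "a$" prefix
print(sub.columns.tolist())  # ['b', 'c']
```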
-    total = _data.size().to_frame(name='size')
-    total['n'] = n
-    total['prop'] = prop
-    indexes = total.apply(
-        lambda row: _data.groups[row.name][
-            :get_n_from_prop(row.size, row.n, row.prop)
-        ],
-        axis=1
-    )
-    indexes = numpy.concatenate(indexes.values)
-    return group_df(_data.obj.iloc[indexes, :], _data.grouper)
+    def apply_func(df):
+        return df >> slice_head(n=n, prop=prop)

-@register_verb(DataFrame)
+    applied = groupby_apply(_data, apply_func, groupdata=True)
+    return group_df(applied, _data.grouper.names)
+
+@register_verb(DataFrame, context=Context.EVAL)
 def slice_tail(
     _data: DataFrame,
     n: Optional[int] = 1,
@@ -1018,25 +1341,18 @@
     rows = [-(elem+1) for elem in range(n)]
     return _data.iloc[rows, :]

-@slice_tail.register(DataFrameGroupBy)
+@slice_tail.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     n: Optional[Union[int, Iterable[int]]] = None,
     prop: Optional[Union[float, Iterable[float]]] = None
 ) -> DataFrame:
     """slice_tail on grouped dataframe"""
-    # any other better way?
-    total = _data.size().to_frame(name='size')
-    total['n'] = n
-    total['prop'] = prop
-    indexes = total.apply(
-        lambda row: _data.groups[row.name][
-            -get_n_from_prop(row.size, row.n, row.prop):
-        ],
-        axis=1
-    )
-    indexes = numpy.concatenate(indexes.values)
-    return group_df(_data.obj.iloc[indexes, :], _data.grouper)
+    def apply_func(df):
+        return df >> slice_tail(n=n, prop=prop)
+
+    applied = groupby_apply(_data, apply_func, groupdata=True)
+    return group_df(applied, _data.grouper.names)

 @register_verb(DataFrame, context=Context.EVAL)
 def slice_min(
     _data: DataFrame,
     order_by: Series,
@@ -1058,7 +1374,7 @@
         columns=['__slice_order__']
     )

-@slice_min.register(DataFrameGroupBy, context=None)
+@slice_min.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     order_by: Series,
@@ -1093,7 +1409,7 @@
         columns=['__slice_order__']
     )

-@slice_max.register(DataFrameGroupBy, context=None)
+@slice_max.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     order_by: Series,
@@ -1131,7 +1447,7 @@
         axis=0
     )

-@slice_sample.register(DataFrameGroupBy, context=None)
+@slice_sample.register(DataFrameGroupBy, context=Context.PENDING)
 def _(
     _data: DataFrameGroupBy,
     n: Optional[int] = 1,
@@ -1151,25 +1467,106 @@ def apply_func(df):

 # Two table verbs
 # ---------------

-@register_verb(DataFrame)
+@register_verb(
+    (DataFrame, list, dict, NoneType),
+    context=Context.EVAL
+)
 def bind_rows(
-    _data: DataFrame,
-    *datas: DataFrame
+    _data: Optional[Union[DataFrame, list, dict]],
+    *datas: Optional[Union[DataFrameType, dict]],
+    _id: Optional[str] = None,
+    **kwargs: Union[DataFrame, dict]
 ) -> DataFrame:
+    # pylint: disable=too-many-branches
     """Bind rows of given dataframes

     Args:
-        _data, *datas: Dataframes to combine
+        _data: The seed dataframe to bind others
+            Could be a dict or a list; keys/indexes will be used for the _id column
+        *datas: Other dataframes to combine
+        _id: The name of the id columns
+        **kwargs: A mapping of dataframes; keys will be used as the _id column.
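For the `_id` handling that `bind_rows` gains here (see the Args just above), the underlying pandas idiom is concat-with-keys followed by surfacing the key level as a column:

```python
import pandas as pd

dfs = {"a": pd.DataFrame({"x": [1]}), "b": pd.DataFrame({"x": [2]})}
out = (
    pd.concat(list(dfs.values()), keys=list(dfs.keys()), names=["src", None])
    .reset_index(level=0)     # move the key level into a "src" column
    .reset_index(drop=True)
)
print(out)  # columns: src, x; rows: (a, 1), (b, 2)
```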
Returns: The combined dataframe """ - return pandas.concat([_data, *datas]) + if _id is not None and not isinstance(_id, str): + raise ValueError("`_id` must be a scalar string.") -@register_verb(DataFrame) + def data_to_df(data): + """Make a copy of dataframe or convert dict to a dataframe""" + if isinstance(data, (DataFrame, DataFrameGroupBy)): + return objectize(data).copy() + + ret = tibble(**data) # avoid varname error + return ret + + key_data = {} + if isinstance(_data, list): + for i, dat in enumerate(_data): + if dat is not None: + key_data[i] = data_to_df(dat) + elif _data is not None: + key_data[0] = data_to_df(_data) + + for i, dat in enumerate(datas): + if dat is not None: + key_data[len(key_data)] = data_to_df(dat) + + for key, val in kwargs.items(): + if val is not None: + key_data[key] = data_to_df(val) + + if not key_data: + return DataFrame() + + # handle categorical data + for col in list(key_data.values())[0].columns: + all_series = [ + dat[col] for dat in key_data.values() + if col in dat and not dat[col].isna().all() + ] + all_categorical = [ + is_categorical(ser) for ser in all_series + ] + if all(all_categorical): + union_cat = union_categoricals(all_series) + for data in key_data.values(): + if col not in data: # in case it is 0-column df + continue + data[col] = Categorical( + data[col], + categories=union_cat.categories, + ordered=is_categorical(data[col]) and data[col].cat.ordered + ) + elif any(all_categorical): + logger.warning("Factor information lost during rows binding.") + + if _id is not None: + return pandas.concat( + key_data.values(), + keys=key_data.keys(), + names=[_id, None] + ).reset_index(level=0).reset_index(drop=True) + return pandas.concat(key_data.values()).reset_index(drop=True) + +@bind_rows.register(DataFrameGroupBy, context=Context.PENDING) +def _( + _data: DataFrameGroupBy, + *datas: Optional[Union[DataFrameType, dict]], + _id: Optional[str] = None, + **kwargs: Union[DataFrame, dict] +) -> DataFrameGroupBy: + + data = _data.obj >> bind_rows(*datas, _id=_id, **kwargs) + copy_flags(data, _data) + return group_df(data, _data.grouper.names) + +@register_verb((DataFrame, dict, NoneType), context=Context.EVAL) def bind_cols( - _data: DataFrame, - *datas: DataFrame + _data: Optional[Union[DataFrame, dict]], + *datas: Optional[Union[DataFrame, dict]], + _name_repair: Union[str, Callable] = "unique" ) -> DataFrame: """Bind columns of give dataframes @@ -1179,9 +1576,23 @@ def bind_cols( Returns: The combined dataframe """ - return pandas.concat([_data, *datas], axis=1) + if isinstance(_data, dict): + _data = tibble(**_data) + more_data = [] + for data in datas: + if isinstance(data, dict): + more_data.append(tibble(**data)) + else: + more_data.append(data) + if _data is not None: + more_data.insert(0, _data) + if not more_data: + return DataFrame() + ret = pandas.concat(more_data, axis=1) + ret.columns = repair_names(ret.columns.tolist(), repair=_name_repair) + return ret -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def intersect( _data: DataFrame, data2: DataFrame, @@ -1208,7 +1619,7 @@ def intersect( how='inner' ) >> distinct(*on) -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def union( _data: DataFrame, data2: DataFrame, @@ -1235,7 +1646,7 @@ def union( how='outer' ) >> distinct(*on) -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def setdiff( _data: DataFrame, data2: DataFrame, @@ -1262,7 +1673,7 @@ def setdiff( lambda x: x['_merge'] == 'left_only' 
].drop(columns=['_merge']) >> distinct(*on) -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def union_all( _data: DataFrame, data2: DataFrame @@ -1278,7 +1689,7 @@ def union_all( """ return bind_rows(_data, data2) -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def setequal( _data: DataFrame, data2: DataFrame @@ -1296,7 +1707,50 @@ def setequal( data2 = data2.sort_values(by=data2.columns.to_list()).reset_index(drop=True) return data1.equals(data2) -@register_verb(DataFrame) +def _join( + x: DataFrameType, + y: DataFrameType, + how: str, + by: Optional[Union[StringOrIter, Mapping[str, str]]] = None, + copy: bool = False, + suffix: Iterable[str] = ("_x", "_y"), + keep: bool = False +) -> DataFrameType: + """General join""" + xobj = objectize(x) + y = objectize(y) + if isinstance(by, dict): + right_on = list(by.values()) + ret = pandas.merge( + xobj, y, + left_on=list(by.keys()), + right_on=right_on, + how=how, + copy=copy, + suffixes=suffix + ) + if not keep: + ret.drop(columns=right_on, inplace=True) + else: + ret = pandas.merge( + xobj, y, + on=by, + how=how, + copy=copy, + suffixes=suffix + ) + + copy_flags(ret, x) + if isinstance(x, DataFrameGroupBy): + return group_df(ret, x.grouper.names) + + return ret + +@register_verb( + (DataFrame, DataFrameGroupBy), + context=Context.EVAL, + extra_contexts={'by': Context.SELECT} +) def inner_join( x: DataFrame, y: DataFrame, @@ -1322,106 +1776,94 @@ def inner_join( Returns: The joined dataframe """ - if isinstance(by, dict): - right_on = list(by.values()) - ret = pandas.merge( - x, y, - left_on=list(by.keys()), - right_on=right_on, - how='inner', - copy=copy, - suffixes=suffix - ) - if not keep: - return ret.drop(columns=right_on) - return ret - return pandas.merge(x, y, on=by, how='inner', copy=copy, suffixes=suffix) + return _join( + x, y, + how='inner', + by=by, + copy=copy, + suffix=suffix, + keep=keep + ) -@register_verb(DataFrame) +@register_verb( + (DataFrame, DataFrameGroupBy), + context=Context.EVAL, + extra_contexts={'by': Context.SELECT} +) def left_join( - x: DataFrame, - y: DataFrame, + x: DataFrameType, + y: DataFrameType, by: Optional[Union[StringOrIter, Mapping[str, str]]] = None, copy: bool = False, suffix: Iterable[str] = ("_x", "_y"), keep: bool = False -) -> DataFrame: +) -> DataFrameType: """Mutating joins including all rows in x. See inner_join() """ - if isinstance(by, dict): - right_on = list(by.values()) - ret = pandas.merge( - x, y, - left_on=list(by.keys()), - right_on=right_on, - how='left', - copy=copy, - suffixes=suffix - ) - if not keep: - return ret.drop(columns=right_on) - return ret - return pandas.merge(x, y, on=by, how='left', copy=copy, suffixes=suffix) + return _join( + x, y, + how='left', + by=by, + copy=copy, + suffix=suffix, + keep=keep + ) -@register_verb(DataFrame) +@register_verb( + (DataFrame, DataFrameGroupBy), + context=Context.EVAL, + extra_contexts={'by': Context.SELECT} +) def right_join( - x: DataFrame, - y: DataFrame, + x: DataFrameType, + y: DataFrameType, by: Optional[Union[StringOrIter, Mapping[str, str]]] = None, copy: bool = False, suffix: Iterable[str] = ("_x", "_y"), keep: bool = False -) -> DataFrame: +) -> DataFrameType: """Mutating joins including all rows in y. 
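When `by` is a dict, the `_join` helper above maps left column names to right ones and optionally drops the right-hand keys. In plain pandas terms (a sketch of just that branch, not the full helper):

```python
import pandas as pd

x = pd.DataFrame({"id": [1, 2], "v": ["a", "b"]})
y = pd.DataFrame({"key": [2, 3], "w": ["B", "C"]})

# by={"id": "key"} -> left_on/right_on; keep=False then drops the right key.
ret = pd.merge(x, y, left_on=["id"], right_on=["key"], how="inner")
ret = ret.drop(columns=["key"])
print(ret)
#    id  v  w
# 0   2  b  B
```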
See inner_join() """ - if isinstance(by, dict): - right_on = list(by.values()) - ret = pandas.merge( - x, y, - left_on=list(by.keys()), - right_on=right_on, - how='right', - copy=copy, - suffixes=suffix - ) - if not keep: - return ret.drop(columns=right_on) - return ret - return pandas.merge(x, y, on=by, how='right', copy=copy, suffixes=suffix) + return _join( + x, y, + how='right', + by=by, + copy=copy, + suffix=suffix, + keep=keep + ) -@register_verb(DataFrame) +@register_verb( + (DataFrame, DataFrameGroupBy), + context=Context.EVAL, + extra_contexts={'by': Context.SELECT} +) def full_join( - x: DataFrame, - y: DataFrame, + x: DataFrameType, + y: DataFrameType, by: Optional[Union[StringOrIter, Mapping[str, str]]] = None, copy: bool = False, suffix: Iterable[str] = ("_x", "_y"), keep: bool = False -) -> DataFrame: +) -> DataFrameType: """Mutating joins including all rows in x or y. See inner_join() """ - if isinstance(by, dict): - right_on = list(by.values()) - ret = pandas.merge( - x, y, - left_on=list(by.keys()), - right_on=right_on, - how='outer', - copy=copy, - suffixes=suffix - ) - if not keep: - return ret.drop(columns=right_on) - return ret - return pandas.merge(x, y, on=by, how='outer', copy=copy, suffixes=suffix) + return _join( + x, y, + how='outer', + by=by, + copy=copy, + suffix=suffix, + keep=keep + ) -@register_verb(DataFrame) +@register_verb(DataFrame, context=Context.EVAL) def nest_join( x: DataFrame, y: DataFrame, @@ -1470,30 +1912,68 @@ def get_nested_df(row: Series) -> DataFrame: y_matched = y_matched.to_frame(name=y_name) return pandas.concat([x, y_matched], axis=1) -@register_verb(DataFrame) +@register_verb( + (DataFrame, DataFrameGroupBy), + context=Context.EVAL, + extra_contexts={'by': Context.SELECT} +) def semi_join( - x: DataFrame, - y: DataFrame, + x: DataFrameType, + y: DataFrameType, by: Optional[Union[StringOrIter, Mapping[str, str]]] = None, copy: bool = False -) -> DataFrame: +) -> DataFrameType: """Returns all rows from x with a match in y. See inner_join() """ - ret = pandas.merge(x, y, on=by, how='left', copy=copy, indicator=True) - return ret[ret._merge == 'both'].loc[:, x.columns.tolist()] + xobj = objectize(x) + y = objectize(y) + ret = pandas.merge( + xobj, y, + on=by, + how='left', + copy=copy, + suffixes=['', '_y'], + indicator=True + ) + ret = ret.loc[ret._merge == 'both', xobj.columns.tolist()] + + copy_flags(ret, x) + if isinstance(x, DataFrameGroupBy): + return group_df(ret, x.grouper.names) + + return ret -@register_verb(DataFrame) +@register_verb( + (DataFrame, DataFrameGroupBy), + context=Context.EVAL, + extra_contexts={'by': Context.SELECT} +) def anti_join( - x: DataFrame, - y: DataFrame, + x: DataFrameType, + y: DataFrameType, by: Optional[Union[StringOrIter, Mapping[str, str]]] = None, copy: bool = False -) -> DataFrame: +) -> DataFrameType: """Returns all rows from x without a match in y. 
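Both `semi_join` above and `anti_join` below lean on merge's `indicator` column; reduced to the essentials (the suffix and grouping handling omitted):

```python
import pandas as pd

x = pd.DataFrame({"k": [1, 2, 3]})
y = pd.DataFrame({"k": [2, 3, 4]})

m = pd.merge(x, y, on="k", how="left", indicator=True)
semi = m.loc[m._merge == "both", x.columns.tolist()]   # rows of x with a match
anti = m.loc[m._merge != "both", x.columns.tolist()]   # rows of x without one
print(semi["k"].tolist())  # [2, 3]
print(anti["k"].tolist())  # [1]
```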
See inner_join() """ - ret = pandas.merge(x, y, on=by, how='left', copy=copy, indicator=True) - return ret[ret._merge != 'both'].loc[:, x.columns.tolist()] + xobj = objectize(x) + y = objectize(y) + ret = pandas.merge( + xobj, y, + on=by, + how='left', + copy=copy, + suffixes=['', '_y'], + indicator=True + ) + ret = ret.loc[ret._merge != 'both', xobj.columns.tolist()] + + copy_flags(ret, x) + if isinstance(x, DataFrameGroupBy): + return group_df(ret, x.grouper.names) + + return ret diff --git a/datar/stats/funcs.py b/datar/stats/funcs.py index 7a49912b..744ea261 100644 --- a/datar/stats/funcs.py +++ b/datar/stats/funcs.py @@ -2,13 +2,14 @@ from typing import Any, Iterable, List import numpy -from pipda import Context +from pipda import register_func -from ..core.utils import register_grouped from ..core.types import FloatOrIter, SeriesLikeType +from ..core.contexts import Context # pylint: disable=redefined-builtin, redefined-outer-name +@register_func(None, context=Context.EVAL) def rnorm(n: int, mean: float = 0.0, sd: float = 1.0) -> List[float]: """random generation for the normal distribution with mean equal to mean and standard deviation equal to sd. @@ -51,7 +52,7 @@ def rpois(n: int, lambda_: float) -> List[float]: """ return numpy.random.poisson(lam=lambda_, size=n) -@register_grouped(context=Context.EVAL) +@register_func(None, context=Context.EVAL) def quantile( series: Iterable[Any], probs: FloatOrIter = (0.0, 0.25, 0.5, 0.75, 1.0), @@ -73,7 +74,7 @@ def quantile( else numpy.quantile(series, probs) ) -@register_grouped(context=Context.EVAL) +@register_func(None, context=Context.EVAL) def sd( series: Iterable[Any], na_rm: bool = False, diff --git a/datar/tibble/funcs.py b/datar/tibble/funcs.py index 553c9120..2fff447d 100644 --- a/datar/tibble/funcs.py +++ b/datar/tibble/funcs.py @@ -1,21 +1,21 @@ """Functions ported from tidyverse-tibble""" -import re -import inspect -from typing import Any, Callable, Union +import itertools +from typing import Any, Callable, Union, Optional -import pandas from pandas import DataFrame +from pandas.core.groupby.generic import DataFrameGroupBy from varname import argname, varname from pipda import Context from pipda.utils import Expression from pipda.symbolic import DirectRefAttr, DirectRefItem -from ..core.types import is_iterable -from ..core.utils import to_df +from ..core.utils import copy_flags, df_assign_item, objectize, to_df +from ..core.names import repair_names def tibble( *args: Any, _name_repair: Union[str, Callable] = 'check_unique', + _rows: Optional[int] = None, **kwargs: Any ) -> DataFrame: # pylint: disable=too-many-statements @@ -30,114 +30,63 @@ def tibble( but check they are unique, - "universal": Make the names unique and syntactic - a function: apply custom name repair + _rows: Number of rows of a 0-col dataframe when args and kwargs are + not provided. When args or kwargs are provided, this is ignored. 
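The `_rows` argument described above boils down to building a zero-column frame over a preset index; a quick sketch of the equivalent pandas call:

```python
import pandas as pd

df = pd.DataFrame(index=range(3))  # what tibble(_rows=3) constructs
print(df.shape)                    # (3, 0)

df["x"] = 1                        # later columns align to the 3-row index
print(df)
```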
Returns: A dataframe """ - # .rows not supported - argnames = argname(args, vars_only=False) + if not args and not kwargs: + df = DataFrame() if not _rows else DataFrame(index=range(_rows)) + df.__dfname__ = varname(raise_exc=False) + return df + + argnames = argname(args, vars_only=False, pos_only=True) + name_values = zip(argnames, args) + name_values = itertools.chain(name_values, kwargs.items()) + # cannot do it with Mappings, same keys will be lost + names = [] + values = [] + for name, value in name_values: + names.append(name) + values.append(value) + + names = repair_names(names, repair=_name_repair) df = None - raw_names = [] - new_names = [] - - def repair_name( - name: str, - name_repair: Union[str, Callable] = _name_repair - ) -> str: - - if name_repair == 'minimal': - raw_names.append(name) - new_names.append(name) - return name - if name_repair == 'unique': - if name in raw_names: - if name in new_names: - new_names[new_names.index(name)] = f'{name}_1' - new_names.append(f'{name}_2') - else: - indexes = [ - int(new_name[len(name)+1:]) - for new_name in new_names - if new_name.startswith(f'{name}_') - ] - new_names.append( - f'{name}_1' if not indexes - else f'{name}_{max(indexes) + 1}' - ) - else: - new_names.append(name) - raw_names.append(name) - return new_names[-1] - if name_repair == 'check_unique': - if name in raw_names: - raise ValueError(f"Column name {name!r} duplicated.") - return repair_name(name, 'minimal') - if name_repair == 'universal': - name = re.sub(r'[^a-zA-Z0-9]', '_', name) - name = re.sub(r'_+', '_', name).rstrip('_') - return repair_name(name, 'unique') - - if callable(name_repair): - if len(inspect.signature(name_repair).parameters) == 3: - new_name = name_repair(name, raw_names, new_names) - else: - new_name = name_repair(name) - - raw_names.append(name) - new_names.append(new_name) - return new_name - - if is_iterable(name_repair): - tmpname = f'_tmp_{len(raw_names)}' - raw_names.append(tmpname) - if not new_names: - new_names.extend(name_repair) - return tmpname - - raise ValueError( - "Expect 'minimal', 'unique', 'check_unique', " - "'universal', callable or a list of names for '_name_repair', " - f"but got {name_repair!r}" - ) - - for name, arg in zip(argnames, args): + for name, arg in zip(names, values): + if arg is None: + continue if isinstance(arg, Expression): - arg = arg.evaluate(df, Context.EVAL.value) - - if df is None: - df = to_df(arg, repair_name(name)) - elif isinstance(arg, DataFrame): - arg = DataFrame( - arg.values, - columns=[repair_name(col) for col in arg.columns] - ) - df = pandas.concat([df, arg], axis=1) - else: - df[repair_name(name)] = arg + arg = arg(df, Context.EVAL.value) - for key, val in kwargs.items(): - key = repair_name(key) - if isinstance(val, Expression): - val = val.evaluate(df, Context.EVAL.value) + if isinstance(arg, dict): + arg = tibble(**arg) if df is None: - df = to_df(val, key) - elif isinstance(val, DataFrame): - val = DataFrame( - val.values, - columns=[f'{key}[{col!r}]' for col in val.columns] - ) - df = pandas.concat([df, val], axis=1) - else: - df[key] = val + if isinstance(arg, (DataFrame, DataFrameGroupBy)): + arg = objectize(arg) + df = arg.copy() + copy_flags(df, arg) + if name not in argnames: + df.columns = [f'{name}${col}' for col in df.columns] - if ( - new_names != df.columns.to_list() and - _name_repair not in ('minimal', 'check_unique') - ): - df = df.rename(columns=dict(zip(df.columns, new_names))) + else: + df = to_df(arg, name) + elif isinstance(arg, (DataFrame, DataFrameGroupBy)): + 
arg = objectize(arg) + for col in arg.columns: + df_assign_item( + df, + f'{name}${col}' if name not in argnames else col, + arg[col], + allow_dups=True + ) + else: + df_assign_item(df, name, arg, allow_dups=True) + if df is None: + df = DataFrame() df.__dfname__ = varname(raise_exc=False) return df @@ -161,17 +110,26 @@ def tribble(*dummies: Any) -> DataFrame: A dataframe """ columns = [] - data = [[]] + data = [] for dummy in dummies: # columns if isinstance(dummy, (DirectRefAttr, DirectRefItem)): columns.append(dummy.ref) + elif not columns: + raise ValueError( + 'Must specify at least one column using the `f.` syntax.' + ) else: - # columns have been finished - if len(data[-1]) == len(columns): + if not data: data.append([]) - data[-1].append(dummy) + if len(data[-1]) < len(columns): + data[-1].append(dummy) + else: + data.append([dummy]) - ret = DataFrame(data, columns=columns) + ret = ( + DataFrame(data, columns=columns) if data + else DataFrame(columns=columns) + ) ret.__dfname__ = varname(raise_exc=False) return ret diff --git a/datar/tidyr/funcs.py b/datar/tidyr/funcs.py index 6b8a8c3d..cd27299b 100644 --- a/datar/tidyr/funcs.py +++ b/datar/tidyr/funcs.py @@ -1,14 +1,14 @@ """Functions from tidyr""" -from pandas.core.series import Series -from datar.core.middlewares import Nesting -from datar.dplyr.funcs import last from typing import Any, Iterable import numpy -from pipda import register_func, Context +from pandas.core.series import Series +from pipda import register_func from ..core.types import NumericType +from ..core.contexts import Context +from ..core.middlewares import Nesting @register_func(None, context=Context.EVAL) def full_seq( @@ -44,4 +44,5 @@ def full_seq( @register_func(None, context=None) def nesting(*cols: Any, **kwargs: Any) -> Nesting: + """Nesting""" return Nesting(*cols, **kwargs) diff --git a/datar/tidyr/verbs.py b/datar/tidyr/verbs.py index 1dc915e4..a704e5ae 100644 --- a/datar/tidyr/verbs.py +++ b/datar/tidyr/verbs.py @@ -7,19 +7,20 @@ import numpy import pandas from pandas import DataFrame -from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy from pandas.core.series import Series -from pipda import register_verb, Context +from pipda import register_verb from ..core.utils import ( - copy_df, group_df, objectize, select_columns, list_diff, logger + copy_flags, group_df, objectize, select_columns, list_diff, logger ) from ..core.types import ( DataFrameType, IntOrIter, SeriesLikeType, StringOrIter, is_scalar ) from ..core.middlewares import Nesting +from ..core.contexts import Context +from ..core.names import repair_names from ..base.constants import NA from ..base.funcs import levels from ..dplyr.verbs import distinct @@ -339,6 +340,8 @@ def fill( """Fills missing values in selected columns using the next or previous entry. + See: https://tidyr.tidyverse.org/reference/fill.html + Args: _data: A dataframe *columns: Columns to fill @@ -370,14 +373,15 @@ def fill( lambda df: fill(df, *columns, _direction=_direction) ).groupby(grouper, dropna=False) -@register_verb(context=Context.EVAL) def expand_grid( _data: Iterable[Any] = None, - #_name_repair: str = "check_unique", # todo + _name_repair: str = "check_unique", **kwargs: Iterable[Any] ) -> DataFrame: """Expand elements into a new dataframe + See: https://tidyr.tidyverse.org/reference/expand_grid.html + Args: _data, **kwargs: Name-value pairs. The name will become the column name in the output. 
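The row-filling rule in the rewritten `tribble` above (consume the `f.` column markers first, then pack values left to right, `len(columns)` per row) can be sketched without pipda; here plain strings stand in for the `f.col` markers, so string *values* are not supported in this toy version:

```python
from pandas import DataFrame

def tiny_tribble(*dummies):
    columns, data = [], []
    for dummy in dummies:
        if isinstance(dummy, str) and not data:
            columns.append(dummy)            # still reading column markers
        elif not columns:
            raise ValueError("Must specify at least one column first.")
        elif data and len(data[-1]) < len(columns):
            data[-1].append(dummy)           # current row not full yet
        else:
            data.append([dummy])             # start a new row
    return DataFrame(data, columns=columns) if data else DataFrame(columns=columns)

print(tiny_tribble("x", "y", 1, 2, 3, 4))
#    x  y
# 0  1  2
# 1  3  4
```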
@@ -406,7 +410,7 @@ def expand_grid( return DataFrame( (itertools.chain.from_iterable(row) for row in itertools.product(*product_args)), - columns=names + columns=repair_names(names, _name_repair) ) @register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) @@ -422,6 +426,8 @@ def extract( group into a new column. If the groups don't match, or the input is NA, the output will be NA. + See: https://tidyr.tidyverse.org/reference/extract.html + Args: _data: The dataframe col: Column name or position. @@ -516,22 +522,23 @@ def separate( # pylint: disable=too-many-branches for i, elem in enumerate(_data[col]): if elem in (NA, None): row = [NA] * nout - else: - row = re.split(sep, str(elem), nout - 1) - if len(row) < nout: - if fill == 'warn': - missing_warns.append(i) - if fill in ('warn', 'right'): - row += [NA] * (nout - len(row)) - else: - row = [NA] * (nout - len(row)) + row + continue + + row = re.split(sep, str(elem), nout - 1) + if len(row) < nout: + if fill == 'warn': + missing_warns.append(i) + if fill in ('warn', 'right'): + row += [NA] * (nout - len(row)) else: - more_splits = re.split(sep, row[-1], 1) - if len(more_splits) > 1: - if extra == 'warn': - extra_warns.append(i) - if extra in ('warn', 'drop'): - row[-1] = more_splits[0] + row = [NA] * (nout - len(row)) + row + else: + more_splits = re.split(sep, row[-1], 1) + if len(more_splits) > 1: + if extra == 'warn': + extra_warns.append(i) + if extra in ('warn', 'drop'): + row[-1] = more_splits[0] outdata.append(non_na_elems(row)) @@ -643,8 +650,9 @@ def unite( """ grouper = getattr(_data, 'grouper', None) columns = select_columns(_data.columns, *columns) - data = objectize(_data) - data = copy_df(data) + _data = objectize(_data) + data = _data.copy() + copy_flags(data, _data) def unite_cols(row): if na_rm: @@ -653,7 +661,7 @@ def unite_cols(row): data[col] = data[columns].agg(unite_cols, axis=1) if remove: - data = data.drop(columns=columns) + data.drop(columns=columns, inplace=True) if grouper is not None: return group_df(data, grouper) @@ -666,6 +674,8 @@ def drop_na( ) -> DataFrameType: """Drop rows containing missing values + See: https://tidyr.tidyverse.org/reference/drop_na.html + Args: data: A data frame. *columns: Columns to inspect for missing values. 
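The `fill`/`extra` branches in the reworked `separate` above act per value; a minimal sketch of that logic on a single element (warnings omitted, `None` standing in for NA):

```python
import re

SEP, NOUT = r"[_\.]", 2

def split_one(elem, fill="warn", extra="warn"):
    row = re.split(SEP, str(elem), maxsplit=NOUT - 1)  # at most NOUT pieces
    if len(row) < NOUT:                                # too few pieces: pad
        pad = [None] * (NOUT - len(row))
        row = row + pad if fill in ("warn", "right") else pad + row
    else:                                              # leftover separators?
        more = re.split(SEP, row[-1], maxsplit=1)
        if len(more) > 1 and extra in ("warn", "drop"):
            row[-1] = more[0]                          # drop the extras
    return row

print(split_one("a_b_c"))           # ['a', 'b']  (extra piece dropped)
print(split_one("a", fill="left"))  # [None, 'a']
```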
@@ -682,12 +692,12 @@ def drop_na( @register_verb(DataFrame, context=Context.EVAL) def expand( - _data: DataFrame, + _data: DataFrame, # pylint: disable=no-value-for-parameter *columns: Union[str, Nesting], # _name_repair: Union[str, Callable] = None # todo **kwargs: Iterable[Any] ) -> DataFrame: - """""" + """See: https://tidyr.tidyverse.org/reference/expand.html""" iterables = [] names = [] for i, column in enumerate(columns): @@ -724,4 +734,4 @@ def expand( return DataFrame(( itertools.chain.from_iterable(row) for row in itertools.product(*iterables) - ), columns=names) >> distinct() + ), columns=names) >> distinct() # pylint: disable=no-value-for-parameter diff --git a/datar/utils/verbs.py b/datar/utils/verbs.py index ccad6903..1411111d 100644 --- a/datar/utils/verbs.py +++ b/datar/utils/verbs.py @@ -6,8 +6,9 @@ from ..core.types import is_iterable from ..core.utils import objectize +from ..core.contexts import Context -@register_verb +@register_verb(context=Context.EVAL) def head(_data: Any, n: int = 6) -> DataFrame: """Get the first n rows of the dataframe or a vector @@ -24,7 +25,7 @@ def head(_data: Any, n: int = 6) -> DataFrame: return _data.head(n) return _data[:n] -@register_verb +@register_verb(context=Context.EVAL) def tail(_data: Any, n: int = 6) -> DataFrame: """Get the last n rows of the dataframe or a vector diff --git a/examples/across.ipynb b/docs/notebooks/across.ipynb similarity index 85% rename from examples/across.ipynb rename to docs/notebooks/across.ipynb index 8c1b5b8d..7537df38 100644 --- a/examples/across.ipynb +++ b/docs/notebooks/across.ipynb @@ -4,11 +4,42 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply the same transformation to multiple columns\n", + "\n", + " The original API:\n", + " https://dplyr.tidyverse.org/reference/across.html\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " _cols: The columns\n", + " _fns: Functions to apply to each of the selected columns.\n", + " _names: A glue specification that describes how to name\n", + " the output columns. This can use `{_col}` to stand for the\n", + " selected column name, and `{_fn}` to stand for the name of\n", + " the function being applied.\n", + " The default (None) is equivalent to `{_col}` for the\n", + " single function case and `{_col}_{_fn}` for the case where\n", + " a list is used for _fns. 
In such a case, `{_fn}` is 0-based.\n", + " To use 1-based index, use `{_fn1}`\n", + " **kwargs: Arguments for the functions\n", + "\n", + " Returns:\n", + " A dataframe with one column for each column in _cols and\n", + " each function in _fns.\n", + " \n" + ] + } + ], "source": [ - "# https://dplyr.tidyverse.org/reference/across.html\n", "from datar.datasets import iris\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(across.__doc__)" ] }, { @@ -836,13 +867,6 @@ "execution_count": 9, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-03-12 19:55:36][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" - ] - }, { "data": { "text/html": [ @@ -864,39 +888,39 @@ " \n", " \n", " \n", - " Species\n", " Sepal_Length\n", " Sepal_Width\n", + " Species\n", " \n", " \n", " \n", " \n", " 0\n", - " setosa\n", " 5.006\n", " 3.428\n", + " setosa\n", " \n", " \n", " 1\n", - " versicolor\n", " 5.936\n", " 2.770\n", + " versicolor\n", " \n", " \n", " 2\n", - " virginica\n", " 6.588\n", " 2.974\n", + " virginica\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Species Sepal_Length Sepal_Width\n", - "0 setosa 5.006 3.428\n", - "1 versicolor 5.936 2.770\n", - "2 virginica 6.588 2.974" + " Sepal_Length Sepal_Width Species\n", + "0 5.006 3.428 setosa\n", + "1 5.936 2.770 versicolor\n", + "2 6.588 2.974 virginica" ] }, "execution_count": 9, @@ -915,13 +939,6 @@ "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-03-12 19:55:36][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" - ] - }, { "data": { "text/html": [ @@ -1007,13 +1024,6 @@ "execution_count": 11, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-03-12 19:55:36][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" - ] - }, { "data": { "text/html": [ @@ -1086,13 +1096,6 @@ "execution_count": 12, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-03-12 19:55:36][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" - ] - }, { "data": { "text/html": [ @@ -1178,13 +1181,6 @@ "execution_count": 13, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-03-12 19:55:36][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" - ] - }, { "data": { "text/html": [ @@ -1207,10 +1203,10 @@ " \n", " \n", " Species\n", + " Sepal_Length.fn0\n", " Sepal_Length.fn1\n", - " Sepal_Length.fn2\n", + " Sepal_Width.fn0\n", " Sepal_Width.fn1\n", - " Sepal_Width.fn2\n", " \n", " \n", " \n", @@ -1243,12 +1239,12 @@ "" ], "text/plain": [ - " Species Sepal_Length.fn1 Sepal_Length.fn2 Sepal_Width.fn1 \\\n", + " Species Sepal_Length.fn0 Sepal_Length.fn1 Sepal_Width.fn0 \\\n", "0 setosa 5.006 0.352490 3.428 \n", "1 versicolor 5.936 0.516171 2.770 \n", "2 virginica 6.588 0.635880 2.974 \n", "\n", - " Sepal_Width.fn2 \n", + " Sepal_Width.fn1 \n", "0 0.379064 \n", "1 0.313798 \n", "2 0.322497 " @@ -1270,13 +1266,6 @@ "execution_count": 14, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-03-12 19:55:36][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" - ] - }, { "data": { "text/html": [ @@ -1353,7 +1342,7 @@ ], "source": [ "iris >> group_by(f.Species) >> 
summarise(\n", - " across(starts_with(\"Sepal\"), [mean, sd], _names = \"{_col}.fn{_fn0}\")\n", + " across(starts_with(\"Sepal\"), [mean, sd], _names = \"{_col}.fn{_fn}\")\n", ")" ] }, @@ -1361,6 +1350,91 @@ "cell_type": "code", "execution_count": 15, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SpeciesSepal_Length.fn1Sepal_Length.fn2Sepal_Width.fn1Sepal_Width.fn2
0setosa5.0060.3524903.4280.379064
1versicolor5.9360.5161712.7700.313798
2virginica6.5880.6358802.9740.322497
\n", + "
" + ], + "text/plain": [ + " Species Sepal_Length.fn1 Sepal_Length.fn2 Sepal_Width.fn1 \\\n", + "0 setosa 5.006 0.352490 3.428 \n", + "1 versicolor 5.936 0.516171 2.770 \n", + "2 virginica 6.588 0.635880 2.974 \n", + "\n", + " Sepal_Width.fn2 \n", + "0 0.379064 \n", + "1 0.313798 \n", + "2 0.322497 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris >> group_by(f.Species) >> summarise(\n", + " across(starts_with(\"Sepal\"), [mean, sd], _names = \"{_col}.fn{_fn1}\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, "outputs": [ { "data": { @@ -1392,7 +1466,7 @@ " \n", " \n", " \n", - " 15\n", + " 0\n", " 5.7\n", " 4.4\n", " 1.5\n", @@ -1400,7 +1474,7 @@ " setosa\n", " \n", " \n", - " 32\n", + " 1\n", " 5.2\n", " 4.1\n", " 1.5\n", @@ -1408,7 +1482,7 @@ " setosa\n", " \n", " \n", - " 33\n", + " 2\n", " 5.5\n", " 4.2\n", " 1.4\n", @@ -1420,13 +1494,13 @@ "" ], "text/plain": [ - " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", - "15 5.7 4.4 1.5 0.4 setosa\n", - "32 5.2 4.1 1.5 0.1 setosa\n", - "33 5.5 4.2 1.4 0.2 setosa" + " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", + "0 5.7 4.4 1.5 0.4 setosa\n", + "1 5.2 4.1 1.5 0.1 setosa\n", + "2 5.5 4.2 1.4 0.2 setosa" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1437,7 +1511,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1470,7 +1544,7 @@ " \n", " \n", " \n", - " 100\n", + " 0\n", " 6.3\n", " 3.3\n", " 6.0\n", @@ -1478,7 +1552,7 @@ " virginica\n", " \n", " \n", - " 102\n", + " 1\n", " 7.1\n", " 3.0\n", " 5.9\n", @@ -1486,7 +1560,7 @@ " virginica\n", " \n", " \n", - " 104\n", + " 2\n", " 6.5\n", " 3.0\n", " 5.8\n", @@ -1494,7 +1568,7 @@ " virginica\n", " \n", " \n", - " 105\n", + " 3\n", " 7.6\n", " 3.0\n", " 6.6\n", @@ -1502,7 +1576,7 @@ " virginica\n", " \n", " \n", - " 109\n", + " 4\n", " 7.2\n", " 3.6\n", " 6.1\n", @@ -1510,7 +1584,7 @@ " virginica\n", " \n", " \n", - " 112\n", + " 5\n", " 6.8\n", " 3.0\n", " 5.5\n", @@ -1518,7 +1592,7 @@ " virginica\n", " \n", " \n", - " 114\n", + " 6\n", " 5.8\n", " 2.8\n", " 5.1\n", @@ -1526,7 +1600,7 @@ " virginica\n", " \n", " \n", - " 115\n", + " 7\n", " 6.4\n", " 3.2\n", " 5.3\n", @@ -1534,7 +1608,7 @@ " virginica\n", " \n", " \n", - " 117\n", + " 8\n", " 7.7\n", " 3.8\n", " 6.7\n", @@ -1542,7 +1616,7 @@ " virginica\n", " \n", " \n", - " 118\n", + " 9\n", " 7.7\n", " 2.6\n", " 6.9\n", @@ -1550,7 +1624,7 @@ " virginica\n", " \n", " \n", - " 120\n", + " 10\n", " 6.9\n", " 3.2\n", " 5.7\n", @@ -1558,7 +1632,7 @@ " virginica\n", " \n", " \n", - " 124\n", + " 11\n", " 6.7\n", " 3.3\n", " 5.7\n", @@ -1566,7 +1640,7 @@ " virginica\n", " \n", " \n", - " 128\n", + " 12\n", " 6.4\n", " 2.8\n", " 5.6\n", @@ -1574,7 +1648,7 @@ " virginica\n", " \n", " \n", - " 132\n", + " 13\n", " 6.4\n", " 2.8\n", " 5.6\n", @@ -1582,7 +1656,7 @@ " virginica\n", " \n", " \n", - " 135\n", + " 14\n", " 7.7\n", " 3.0\n", " 6.1\n", @@ -1590,7 +1664,7 @@ " virginica\n", " \n", " \n", - " 136\n", + " 15\n", " 6.3\n", " 3.4\n", " 5.6\n", @@ -1598,7 +1672,7 @@ " virginica\n", " \n", " \n", - " 139\n", + " 16\n", " 6.9\n", " 3.1\n", " 5.4\n", @@ -1606,7 +1680,7 @@ " virginica\n", " \n", " \n", - " 140\n", + " 17\n", " 6.7\n", " 3.1\n", " 5.6\n", @@ -1614,7 +1688,7 @@ " virginica\n", " \n", " \n", - " 141\n", + " 18\n", " 6.9\n", " 3.1\n", " 5.1\n", @@ -1622,7 +1696,7 @@ " 
virginica\n", " \n", " \n", - " 143\n", + " 19\n", " 6.8\n", " 3.2\n", " 5.9\n", @@ -1630,7 +1704,7 @@ " virginica\n", " \n", " \n", - " 144\n", + " 20\n", " 6.7\n", " 3.3\n", " 5.7\n", @@ -1638,7 +1712,7 @@ " virginica\n", " \n", " \n", - " 145\n", + " 21\n", " 6.7\n", " 3.0\n", " 5.2\n", @@ -1646,7 +1720,7 @@ " virginica\n", " \n", " \n", - " 148\n", + " 22\n", " 6.2\n", " 3.4\n", " 5.4\n", @@ -1658,33 +1732,33 @@ "" ], "text/plain": [ - " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", - "100 6.3 3.3 6.0 2.5 virginica\n", - "102 7.1 3.0 5.9 2.1 virginica\n", - "104 6.5 3.0 5.8 2.2 virginica\n", - "105 7.6 3.0 6.6 2.1 virginica\n", - "109 7.2 3.6 6.1 2.5 virginica\n", - "112 6.8 3.0 5.5 2.1 virginica\n", - "114 5.8 2.8 5.1 2.4 virginica\n", - "115 6.4 3.2 5.3 2.3 virginica\n", - "117 7.7 3.8 6.7 2.2 virginica\n", - "118 7.7 2.6 6.9 2.3 virginica\n", - "120 6.9 3.2 5.7 2.3 virginica\n", - "124 6.7 3.3 5.7 2.1 virginica\n", - "128 6.4 2.8 5.6 2.1 virginica\n", - "132 6.4 2.8 5.6 2.2 virginica\n", - "135 7.7 3.0 6.1 2.3 virginica\n", - "136 6.3 3.4 5.6 2.4 virginica\n", - "139 6.9 3.1 5.4 2.1 virginica\n", - "140 6.7 3.1 5.6 2.4 virginica\n", - "141 6.9 3.1 5.1 2.3 virginica\n", - "143 6.8 3.2 5.9 2.3 virginica\n", - "144 6.7 3.3 5.7 2.5 virginica\n", - "145 6.7 3.0 5.2 2.3 virginica\n", - "148 6.2 3.4 5.4 2.3 virginica" + " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", + "0 6.3 3.3 6.0 2.5 virginica\n", + "1 7.1 3.0 5.9 2.1 virginica\n", + "2 6.5 3.0 5.8 2.2 virginica\n", + "3 7.6 3.0 6.6 2.1 virginica\n", + "4 7.2 3.6 6.1 2.5 virginica\n", + "5 6.8 3.0 5.5 2.1 virginica\n", + "6 5.8 2.8 5.1 2.4 virginica\n", + "7 6.4 3.2 5.3 2.3 virginica\n", + "8 7.7 3.8 6.7 2.2 virginica\n", + "9 7.7 2.6 6.9 2.3 virginica\n", + "10 6.9 3.2 5.7 2.3 virginica\n", + "11 6.7 3.3 5.7 2.1 virginica\n", + "12 6.4 2.8 5.6 2.1 virginica\n", + "13 6.4 2.8 5.6 2.2 virginica\n", + "14 7.7 3.0 6.1 2.3 virginica\n", + "15 6.3 3.4 5.6 2.4 virginica\n", + "16 6.9 3.1 5.4 2.1 virginica\n", + "17 6.7 3.1 5.6 2.4 virginica\n", + "18 6.9 3.1 5.1 2.3 virginica\n", + "19 6.8 3.2 5.9 2.3 virginica\n", + "20 6.7 3.3 5.7 2.5 virginica\n", + "21 6.7 3.0 5.2 2.3 virginica\n", + "22 6.2 3.4 5.4 2.3 virginica" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1695,7 +1769,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "tags": [] }, @@ -1733,72 +1807,72 @@ " \n", " \n", " 0\n", - " 1\n", - " 0.569361\n", - " 0.976979\n", - " 0.526496\n", - " 0.884346\n", - " 2.957183\n", - " 0.224866\n", + " 1.0\n", + " 0.248253\n", + " 0.269296\n", + " 0.402731\n", + " 0.548187\n", + " 1.468467\n", + " 0.138747\n", " \n", " \n", " 1\n", - " 2\n", - " 0.732912\n", - " 0.242606\n", - " 0.686552\n", - " 0.930933\n", - " 2.593004\n", - " 0.290455\n", + " 2.0\n", + " 0.591192\n", + " 0.529841\n", + " 0.291233\n", + " 0.429670\n", + " 1.841936\n", + " 0.131010\n", " \n", " \n", " 2\n", - " 3\n", - " 0.747643\n", - " 0.526913\n", - " 0.988250\n", - " 0.759010\n", - " 3.021816\n", - " 0.188413\n", + " 3.0\n", + " 0.083661\n", + " 0.866707\n", + " 0.730725\n", + " 0.399703\n", + " 2.080796\n", + " 0.350939\n", " \n", " \n", " 3\n", - " 4\n", - " 0.544264\n", - " 0.228770\n", - " 0.710761\n", - " 0.199676\n", - " 1.683472\n", - " 0.248390\n", + " 4.0\n", + " 0.366326\n", + " 0.447924\n", + " 0.355393\n", + " 0.370590\n", + " 1.540233\n", + " 0.042397\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id w x y z sum sd\n", - 
"0 1 0.569361 0.976979 0.526496 0.884346 2.957183 0.224866\n", - "1 2 0.732912 0.242606 0.686552 0.930933 2.593004 0.290455\n", - "2 3 0.747643 0.526913 0.988250 0.759010 3.021816 0.188413\n", - "3 4 0.544264 0.228770 0.710761 0.199676 1.683472 0.248390" + " id w x y z sum sd\n", + "0 1.0 0.248253 0.269296 0.402731 0.548187 1.468467 0.138747\n", + "1 2.0 0.591192 0.529841 0.291233 0.429670 1.841936 0.131010\n", + "2 3.0 0.083661 0.866707 0.730725 0.399703 2.080796 0.350939\n", + "3 4.0 0.366326 0.447924 0.355393 0.370590 1.540233 0.042397" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = tibble(\n", - " id=[1, 2, 3, 4], \n", + " id=[1, 2, 3, 4],\n", " w=runif(4), \n", " x=runif(4), \n", " y=runif(4), \n", " z=runif(4)\n", ")\n", "df >> rowwise() >> mutate(\n", - " sum = c_across(f[f.w:f.z], sum), # function has to be called inside c_across\n", - " sd = c_across(f[f.w:f.z], sd)\n", + " sum = sum(c_across(f[f.w:f.z])),\n", + " sd = sd(c_across(f[f.w:f.z]))\n", ")" ] }, diff --git a/examples/arrange.ipynb b/docs/notebooks/arrange.ipynb similarity index 75% rename from examples/arrange.ipynb rename to docs/notebooks/arrange.ipynb index b0df60e5..991c04fa 100644 --- a/examples/arrange.ipynb +++ b/docs/notebooks/arrange.ipynb @@ -6,588 +6,45 @@ "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# https://dplyr.tidyverse.org/reference/arrange.html\n", - "\n", - "from datar.datasets import mtcars, iris\n", - "from datar.all import *" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mpgcyldisphpdratwtqsecvsamgearcarb
Lincoln Continental10.48460.02153.005.42417.820034
Ford Pantera L15.88351.02644.223.17014.500154
Pontiac Firebird19.28400.01753.083.84517.050032
Hornet Sportabout18.78360.01753.153.44017.020032
Camaro Z2813.38350.02453.733.84015.410034
Duster 36014.38360.02453.213.57015.840034
AMC Javelin15.28304.01503.153.43517.300032
Dodge Challenger15.58318.01502.763.52016.870032
Chrysler Imperial14.78440.02303.235.34517.420034
Maserati Bora15.08301.03353.543.57014.600158
Merc 450SE16.48275.81803.074.07017.400033
Merc 450SL17.38275.81803.073.73017.600033
Merc 450SLC15.28275.81803.073.78018.000033
Cadillac Fleetwood10.48472.02052.935.25017.980034
Ferrari Dino19.76145.01753.622.77015.500156
Mazda RX421.06160.01103.902.62016.460144
Merc 28019.26167.61233.923.44018.301044
Valiant18.16225.01052.763.46020.221031
Hornet 4 Drive21.46258.01103.083.21519.441031
Mazda RX4 Wag21.06160.01103.902.87517.020144
Merc 280C17.86167.61233.923.44018.901044
Fiat 12832.4478.7664.082.20019.471141
Honda Civic30.4475.7524.931.61518.521142
Toyota Corolla33.9471.1654.221.83519.901141
Toyota Corona21.54120.1973.702.46520.011031
Merc 23022.84140.8953.923.15022.901042
Merc 240D24.44146.7623.693.19020.001042
Fiat X1-927.3479.0664.081.93518.901141
Porsche 914-226.04120.3914.432.14016.700152
Lotus Europa30.4495.11133.771.51316.901152
Datsun 71022.84108.0933.852.32018.611141
Volvo 142E21.44121.01094.112.78018.601142
\n", - "
" - ], - "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "\n", - " carb \n", - "Lincoln Continental 4 \n", - "Ford Pantera L 4 \n", - "Pontiac Firebird 2 \n", - "Hornet Sportabout 2 \n", - "Camaro Z28 4 \n", - "Duster 360 4 \n", - "AMC Javelin 2 \n", - "Dodge Challenger 2 \n", - "Chrysler Imperial 4 \n", - "Maserati Bora 8 \n", - "Merc 450SE 3 \n", - "Merc 450SL 3 \n", - "Merc 450SLC 3 \n", - "Cadillac Fleetwood 4 \n", - "Ferrari Dino 6 \n", - "Mazda RX4 4 \n", - "Merc 280 4 \n", - "Valiant 1 \n", - "Hornet 4 Drive 1 \n", - "Mazda RX4 Wag 4 \n", - "Merc 280C 4 \n", - "Fiat 128 1 \n", - "Honda Civic 2 \n", - "Toyota Corolla 1 \n", - "Toyota Corona 1 \n", - "Merc 230 2 \n", - "Merc 240D 2 \n", - "Fiat X1-9 1 \n", - "Porsche 914-2 2 \n", - "Lotus Europa 2 \n", - "Datsun 710 1 \n", - "Volvo 142E 2 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "orders the rows of a data frame by the values of selected columns.\n", + "\n", + " The original API:\n", + " https://dplyr.tidyverse.org/reference/arrange.html\n", + "\n", + " Args:\n", + " _data: A data frame\n", + " *series: Variables, or functions of variables.\n", + " Use desc() to sort a variable in descending order.\n", + " _by_group: If TRUE, will sort first by grouping variable.\n", + " Applies to grouped data frames only.\n", + " **kwargs: Name-value pairs that apply with mutate\n", + "\n", + " Returns:\n", + " An object of the same type as _data.\n", + " The output has the following properties:\n", + " All 
rows appear in the output, but (usually) in a different place.\n", + " Columns are not modified.\n", + " Groups are not modified.\n", + " Data frame attributes are preserved.\n", + " \n" + ] } ], "source": [ - "mtcars >> arrange(-f.cyl)" + "from datar.datasets import mtcars, iris\n", + "from datar.all import *\n", + "\n", + "print(arrange.__doc__)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -1147,19 +604,18 @@ "Cadillac Fleetwood 4 " ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# f-expression can only be used in piping\n", - "mtcars >> arrange(f.cyl, f.disp)" + "arrange(mtcars, f.cyl, f.disp)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -1422,28 +878,28 @@ " 1\n", " \n", " \n", - " Merc 280C\n", - " 17.8\n", + " Merc 280\n", + " 19.2\n", " 6\n", " 167.6\n", " 123\n", " 3.92\n", " 3.440\n", - " 18.90\n", + " 18.30\n", " 1\n", " 0\n", " 4\n", " 4\n", " \n", " \n", - " Merc 280\n", - " 19.2\n", + " Merc 280C\n", + " 17.8\n", " 6\n", " 167.6\n", " 123\n", " 3.92\n", " 3.440\n", - " 18.30\n", + " 18.90\n", " 1\n", " 0\n", " 4\n", @@ -1667,8 +1123,8 @@ "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", + "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", @@ -1701,8 +1157,8 @@ "Merc 450SLC 3 \n", "Hornet 4 Drive 1 \n", "Valiant 1 \n", - "Merc 280C 4 \n", "Merc 280 4 \n", + "Merc 280C 4 \n", "Mazda RX4 4 \n", "Mazda RX4 Wag 4 \n", "Merc 240D 2 \n", @@ -1719,7 +1175,7 @@ "Toyota Corolla 1 " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -1730,7 +1186,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -1769,7 +1225,7 @@ " \n", " \n", " \n", - " Lincoln Continental\n", + " 15\n", " 10.4\n", " 8\n", " 460.0\n", @@ -1783,7 +1239,7 @@ " 4\n", " \n", " \n", - " Chrysler Imperial\n", + " 16\n", " 14.7\n", " 8\n", " 440.0\n", @@ -1797,7 +1253,7 @@ " 4\n", " \n", " \n", - " Cadillac Fleetwood\n", + " 14\n", " 10.4\n", " 8\n", " 472.0\n", @@ -1811,7 +1267,7 @@ " 4\n", " \n", " \n", - " Merc 450SE\n", + " 11\n", " 16.4\n", " 8\n", " 275.8\n", @@ -1825,7 +1281,7 @@ " 3\n", " \n", " \n", - " Pontiac Firebird\n", + " 24\n", " 19.2\n", " 8\n", " 400.0\n", @@ -1839,7 +1295,7 @@ " 2\n", " \n", " \n", - " Camaro Z28\n", + " 23\n", " 13.3\n", " 8\n", " 350.0\n", @@ -1853,7 +1309,7 @@ " 4\n", " \n", " \n", - " Merc 450SLC\n", + " 13\n", " 15.2\n", " 8\n", " 275.8\n", @@ -1867,7 +1323,7 @@ " 3\n", " \n", " \n", - " Merc 450SL\n", + " 12\n", " 17.3\n", " 8\n", " 275.8\n", @@ -1881,7 +1337,7 @@ " 3\n", " \n", " \n", - " Maserati Bora\n", + " 30\n", " 15.0\n", " 8\n", " 301.0\n", @@ -1895,7 +1351,7 @@ " 8\n", " \n", " \n", - " Duster 360\n", + " 6\n", " 14.3\n", " 8\n", " 360.0\n", @@ -1909,7 +1365,7 @@ " 4\n", " \n", " \n", - " Dodge Challenger\n", + " 21\n", " 15.5\n", " 8\n", " 318.0\n", @@ -1923,7 +1379,7 @@ " 2\n", " \n", " \n", - " Valiant\n", + " 5\n", " 18.1\n", " 6\n", " 225.0\n", @@ 
-1937,21 +1393,7 @@ " 1\n", " \n", " \n", - " Merc 280C\n", - " 17.8\n", - " 6\n", - " 167.6\n", - " 123\n", - " 3.92\n", - " 3.440\n", - " 18.90\n", - " 1\n", - " 0\n", - " 4\n", - " 4\n", - " \n", - " \n", - " Merc 280\n", + " 9\n", " 19.2\n", " 6\n", " 167.6\n", @@ -1965,7 +1407,7 @@ " 4\n", " \n", " \n", - " Hornet Sportabout\n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -1979,7 +1421,21 @@ " 2\n", " \n", " \n", - " AMC Javelin\n", + " 10\n", + " 17.8\n", + " 6\n", + " 167.6\n", + " 123\n", + " 3.92\n", + " 3.440\n", + " 18.90\n", + " 1\n", + " 0\n", + " 4\n", + " 4\n", + " \n", + " \n", + " 22\n", " 15.2\n", " 8\n", " 304.0\n", @@ -1993,7 +1449,7 @@ " 2\n", " \n", " \n", - " Hornet 4 Drive\n", + " 3\n", " 21.4\n", " 6\n", " 258.0\n", @@ -2007,7 +1463,7 @@ " 1\n", " \n", " \n", - " Merc 240D\n", + " 7\n", " 24.4\n", " 4\n", " 146.7\n", @@ -2021,7 +1477,7 @@ " 2\n", " \n", " \n", - " Ford Pantera L\n", + " 28\n", " 15.8\n", " 8\n", " 351.0\n", @@ -2035,7 +1491,7 @@ " 4\n", " \n", " \n", - " Merc 230\n", + " 8\n", " 22.8\n", " 4\n", " 140.8\n", @@ -2049,7 +1505,7 @@ " 2\n", " \n", " \n", - " Mazda RX4 Wag\n", + " 1\n", " 21.0\n", " 6\n", " 160.0\n", @@ -2063,7 +1519,7 @@ " 4\n", " \n", " \n", - " Volvo 142E\n", + " 31\n", " 21.4\n", " 4\n", " 121.0\n", @@ -2077,7 +1533,7 @@ " 2\n", " \n", " \n", - " Ferrari Dino\n", + " 29\n", " 19.7\n", " 6\n", " 145.0\n", @@ -2091,7 +1547,7 @@ " 6\n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -2105,7 +1561,7 @@ " 4\n", " \n", " \n", - " Toyota Corona\n", + " 20\n", " 21.5\n", " 4\n", " 120.1\n", @@ -2119,7 +1575,7 @@ " 1\n", " \n", " \n", - " Datsun 710\n", + " 2\n", " 22.8\n", " 4\n", " 108.0\n", @@ -2133,7 +1589,7 @@ " 1\n", " \n", " \n", - " Fiat 128\n", + " 17\n", " 32.4\n", " 4\n", " 78.7\n", @@ -2147,7 +1603,7 @@ " 1\n", " \n", " \n", - " Porsche 914-2\n", + " 26\n", " 26.0\n", " 4\n", " 120.3\n", @@ -2161,7 +1617,7 @@ " 2\n", " \n", " \n", - " Fiat X1-9\n", + " 25\n", " 27.3\n", " 4\n", " 79.0\n", @@ -2175,7 +1631,7 @@ " 1\n", " \n", " \n", - " Toyota Corolla\n", + " 19\n", " 33.9\n", " 4\n", " 71.1\n", @@ -2189,7 +1645,7 @@ " 1\n", " \n", " \n", - " Honda Civic\n", + " 18\n", " 30.4\n", " 4\n", " 75.7\n", @@ -2203,7 +1659,7 @@ " 2\n", " \n", " \n", - " Lotus Europa\n", + " 27\n", " 30.4\n", " 4\n", " 95.1\n", @@ -2221,76 +1677,42 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Ford Pantera L 
15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "\n", - " carb \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Cadillac Fleetwood 4 \n", - "Merc 450SE 3 \n", - "Pontiac Firebird 2 \n", - "Camaro Z28 4 \n", - "Merc 450SLC 3 \n", - "Merc 450SL 3 \n", - "Maserati Bora 8 \n", - "Duster 360 4 \n", - "Dodge Challenger 2 \n", - "Valiant 1 \n", - "Merc 280C 4 \n", - "Merc 280 4 \n", - "Hornet Sportabout 2 \n", - "AMC Javelin 2 \n", - "Hornet 4 Drive 1 \n", - "Merc 240D 2 \n", - "Ford Pantera L 4 \n", - "Merc 230 2 \n", - "Mazda RX4 Wag 4 \n", - "Volvo 142E 2 \n", - "Ferrari Dino 6 \n", - "Mazda RX4 4 \n", - "Toyota Corona 1 \n", - "Datsun 710 1 \n", - "Fiat 128 1 \n", - "Porsche 914-2 2 \n", - "Fiat X1-9 1 \n", - "Toyota Corolla 1 \n", - "Honda Civic 2 \n", - "Lotus Europa 2 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", + "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", + "23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + "30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", + "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2\n", + "29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", + "25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", + "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -2302,7 +1724,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -2341,7 +1763,7 @@ " \n", " \n", " \n", - " Merc 240D\n", + " 7\n", 
" 24.4\n", " 4\n", " 146.7\n", @@ -2355,7 +1777,7 @@ " 2\n", " \n", " \n", - " Merc 230\n", + " 8\n", " 22.8\n", " 4\n", " 140.8\n", @@ -2369,7 +1791,7 @@ " 2\n", " \n", " \n", - " Volvo 142E\n", + " 31\n", " 21.4\n", " 4\n", " 121.0\n", @@ -2383,7 +1805,7 @@ " 2\n", " \n", " \n", - " Toyota Corona\n", + " 20\n", " 21.5\n", " 4\n", " 120.1\n", @@ -2397,7 +1819,7 @@ " 1\n", " \n", " \n", - " Datsun 710\n", + " 2\n", " 22.8\n", " 4\n", " 108.0\n", @@ -2411,7 +1833,7 @@ " 1\n", " \n", " \n", - " Fiat 128\n", + " 17\n", " 32.4\n", " 4\n", " 78.7\n", @@ -2425,7 +1847,7 @@ " 1\n", " \n", " \n", - " Porsche 914-2\n", + " 26\n", " 26.0\n", " 4\n", " 120.3\n", @@ -2439,7 +1861,7 @@ " 2\n", " \n", " \n", - " Fiat X1-9\n", + " 25\n", " 27.3\n", " 4\n", " 79.0\n", @@ -2453,7 +1875,7 @@ " 1\n", " \n", " \n", - " Toyota Corolla\n", + " 19\n", " 33.9\n", " 4\n", " 71.1\n", @@ -2467,7 +1889,7 @@ " 1\n", " \n", " \n", - " Honda Civic\n", + " 18\n", " 30.4\n", " 4\n", " 75.7\n", @@ -2481,7 +1903,7 @@ " 2\n", " \n", " \n", - " Lotus Europa\n", + " 27\n", " 30.4\n", " 4\n", " 95.1\n", @@ -2495,7 +1917,7 @@ " 2\n", " \n", " \n", - " Valiant\n", + " 5\n", " 18.1\n", " 6\n", " 225.0\n", @@ -2509,7 +1931,7 @@ " 1\n", " \n", " \n", - " Merc 280\n", + " 9\n", " 19.2\n", " 6\n", " 167.6\n", @@ -2523,7 +1945,7 @@ " 4\n", " \n", " \n", - " Merc 280C\n", + " 10\n", " 17.8\n", " 6\n", " 167.6\n", @@ -2537,7 +1959,7 @@ " 4\n", " \n", " \n", - " Hornet 4 Drive\n", + " 3\n", " 21.4\n", " 6\n", " 258.0\n", @@ -2551,7 +1973,7 @@ " 1\n", " \n", " \n", - " Mazda RX4 Wag\n", + " 1\n", " 21.0\n", " 6\n", " 160.0\n", @@ -2565,7 +1987,7 @@ " 4\n", " \n", " \n", - " Ferrari Dino\n", + " 29\n", " 19.7\n", " 6\n", " 145.0\n", @@ -2579,7 +2001,7 @@ " 6\n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -2593,7 +2015,7 @@ " 4\n", " \n", " \n", - " Lincoln Continental\n", + " 15\n", " 10.4\n", " 8\n", " 460.0\n", @@ -2607,7 +2029,7 @@ " 4\n", " \n", " \n", - " Chrysler Imperial\n", + " 16\n", " 14.7\n", " 8\n", " 440.0\n", @@ -2621,7 +2043,7 @@ " 4\n", " \n", " \n", - " Cadillac Fleetwood\n", + " 14\n", " 10.4\n", " 8\n", " 472.0\n", @@ -2635,7 +2057,7 @@ " 4\n", " \n", " \n", - " Merc 450SE\n", + " 11\n", " 16.4\n", " 8\n", " 275.8\n", @@ -2649,7 +2071,7 @@ " 3\n", " \n", " \n", - " Pontiac Firebird\n", + " 24\n", " 19.2\n", " 8\n", " 400.0\n", @@ -2663,7 +2085,7 @@ " 2\n", " \n", " \n", - " Camaro Z28\n", + " 23\n", " 13.3\n", " 8\n", " 350.0\n", @@ -2677,7 +2099,7 @@ " 4\n", " \n", " \n", - " Merc 450SLC\n", + " 13\n", " 15.2\n", " 8\n", " 275.8\n", @@ -2691,7 +2113,7 @@ " 3\n", " \n", " \n", - " Merc 450SL\n", + " 12\n", " 17.3\n", " 8\n", " 275.8\n", @@ -2705,7 +2127,7 @@ " 3\n", " \n", " \n", - " Duster 360\n", + " 6\n", " 14.3\n", " 8\n", " 360.0\n", @@ -2719,7 +2141,7 @@ " 4\n", " \n", " \n", - " Maserati Bora\n", + " 30\n", " 15.0\n", " 8\n", " 301.0\n", @@ -2733,7 +2155,7 @@ " 8\n", " \n", " \n", - " Dodge Challenger\n", + " 21\n", " 15.5\n", " 8\n", " 318.0\n", @@ -2747,7 +2169,7 @@ " 2\n", " \n", " \n", - " Hornet Sportabout\n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -2761,7 +2183,7 @@ " 2\n", " \n", " \n", - " AMC Javelin\n", + " 22\n", " 15.2\n", " 8\n", " 304.0\n", @@ -2775,7 +2197,7 @@ " 2\n", " \n", " \n", - " Ford Pantera L\n", + " 28\n", " 15.8\n", " 8\n", " 351.0\n", @@ -2793,76 +2215,42 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Volvo 
142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "\n", - " carb \n", - "Merc 240D 2 \n", - "Merc 230 2 \n", - "Volvo 142E 2 \n", - "Toyota Corona 1 \n", - "Datsun 710 1 \n", - "Fiat 128 1 \n", - "Porsche 914-2 2 \n", - "Fiat X1-9 1 \n", - "Toyota Corolla 1 \n", - "Honda Civic 2 \n", - "Lotus Europa 2 \n", - "Valiant 1 \n", - "Merc 280 4 \n", - "Merc 280C 4 \n", - "Hornet 4 Drive 1 \n", - "Mazda RX4 Wag 4 \n", - "Ferrari Dino 6 \n", - "Mazda RX4 4 \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Cadillac Fleetwood 4 \n", - "Merc 450SE 3 \n", - "Pontiac Firebird 2 \n", - "Camaro Z28 4 \n", - "Merc 450SLC 3 \n", - "Merc 450SL 3 \n", - "Duster 360 4 \n", - "Maserati Bora 8 \n", - "Dodge Challenger 2 \n", - "Hornet Sportabout 2 \n", - "AMC Javelin 2 \n", - "Ford Pantera L 4 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2\n", + "20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", + "25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", + "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "15 10.4 8 460.0 215 
3.00 5.424 17.82 0 0 3 4\n", + "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", + "23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", + "21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", + "28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -2873,7 +2261,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -3433,7 +2821,7 @@ "Toyota Corolla 1 " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -3448,7 +2836,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -3590,7 +2978,7 @@ "[150 rows x 5 columns]" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -3601,7 +2989,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -3743,7 +3131,7 @@ "[150 rows x 5 columns]" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/base.ipynb b/docs/notebooks/base.ipynb similarity index 97% rename from examples/base.ipynb rename to docs/notebooks/base.ipynb index 52b36596..f6d525b4 100644 --- a/examples/base.ipynb +++ b/docs/notebooks/base.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "sporting-necklace", + "id": "stunning-island", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "sealed-venture", + "id": "instructional-plymouth", "metadata": {}, "outputs": [ { @@ -47,7 +47,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "immune-copyright", + "id": "radio-rough", "metadata": {}, "outputs": [ { @@ -105,7 +105,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "future-disability", + "id": "periodic-wagner", "metadata": {}, "outputs": [ { @@ -126,7 +126,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "familiar-counter", + "id": "anonymous-romance", "metadata": {}, "outputs": [ { @@ -183,7 +183,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "precise-island", + "id": "falling-bishop", "metadata": {}, "outputs": [ { @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "composite-story", + "id": "offshore-silver", "metadata": {}, "outputs": [ { @@ -261,7 +261,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "abroad-minnesota", + "id": "robust-memory", "metadata": {}, "outputs": [ { @@ -282,7 +282,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "stylish-experience", + "id": "exceptional-record", "metadata": {}, "outputs": [ { @@ -303,7 +303,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "vietnamese-empty", + "id": "apparent-shell", "metadata": {}, "outputs": [ { @@ -324,7 +324,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "automated-pasta", + "id": "latin-shooting", "metadata": {}, "outputs": [ { @@ -395,7 +395,7 
@@ { "cell_type": "code", "execution_count": 12, - "id": "demonstrated-entry", + "id": "joined-emission", "metadata": {}, "outputs": [ { @@ -470,7 +470,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "fresh-stocks", + "id": "derived-spain", "metadata": {}, "outputs": [ { @@ -531,7 +531,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "spread-evanescence", + "id": "unexpected-salon", "metadata": {}, "outputs": [ { @@ -602,7 +602,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "dutch-bleeding", + "id": "handled-asthma", "metadata": {}, "outputs": [ { @@ -673,7 +673,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "surgical-honor", + "id": "unusual-adapter", "metadata": {}, "outputs": [ { @@ -756,7 +756,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "generic-granny", + "id": "recovered-mother", "metadata": {}, "outputs": [ { diff --git a/examples/between.ipynb b/docs/notebooks/between.ipynb similarity index 72% rename from examples/between.ipynb rename to docs/notebooks/between.ipynb index 94a20337..1c1970fc 100644 --- a/examples/between.ipynb +++ b/docs/notebooks/between.ipynb @@ -2,21 +2,32 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, - "id": "aggressive-precipitation", + "execution_count": 6, + "id": "limiting-briefing", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Function version of `left <= x <= right`, which cannot do it rowwisely\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/between.html\n", "\n", "from datar.datasets import starwars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(between.__doc__)" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "senior-colon", + "execution_count": 2, + "id": "placed-rubber", "metadata": {}, "outputs": [ { @@ -37,7 +48,7 @@ "dtype: bool" ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -48,31 +59,30 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "cleared-season", + "execution_count": 3, + "id": "bored-internet", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([ 0.88053148, -0.5351542 , 0.47564723, -0.19307227, -0.28602512,\n", - " -0.00822481, -0.41392545, 0.26630982, 0.13285379, 0.24227996,\n", - " 0.41755305, -0.69736888, -0.72390425, 0.21258702, 0.28975535,\n", - " 0.72626841, 0.10291894, 0.85700353, -0.25442961, 0.11728744,\n", - " 0.54529179, 0.08898517, 0.04798773, -0.58060331, 0.78023119,\n", - " -0.28043406, 0.97063715, 0.43759287, 0.69674501, -0.19616799,\n", - " -0.60318492, -0.08496449, 0.73840085, 0.2638862 , 0.32608722,\n", - " -0.62350829, -0.28049812, 0.10954724, 0.81647738, 0.24890757,\n", - " 0.50733862, 0.99845194, 0.61920947, 0.67987384, 0.92613896,\n", - " -0.4026945 , -0.5155664 , 0.57855208, 0.29903044, 0.61692408,\n", - " -0.05965839, -0.43687307, 0.99029965, 0.5494035 , -0.46630694,\n", - " 0.92380197, 0.82790729, -0.70980125, -0.21627509, 0.73339347,\n", - " 0.19704249, -0.26683036, 0.46680121, -0.54279255, -0.45049086,\n", - " -0.02378481, 0.25545022, 0.46919449, 0.46910124, -0.03904592,\n", - " -0.99363742, -0.36300745, -0.82733344, 0.09962891])" + "array([-0.28741651, -0.03757609, 0.05969333, 0.43484369, -0.84572932,\n", + " -0.76224083, 0.10370117, 0.06591327, -0.33523095, -0.87523847,\n", + " 0.96762856, -0.5337002 , 0.15029452, 0.14013877, 0.38197503,\n", + " -0.74342373, 0.52214141, -0.42011358, -0.0897477 , 
-0.69270735,\n", + " -0.85548524, 0.48818403, -0.07457817, 0.34201212, -0.5702833 ,\n", + " 0.90942721, -0.45274751, -0.42862831, -0.91396287, -0.65214425,\n", + " -0.58482241, -0.26079899, -0.09459455, 0.47454913, 0.79451831,\n", + " -0.3507544 , 0.11577005, 0.83433367, -0.5751406 , 0.45737957,\n", + " -0.76638515, 0.48572847, 0.52373804, -0.24637071, 0.79404125,\n", + " 0.18531304, 0.62905851, 0.86022313, -0.99388915, 0.40973193,\n", + " 0.00902842, -0.3114048 , -0.6296591 , 0.98194554, -0.66646065,\n", + " -0.84659128, 0.50355201, 0.92087997, -0.83196487, -0.78734087,\n", + " -0.23976344, -0.49397098, 0.56756236, -0.28145025, 0.82398199,\n", + " -0.81643294])" ] }, - "execution_count": 9, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -84,8 +94,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "promotional-domestic", + "execution_count": 4, + "id": "thirty-freedom", "metadata": {}, "outputs": [ { @@ -124,7 +134,7 @@ " \n", " \n", " \n", - " 4\n", + " 0\n", " Leia Organa\n", " 150.0\n", " 49.0\n", @@ -138,7 +148,7 @@ " Human\n", " \n", " \n", - " 26\n", + " 1\n", " Mon Mothma\n", " 150.0\n", " NaN\n", @@ -152,7 +162,7 @@ " Human\n", " \n", " \n", - " 37\n", + " 2\n", " Watto\n", " 137.0\n", " NaN\n", @@ -166,7 +176,7 @@ " Toydarian\n", " \n", " \n", - " 38\n", + " 3\n", " Sebulba\n", " 112.0\n", " 40.0\n", @@ -180,7 +190,7 @@ " Dug\n", " \n", " \n", - " 45\n", + " 4\n", " Gasgano\n", " 122.0\n", " NaN\n", @@ -198,22 +208,22 @@ "" ], "text/plain": [ - " name height mass hair_color skin_color eye_color birth_year \\\n", - "4 Leia Organa 150.0 49.0 brown light brown 19.0 \n", - "26 Mon Mothma 150.0 NaN auburn fair blue 48.0 \n", - "37 Watto 137.0 NaN black blue, grey yellow NaN \n", - "38 Sebulba 112.0 40.0 none grey, red orange NaN \n", - "45 Gasgano 122.0 NaN none white, blue black NaN \n", + " name height mass hair_color skin_color eye_color birth_year \\\n", + "0 Leia Organa 150.0 49.0 brown light brown 19.0 \n", + "1 Mon Mothma 150.0 NaN auburn fair blue 48.0 \n", + "2 Watto 137.0 NaN black blue, grey yellow NaN \n", + "3 Sebulba 112.0 40.0 none grey, red orange NaN \n", + "4 Gasgano 122.0 NaN none white, blue black NaN \n", "\n", - " sex gender homeworld species \n", - "4 female feminine Alderaan Human \n", - "26 female feminine Chandrila Human \n", - "37 male masculine Toydaria Toydarian \n", - "38 male masculine Malastare Dug \n", - "45 male masculine Troiken Xexto " + " sex gender homeworld species \n", + "0 female feminine Alderaan Human \n", + "1 female feminine Chandrila Human \n", + "2 male masculine Toydaria Toydarian \n", + "3 male masculine Malastare Dug \n", + "4 male masculine Troiken Xexto " ] }, - "execution_count": 10, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -224,8 +234,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "abstract-ukraine", + "execution_count": 5, + "id": "complimentary-wrapping", "metadata": {}, "outputs": [ { @@ -264,7 +274,7 @@ " \n", " \n", " \n", - " 2\n", + " 0\n", " R2-D2\n", " 96.0\n", " 32.0\n", @@ -278,7 +288,7 @@ " Droid\n", " \n", " \n", - " 4\n", + " 1\n", " Leia Organa\n", " 150.0\n", " 49.0\n", @@ -292,7 +302,7 @@ " Human\n", " \n", " \n", - " 7\n", + " 2\n", " R5-D4\n", " 97.0\n", " 32.0\n", @@ -306,7 +316,7 @@ " Droid\n", " \n", " \n", - " 18\n", + " 3\n", " Yoda\n", " 66.0\n", " 17.0\n", @@ -320,7 +330,7 @@ " Yoda's species\n", " \n", " \n", - " 26\n", + " 4\n", " Mon Mothma\n", " 150.0\n", " NaN\n", @@ -334,7 +344,7 @@ " Human\n", " \n", " 
\n", - " 28\n", + " 5\n", " Wicket Systri Warrick\n", " 88.0\n", " 20.0\n", @@ -348,7 +358,7 @@ " Ewok\n", " \n", " \n", - " 37\n", + " 6\n", " Watto\n", " 137.0\n", " NaN\n", @@ -362,7 +372,7 @@ " Toydarian\n", " \n", " \n", - " 38\n", + " 7\n", " Sebulba\n", " 112.0\n", " 40.0\n", @@ -376,7 +386,7 @@ " Dug\n", " \n", " \n", - " 44\n", + " 8\n", " Dud Bolt\n", " 94.0\n", " 45.0\n", @@ -390,7 +400,7 @@ " Vulptereen\n", " \n", " \n", - " 45\n", + " 9\n", " Gasgano\n", " 122.0\n", " NaN\n", @@ -404,7 +414,7 @@ " Xexto\n", " \n", " \n", - " 71\n", + " 10\n", " Ratts Tyerell\n", " 79.0\n", " 15.0\n", @@ -418,7 +428,7 @@ " Aleena\n", " \n", " \n", - " 72\n", + " 11\n", " R4-P17\n", " 96.0\n", " NaN\n", @@ -437,35 +447,35 @@ ], "text/plain": [ " name height mass hair_color skin_color eye_color \\\n", - "2 R2-D2 96.0 32.0 NaN white, blue red \n", - "4 Leia Organa 150.0 49.0 brown light brown \n", - "7 R5-D4 97.0 32.0 NaN white, red red \n", - "18 Yoda 66.0 17.0 white green brown \n", - "26 Mon Mothma 150.0 NaN auburn fair blue \n", - "28 Wicket Systri Warrick 88.0 20.0 brown brown brown \n", - "37 Watto 137.0 NaN black blue, grey yellow \n", - "38 Sebulba 112.0 40.0 none grey, red orange \n", - "44 Dud Bolt 94.0 45.0 none blue, grey yellow \n", - "45 Gasgano 122.0 NaN none white, blue black \n", - "71 Ratts Tyerell 79.0 15.0 none grey, blue unknown \n", - "72 R4-P17 96.0 NaN none silver, red red, blue \n", + "0 R2-D2 96.0 32.0 NaN white, blue red \n", + "1 Leia Organa 150.0 49.0 brown light brown \n", + "2 R5-D4 97.0 32.0 NaN white, red red \n", + "3 Yoda 66.0 17.0 white green brown \n", + "4 Mon Mothma 150.0 NaN auburn fair blue \n", + "5 Wicket Systri Warrick 88.0 20.0 brown brown brown \n", + "6 Watto 137.0 NaN black blue, grey yellow \n", + "7 Sebulba 112.0 40.0 none grey, red orange \n", + "8 Dud Bolt 94.0 45.0 none blue, grey yellow \n", + "9 Gasgano 122.0 NaN none white, blue black \n", + "10 Ratts Tyerell 79.0 15.0 none grey, blue unknown \n", + "11 R4-P17 96.0 NaN none silver, red red, blue \n", "\n", " birth_year sex gender homeworld species \n", - "2 33.0 none masculine Naboo Droid \n", - "4 19.0 female feminine Alderaan Human \n", - "7 NaN none masculine Tatooine Droid \n", - "18 896.0 male masculine NaN Yoda's species \n", - "26 48.0 female feminine Chandrila Human \n", - "28 8.0 male masculine Endor Ewok \n", - "37 NaN male masculine Toydaria Toydarian \n", - "38 NaN male masculine Malastare Dug \n", - "44 NaN male masculine Vulpter Vulptereen \n", - "45 NaN male masculine Troiken Xexto \n", - "71 NaN male masculine Aleen Minor Aleena \n", - "72 NaN none feminine NaN Droid " + "0 33.0 none masculine Naboo Droid \n", + "1 19.0 female feminine Alderaan Human \n", + "2 NaN none masculine Tatooine Droid \n", + "3 896.0 male masculine NaN Yoda's species \n", + "4 48.0 female feminine Chandrila Human \n", + "5 8.0 male masculine Endor Ewok \n", + "6 NaN male masculine Toydaria Toydarian \n", + "7 NaN male masculine Malastare Dug \n", + "8 NaN male masculine Vulpter Vulptereen \n", + "9 NaN male masculine Troiken Xexto \n", + "10 NaN male masculine Aleen Minor Aleena \n", + "11 NaN none feminine NaN Droid " ] }, - "execution_count": 14, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/bind.ipynb b/docs/notebooks/bind.ipynb similarity index 69% rename from examples/bind.ipynb rename to docs/notebooks/bind.ipynb index a6c41af1..8632b2d6 100644 --- a/examples/bind.ipynb +++ b/docs/notebooks/bind.ipynb @@ -3,20 +3,50 @@ { "cell_type": "code", 
"execution_count": 1, - "id": "authorized-blogger", + "id": "hydraulic-earth", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bind rows of give dataframes\n", + "\n", + " Args:\n", + " _data: The seed dataframe to bind others\n", + " Could be a dict or a list, keys/indexes will be used for _id col\n", + " *datas: Other dataframes to combine\n", + " _id: The name of the id columns\n", + " **kwargs: A mapping of dataframe, keys will be used as _id col.\n", + "\n", + " Returns:\n", + " The combined dataframe\n", + " \n", + "Bind columns of give dataframes\n", + "\n", + " Args:\n", + " _data, *datas: Dataframes to combine\n", + "\n", + " Returns:\n", + " The combined dataframe\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/bind.html\n", "\n", "from datar.datasets import starwars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(bind_rows.__doc__)\n", + "print(bind_cols.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "paperback-bubble", + "id": "boxed-independence", "metadata": {}, "outputs": [ { @@ -111,7 +141,7 @@ " Human\n", " \n", " \n", - " 9\n", + " 4\n", " Obi-Wan Kenobi\n", " 182.0\n", " 77.0\n", @@ -125,7 +155,7 @@ " Human\n", " \n", " \n", - " 10\n", + " 5\n", " Anakin Skywalker\n", " 188.0\n", " 84.0\n", @@ -139,7 +169,7 @@ " Human\n", " \n", " \n", - " 11\n", + " 6\n", " Wilhuff Tarkin\n", " 180.0\n", " NaN\n", @@ -153,7 +183,7 @@ " Human\n", " \n", " \n", - " 12\n", + " 7\n", " Chewbacca\n", " 228.0\n", " 112.0\n", @@ -171,25 +201,25 @@ "" ], "text/plain": [ - " name height mass hair_color skin_color eye_color \\\n", - "0 Luke Skywalker 172.0 77.0 blond fair blue \n", - "1 C-3PO 167.0 75.0 NaN gold yellow \n", - "2 R2-D2 96.0 32.0 NaN white, blue red \n", - "3 Darth Vader 202.0 136.0 none white yellow \n", - "9 Obi-Wan Kenobi 182.0 77.0 auburn, white fair blue-gray \n", - "10 Anakin Skywalker 188.0 84.0 blond fair blue \n", - "11 Wilhuff Tarkin 180.0 NaN auburn, grey fair blue \n", - "12 Chewbacca 228.0 112.0 brown unknown blue \n", + " name height mass hair_color skin_color eye_color \\\n", + "0 Luke Skywalker 172.0 77.0 blond fair blue \n", + "1 C-3PO 167.0 75.0 NaN gold yellow \n", + "2 R2-D2 96.0 32.0 NaN white, blue red \n", + "3 Darth Vader 202.0 136.0 none white yellow \n", + "4 Obi-Wan Kenobi 182.0 77.0 auburn, white fair blue-gray \n", + "5 Anakin Skywalker 188.0 84.0 blond fair blue \n", + "6 Wilhuff Tarkin 180.0 NaN auburn, grey fair blue \n", + "7 Chewbacca 228.0 112.0 brown unknown blue \n", "\n", - " birth_year sex gender homeworld species \n", - "0 19.0 male masculine Tatooine Human \n", - "1 112.0 none masculine Tatooine Droid \n", - "2 33.0 none masculine Naboo Droid \n", - "3 41.9 male masculine Tatooine Human \n", - "9 57.0 male masculine Stewjon Human \n", - "10 41.9 male masculine Tatooine Human \n", - "11 64.0 male masculine Eriadu Human \n", - "12 200.0 male masculine Kashyyyk Wookiee " + " birth_year sex gender homeworld species \n", + "0 19.0 male masculine Tatooine Human \n", + "1 112.0 none masculine Tatooine Droid \n", + "2 33.0 none masculine Naboo Droid \n", + "3 41.9 male masculine Tatooine Human \n", + "4 57.0 male masculine Stewjon Human \n", + "5 41.9 male masculine Tatooine Human \n", + "6 64.0 male masculine Eriadu Human \n", + "7 200.0 male masculine Kashyyyk Wookiee " ] }, "execution_count": 2, @@ -207,7 +237,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "palestinian-qualification", + 
"id": "lesbian-ireland", "metadata": {}, "outputs": [ { @@ -242,7 +272,7 @@ " 2\n", " \n", " \n", - " 0\n", + " 1\n", " 3\n", " 4\n", " \n", @@ -253,7 +283,7 @@ "text/plain": [ " a b\n", "0 1 2\n", - "0 3 4" + "1 3 4" ] }, "execution_count": 3, @@ -268,7 +298,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "adjacent-platform", + "id": "ceramic-numbers", "metadata": {}, "outputs": [ { @@ -303,17 +333,17 @@ " 2\n", " \n", " \n", - " 0\n", + " 1\n", " 3\n", " 5\n", " \n", " \n", - " 1\n", + " 2\n", " 4\n", " 6\n", " \n", " \n", - " 0\n", + " 3\n", " 7\n", " 8\n", " \n", @@ -324,9 +354,9 @@ "text/plain": [ " a b\n", "0 1 2\n", - "0 3 5\n", - "1 4 6\n", - "0 7 8" + "1 3 5\n", + "2 4 6\n", + "3 7 8" ] }, "execution_count": 4, @@ -344,7 +374,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "ruled-wright", + "id": "designing-accounting", "metadata": {}, "outputs": [ { @@ -389,22 +419,22 @@ " NaN\n", " \n", " \n", - " 0\n", + " 3\n", " NaN\n", " 0.0\n", " \n", " \n", - " 1\n", + " 4\n", " NaN\n", " 1.0\n", " \n", " \n", - " 2\n", + " 5\n", " NaN\n", " 2.0\n", " \n", " \n", - " 3\n", + " 6\n", " NaN\n", " 3.0\n", " \n", @@ -417,10 +447,10 @@ "0 0.0 NaN\n", "1 1.0 NaN\n", "2 2.0 NaN\n", - "0 NaN 0.0\n", - "1 NaN 1.0\n", - "2 NaN 2.0\n", - "3 NaN 3.0" + "3 NaN 0.0\n", + "4 NaN 1.0\n", + "5 NaN 2.0\n", + "6 NaN 3.0" ] }, "execution_count": 5, @@ -437,7 +467,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "younger-swift", + "id": "pacific-gazette", "metadata": {}, "outputs": [ { @@ -506,9 +536,38 @@ { "cell_type": "code", "execution_count": 7, - "id": "derived-authentication", + "id": "apart-fault", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-01 17:42:29][datar][WARNING] New names:\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'name' -> 'name__0'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'height' -> 'height__1'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'mass' -> 'mass__2'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'hair_color' -> 'hair_color__3'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'skin_color' -> 'skin_color__4'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'eye_color' -> 'eye_color__5'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'birth_year' -> 'birth_year__6'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'sex' -> 'sex__7'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'gender' -> 'gender__8'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'homeworld' -> 'homeworld__9'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'species' -> 'species__10'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'name' -> 'name__11'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'height' -> 'height__12'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'mass' -> 'mass__13'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'hair_color' -> 'hair_color__14'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'skin_color' -> 'skin_color__15'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'eye_color' -> 'eye_color__16'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'birth_year' -> 'birth_year__17'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'sex' -> 'sex__18'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'gender' -> 'gender__19'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'homeworld' -> 'homeworld__20'\n", + "[2021-04-01 17:42:29][datar][WARNING] * 'species' -> 'species__21'\n" + ] + }, { "data": { "text/html": [ @@ -530,27 +589,27 @@ " \n", " \n", " \n", - " name\n", - " height\n", - " mass\n", - " hair_color\n", - " 
skin_color\n", - " eye_color\n", - " birth_year\n", - " sex\n", - " gender\n", - " homeworld\n", + " name__0\n", + " height__1\n", + " mass__2\n", + " hair_color__3\n", + " skin_color__4\n", + " eye_color__5\n", + " birth_year__6\n", + " sex__7\n", + " gender__8\n", + " homeworld__9\n", " ...\n", - " height\n", - " mass\n", - " hair_color\n", - " skin_color\n", - " eye_color\n", - " birth_year\n", - " sex\n", - " gender\n", - " homeworld\n", - " species\n", + " height__12\n", + " mass__13\n", + " hair_color__14\n", + " skin_color__15\n", + " eye_color__16\n", + " birth_year__17\n", + " sex__18\n", + " gender__19\n", + " homeworld__20\n", + " species__21\n", " \n", " \n", " \n", @@ -752,35 +811,45 @@ "" ], "text/plain": [ - " name height mass hair_color skin_color eye_color \\\n", - "0 Luke Skywalker 172.0 77.0 blond fair blue \n", - "1 C-3PO 167.0 75.0 NaN gold yellow \n", - "2 R2-D2 96.0 32.0 NaN white, blue red \n", - "3 Darth Vader 202.0 136.0 none white yellow \n", - "9 NaN NaN NaN NaN NaN NaN \n", - "10 NaN NaN NaN NaN NaN NaN \n", - "11 NaN NaN NaN NaN NaN NaN \n", - "12 NaN NaN NaN NaN NaN NaN \n", + " name__0 height__1 mass__2 hair_color__3 skin_color__4 \\\n", + "0 Luke Skywalker 172.0 77.0 blond fair \n", + "1 C-3PO 167.0 75.0 NaN gold \n", + "2 R2-D2 96.0 32.0 NaN white, blue \n", + "3 Darth Vader 202.0 136.0 none white \n", + "9 NaN NaN NaN NaN NaN \n", + "10 NaN NaN NaN NaN NaN \n", + "11 NaN NaN NaN NaN NaN \n", + "12 NaN NaN NaN NaN NaN \n", + "\n", + " eye_color__5 birth_year__6 sex__7 gender__8 homeworld__9 ... height__12 \\\n", + "0 blue 19.0 male masculine Tatooine ... NaN \n", + "1 yellow 112.0 none masculine Tatooine ... NaN \n", + "2 red 33.0 none masculine Naboo ... NaN \n", + "3 yellow 41.9 male masculine Tatooine ... NaN \n", + "9 NaN NaN NaN NaN NaN ... 182.0 \n", + "10 NaN NaN NaN NaN NaN ... 188.0 \n", + "11 NaN NaN NaN NaN NaN ... 180.0 \n", + "12 NaN NaN NaN NaN NaN ... 228.0 \n", "\n", - " birth_year sex gender homeworld ... height mass hair_color \\\n", - "0 19.0 male masculine Tatooine ... NaN NaN NaN \n", - "1 112.0 none masculine Tatooine ... NaN NaN NaN \n", - "2 33.0 none masculine Naboo ... NaN NaN NaN \n", - "3 41.9 male masculine Tatooine ... NaN NaN NaN \n", - "9 NaN NaN NaN NaN ... 182.0 77.0 auburn, white \n", - "10 NaN NaN NaN NaN ... 188.0 84.0 blond \n", - "11 NaN NaN NaN NaN ... 180.0 NaN auburn, grey \n", - "12 NaN NaN NaN NaN ... 
228.0 112.0 brown \n", + " mass__13 hair_color__14 skin_color__15 eye_color__16 birth_year__17 \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "9 77.0 auburn, white fair blue-gray 57.0 \n", + "10 84.0 blond fair blue 41.9 \n", + "11 NaN auburn, grey fair blue 64.0 \n", + "12 112.0 brown unknown blue 200.0 \n", "\n", - " skin_color eye_color birth_year sex gender homeworld species \n", - "0 NaN NaN NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN NaN NaN NaN \n", - "9 fair blue-gray 57.0 male masculine Stewjon Human \n", - "10 fair blue 41.9 male masculine Tatooine Human \n", - "11 fair blue 64.0 male masculine Eriadu Human \n", - "12 unknown blue 200.0 male masculine Kashyyyk Wookiee \n", + " sex__18 gender__19 homeworld__20 species__21 \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "9 male masculine Stewjon Human \n", + "10 male masculine Tatooine Human \n", + "11 male masculine Eriadu Human \n", + "12 male masculine Kashyyyk Wookiee \n", "\n", "[8 rows x 22 columns]" ] diff --git a/examples/case_when.ipynb b/docs/notebooks/case_when.ipynb similarity index 94% rename from examples/case_when.ipynb rename to docs/notebooks/case_when.ipynb index 1fec64b4..b213f95f 100644 --- a/examples/case_when.ipynb +++ b/docs/notebooks/case_when.ipynb @@ -2,20 +2,41 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "modified-queensland", + "execution_count": 11, + "id": "prompt-prompt", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vectorise multiple if_else() statements.\n", + "\n", + " Args:\n", + " _data: The data frame.\n", + " *when_cases: A even-size sequence, with 2n-th element values to match,\n", + " and 2(n+1)-th element the values to replace.\n", + " When matching value is True, then next value will be default to\n", + " replace\n", + "\n", + " Returns:\n", + " A series with values replaced\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/case_when.html\n", "from datar.datasets import starwars \n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(case_when.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "present-incidence", + "id": "round-vehicle", "metadata": {}, "outputs": [ { @@ -92,7 +113,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "korean-municipality", + "id": "arctic-compensation", "metadata": {}, "outputs": [ { @@ -168,7 +189,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "arbitrary-citizen", + "id": "satisfactory-depression", "metadata": {}, "outputs": [ { @@ -243,7 +264,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "emotional-administrator", + "id": "mental-wallace", "metadata": {}, "outputs": [ { @@ -322,7 +343,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "responsible-briefs", + "id": "announced-cricket", "metadata": {}, "outputs": [ { @@ -398,7 +419,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "patent-corporation", + "id": "funded-circumstances", "metadata": {}, "outputs": [ { @@ -474,7 +495,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "toxic-means", + "id": "applicable-jordan", "metadata": {}, "outputs": [ { @@ -516,7 +537,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "suburban-caribbean", + "id": "homeless-fellow", 
"metadata": {}, "outputs": [ { @@ -690,7 +711,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "opposed-tract", + "id": "wrapped-apple", "metadata": {}, "outputs": [ { @@ -728,7 +749,7 @@ { "cell_type": "code", "execution_count": null, - "id": "marine-seven", + "id": "improving-visit", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/coalesce.ipynb b/docs/notebooks/coalesce.ipynb similarity index 71% rename from examples/coalesce.ipynb rename to docs/notebooks/coalesce.ipynb index 41cf4914..3e4f96de 100644 --- a/examples/coalesce.ipynb +++ b/docs/notebooks/coalesce.ipynb @@ -3,19 +3,40 @@ { "cell_type": "code", "execution_count": 1, - "id": "extreme-excitement", + "id": "geological-validation", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Replace missing values\n", + "\n", + " https://dplyr.tidyverse.org/reference/coalesce.html\n", + "\n", + " Args:\n", + " x: The vector to replace\n", + " replace: The replacement\n", + "\n", + " Returns:\n", + " A vector the same length as the first argument with missing values\n", + " replaced by the first non-missing value.\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/coalesce.html\n", "from datar.datasets import starwars \n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(coalesce.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "global-insight", + "id": "front-municipality", "metadata": {}, "outputs": [ { @@ -45,7 +66,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "crude-islam", + "id": "floating-highlight", "metadata": {}, "outputs": [ { diff --git a/examples/context.ipynb b/docs/notebooks/context.ipynb similarity index 53% rename from examples/context.ipynb rename to docs/notebooks/context.ipynb index 48a50ff3..e2e7f7ac 100644 --- a/examples/context.ipynb +++ b/docs/notebooks/context.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "aquatic-multiple", + "id": "stone-squad", "metadata": {}, "outputs": [], "source": [ @@ -14,66 +14,20 @@ { "cell_type": "code", "execution_count": 2, - "id": "electoral-numbers", + "id": "amber-wallet", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:30:12][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-01 17:46:21][datar][ INFO] `summarise()` regrouping output by ['g']. You can override using the `.groups` argument.\n" ] }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gn
0a1
1b2
2c3
\n", - "
" - ], "text/plain": [ - " g n\n", - "0 a 1\n", - "1 b 2\n", - "2 c 3" + "" ] }, "execution_count": 2, @@ -95,14 +49,14 @@ { "cell_type": "code", "execution_count": 3, - "id": "smaller-edmonton", + "id": "meaningful-target", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:30:12][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (3)\n" + "[2021-04-01 17:46:21][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (3)\n" ] }, { @@ -135,38 +89,38 @@ " \n", " 0\n", " a\n", - " 0.888835\n", - " 0.614335\n", + " 0.725713\n", + " 0.968646\n", " \n", " \n", " 1\n", " b\n", - " 0.983451\n", - " 0.170819\n", + " 0.090224\n", + " 0.387580\n", " \n", " \n", " 2\n", " b\n", - " 0.896694\n", - " 0.327994\n", + " 0.578032\n", + " 0.826910\n", " \n", " \n", " 3\n", " c\n", - " 0.074433\n", - " 0.917238\n", + " 0.196640\n", + " 0.496450\n", " \n", " \n", " 4\n", " c\n", - " 0.807236\n", - " 0.935654\n", + " 0.426203\n", + " 0.006330\n", " \n", " \n", " 5\n", " c\n", - " 0.165058\n", - " 0.218777\n", + " 0.140672\n", + " 0.208947\n", " \n", " \n", "\n", @@ -174,12 +128,12 @@ ], "text/plain": [ " g x y\n", - "0 a 0.888835 0.614335\n", - "1 b 0.983451 0.170819\n", - "2 b 0.896694 0.327994\n", - "3 c 0.074433 0.917238\n", - "4 c 0.807236 0.935654\n", - "5 c 0.165058 0.218777" + "0 a 0.725713 0.968646\n", + "1 b 0.090224 0.387580\n", + "2 b 0.578032 0.826910\n", + "3 c 0.196640 0.496450\n", + "4 c 0.426203 0.006330\n", + "5 c 0.140672 0.208947" ] }, "execution_count": 3, @@ -188,15 +142,22 @@ } ], "source": [ - "gf >> showme()" + "gf >> display()" ] }, { "cell_type": "code", "execution_count": 4, - "id": "adaptive-ambassador", + "id": "desirable-staff", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-01 17:46:21][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (3)\n" + ] + }, { "data": { "text/html": [ @@ -228,43 +189,43 @@ " \n", " 0\n", " a\n", - " 0.888835\n", - " 0.614335\n", + " 0.725713\n", + " 0.968646\n", " 0\n", " \n", " \n", " 1\n", " b\n", - " 0.983451\n", - " 0.170819\n", + " 0.090224\n", + " 0.387580\n", " 1\n", " \n", " \n", " 2\n", " b\n", - " 0.896694\n", - " 0.327994\n", + " 0.578032\n", + " 0.826910\n", " 1\n", " \n", " \n", " 3\n", " c\n", - " 0.074433\n", - " 0.917238\n", + " 0.196640\n", + " 0.496450\n", " 2\n", " \n", " \n", " 4\n", " c\n", - " 0.807236\n", - " 0.935654\n", + " 0.426203\n", + " 0.006330\n", " 2\n", " \n", " \n", " 5\n", " c\n", - " 0.165058\n", - " 0.218777\n", + " 0.140672\n", + " 0.208947\n", " 2\n", " \n", " \n", @@ -273,12 +234,12 @@ ], "text/plain": [ " g x y id\n", - "0 a 0.888835 0.614335 0\n", - "1 b 0.983451 0.170819 1\n", - "2 b 0.896694 0.327994 1\n", - "3 c 0.074433 0.917238 2\n", - "4 c 0.807236 0.935654 2\n", - "5 c 0.165058 0.218777 2" + "0 a 0.725713 0.968646 0\n", + "1 b 0.090224 0.387580 1\n", + "2 b 0.578032 0.826910 1\n", + "3 c 0.196640 0.496450 2\n", + "4 c 0.426203 0.006330 2\n", + "5 c 0.140672 0.208947 2" ] }, "execution_count": 4, @@ -287,21 +248,21 @@ } ], "source": [ - "gf >> mutate(id=cur_group_id()) >> showme()" + "gf >> mutate(id=cur_group_id()) >> display()" ] }, { "cell_type": "code", "execution_count": 5, - "id": "integrated-qualification", + "id": "modular-beads", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:30:13][datar][ INFO] `summarise()` regrouping output by ['g']. 
You can override using the `.groups` argument.\n", - "[2021-03-13 00:30:13][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (3)\n" + "[2021-04-01 17:46:21][datar][ INFO] `summarise()` regrouping output by ['g']. You can override using the `.groups` argument.\n", + "[2021-04-01 17:46:21][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (3)\n" ] }, { @@ -380,90 +341,26 @@ } ], "source": [ - "gf >> summarise(row=cur_group_rows()) >> showme()" + "gf >> summarise(row=cur_group_rows()) >> display()" ] }, { "cell_type": "code", "execution_count": 6, - "id": "sharing-digest", + "id": "touched-bracket", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:30:13][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-01 17:46:21][datar][ INFO] `summarise()` regrouping output by ['g']. You can override using the `.groups` argument.\n" ] }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gdata
0ag\n", - "3 a\n", - "4 \n", - "5
1bg\n", - "3 b\n", - "4 b\n", - "5
2cg\n", - "3 c\n", - "4 c\n", - "5 c
\n", - "
" - ], "text/plain": [ - " g data\n", - "0 a g\n", - "3 a\n", - "4 \n", - "5 \n", - "1 b g\n", - "3 b\n", - "4 b\n", - "5 \n", - "2 c g\n", - "3 c\n", - "4 c\n", - "5 c" + "" ] }, "execution_count": 6, @@ -479,24 +376,18 @@ { "cell_type": "code", "execution_count": 7, - "id": "difficult-pillow", + "id": "sustained-buffalo", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[ g\n", - " 3 a\n", - " 4 \n", - " 5 ,\n", + " 0 a,\n", " g\n", - " 3 b\n", - " 4 b\n", - " 5 ,\n", + " 0 b,\n", " g\n", - " 3 c\n", - " 4 c\n", - " 5 c]" + " 0 c]" ] }, "execution_count": 7, @@ -511,78 +402,20 @@ { "cell_type": "code", "execution_count": 8, - "id": "under-pierre", + "id": "spiritual-capacity", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:30:13][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-01 17:46:21][datar][ INFO] `summarise()` regrouping output by ['g']. You can override using the `.groups` argument.\n" ] }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gdata
0ax y\n", - "3 0.888835 0.614335\n", - "4 ...
1bx y\n", - "3 0.983451 0.170819\n", - "4 ...
2cx y\n", - "3 0.074433 0.917238\n", - "4 ...
\n", - "
" - ], "text/plain": [ - " g data\n", - "0 a x y\n", - "3 0.888835 0.614335\n", - "4 ...\n", - "1 b x y\n", - "3 0.983451 0.170819\n", - "4 ...\n", - "2 c x y\n", - "3 0.074433 0.917238\n", - "4 ..." + "" ] }, "execution_count": 8, @@ -598,24 +431,21 @@ { "cell_type": "code", "execution_count": 9, - "id": "dependent-merit", + "id": "undefined-skating", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[ x y\n", - " 3 0.888835 0.614335\n", - " 4 \n", - " 5 ,\n", - " x y\n", - " 3 0.983451 0.170819\n", - " 4 0.896694 0.327994\n", - " 5 ,\n", - " x y\n", - " 3 0.074433 0.917238\n", - " 4 0.807236 0.935654\n", - " 5 0.165058 0.218777]" + "[ g x y\n", + " 0 a 0.725713 0.968646,\n", + " g x y\n", + " 0 b 0.090224 0.38758\n", + " 1 b 0.578032 0.82691,\n", + " g x y\n", + " 0 c 0.196640 0.496450\n", + " 1 c 0.426203 0.006330\n", + " 2 c 0.140672 0.208947]" ] }, "execution_count": 9, @@ -630,72 +460,20 @@ { "cell_type": "code", "execution_count": 10, - "id": "voluntary-right", + "id": "southeast-change", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:30:13][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-01 17:46:22][datar][ INFO] `summarise()` regrouping output by ['g']. You can override using the `.groups` argument.\n" ] }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gdata
0ag x y\n", - "3 a 0.888835 0.614...
1bg x y\n", - "3 b 0.983451 0.170...
2cg x y\n", - "3 c 0.074433 0.917...
\n", - "
" - ], "text/plain": [ - " g data\n", - "0 a g x y\n", - "3 a 0.888835 0.614...\n", - "1 b g x y\n", - "3 b 0.983451 0.170...\n", - "2 c g x y\n", - "3 c 0.074433 0.917..." + "" ] }, "execution_count": 10, @@ -711,24 +489,21 @@ { "cell_type": "code", "execution_count": 11, - "id": "resistant-settlement", + "id": "narrow-partnership", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[ g x y\n", - " 3 a 0.888835 0.614335\n", - " 4 \n", - " 5 ,\n", - " g x y\n", - " 3 b 0.983451 0.170819\n", - " 4 b 0.896694 0.327994\n", - " 5 ,\n", + " 0 a 0.725713 0.968646,\n", + " g x y\n", + " 0 b 0.090224 0.38758\n", + " 1 b 0.578032 0.82691,\n", " g x y\n", - " 3 c 0.074433 0.917238\n", - " 4 c 0.807236 0.935654\n", - " 5 c 0.165058 0.218777]" + " 0 c 0.196640 0.496450\n", + " 1 c 0.426203 0.006330\n", + " 2 c 0.140672 0.208947]" ] }, "execution_count": 11, @@ -743,7 +518,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "western-connection", + "id": "inappropriate-opportunity", "metadata": {}, "outputs": [ { @@ -774,46 +549,46 @@ " \n", " \n", " 0\n", - " x 0.7900279243540883\n", - " y 0.37740734135777476\n", + " x 0.5266599179693725\n", + " y 0.9382743565552832\n", " \n", " \n", " 1\n", - " x 0.9671756699413814\n", - " y 0.029179194753120932\n", + " x 0.008140397891205747\n", + " y 0.15021848176712474\n", " \n", " \n", " 2\n", - " x 0.8040607874933128\n", - " y 0.10758002114779909\n", + " x 0.334121386085333\n", + " y 0.6837802892580668\n", " \n", " \n", " 3\n", - " x 0.0055403010103483846\n", - " y 0.8413261815245825\n", + " x 0.03866728389822765\n", + " y 0.2464626545548167\n", " \n", " \n", " 4\n", - " x 0.6516299325937484\n", - " y 0.8754475349043658\n", + " x 0.18164934280117156\n", + " y 4.0068736482374296e-05\n", " \n", " \n", " 5\n", - " x 0.027244204041674373\n", - " y 0.047863235194903306\n", + " x 0.01978861830097462\n", + " y 0.043658695735944095\n", " \n", " \n", "\n", "" ], "text/plain": [ - " x y\n", - "0 x 0.7900279243540883 y 0.37740734135777476\n", - "1 x 0.9671756699413814 y 0.029179194753120932\n", - "2 x 0.8040607874933128 y 0.10758002114779909\n", - "3 x 0.0055403010103483846 y 0.8413261815245825\n", - "4 x 0.6516299325937484 y 0.8754475349043658\n", - "5 x 0.027244204041674373 y 0.047863235194903306" + " x y\n", + "0 x 0.5266599179693725 y 0.9382743565552832\n", + "1 x 0.008140397891205747 y 0.15021848176712474\n", + "2 x 0.334121386085333 y 0.6837802892580668\n", + "3 x 0.03866728389822765 y 0.2464626545548167\n", + "4 x 0.18164934280117156 y 4.0068736482374296e-05\n", + "5 x 0.01978861830097462 y 0.043658695735944095" ] }, "execution_count": 12, @@ -833,7 +608,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "major-nightmare", + "id": "lovely-strength", "metadata": {}, "outputs": [ { @@ -866,51 +641,51 @@ " \n", " 0\n", " a\n", - " x 0.7900279243540883\n", - " y 0.37740734135777476\n", + " x 0.5266599179693725\n", + " y 0.9382743565552832\n", " \n", " \n", " 1\n", " b\n", - " x 0.9671756699413814\n", - " y 0.029179194753120932\n", + " x 0.008140397891205747\n", + " y 0.15021848176712474\n", " \n", " \n", " 2\n", " b\n", - " x 0.8040607874933128\n", - " y 0.10758002114779909\n", + " x 0.334121386085333\n", + " y 0.6837802892580668\n", " \n", " \n", " 3\n", " c\n", - " x 0.0055403010103483846\n", - " y 0.8413261815245825\n", + " x 0.03866728389822765\n", + " y 0.2464626545548167\n", " \n", " \n", " 4\n", " c\n", - " x 0.6516299325937484\n", - " y 0.8754475349043658\n", + " x 0.18164934280117156\n", + " y 4.0068736482374296e-05\n", " \n", " \n", " 
5\n", " c\n", - " x 0.027244204041674373\n", - " y 0.047863235194903306\n", + " x 0.01978861830097462\n", + " y 0.043658695735944095\n", " \n", " \n", "\n", "" ], "text/plain": [ - " g x y\n", - "0 a x 0.7900279243540883 y 0.37740734135777476\n", - "1 b x 0.9671756699413814 y 0.029179194753120932\n", - "2 b x 0.8040607874933128 y 0.10758002114779909\n", - "3 c x 0.0055403010103483846 y 0.8413261815245825\n", - "4 c x 0.6516299325937484 y 0.8754475349043658\n", - "5 c x 0.027244204041674373 y 0.047863235194903306" + " g x y\n", + "0 a x 0.5266599179693725 y 0.9382743565552832\n", + "1 b x 0.008140397891205747 y 0.15021848176712474\n", + "2 b x 0.334121386085333 y 0.6837802892580668\n", + "3 c x 0.03866728389822765 y 0.2464626545548167\n", + "4 c x 0.18164934280117156 y 4.0068736482374296e-05\n", + "5 c x 0.01978861830097462 y 0.043658695735944095" ] }, "execution_count": 13, @@ -929,7 +704,7 @@ { "cell_type": "code", "execution_count": null, - "id": "effective-result", + "id": "statewide-massage", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/count.ipynb b/docs/notebooks/count.ipynb similarity index 96% rename from examples/count.ipynb rename to docs/notebooks/count.ipynb index 9d84b838..7f69a5ba 100644 --- a/examples/count.ipynb +++ b/docs/notebooks/count.ipynb @@ -3,20 +3,45 @@ { "cell_type": "code", "execution_count": 1, - "id": "forbidden-shannon", + "id": "rapid-reference", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Count observations by group\n", + "\n", + " See: https://dplyr.tidyverse.org/reference/count.html\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " *columns, **mutates: Variables to group by\n", + " wt: Frequency weights. Can be None or a variable:\n", + " If None (the default), counts the number of rows in each group.\n", + " If a variable, computes sum(wt) for each group.\n", + " sort: If TRUE, will show the largest groups at the top.\n", + " name: The name of the new column in the output.\n", + "\n", + " Returns:\n", + " DataFrame object with the count column\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/count.html\n", "from pandas import DataFrame\n", "from datar.datasets import starwars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(count.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "higher-america", + "id": "minor-processor", "metadata": {}, "outputs": [ { @@ -293,7 +318,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "rough-evans", + "id": "wrong-yahoo", "metadata": {}, "outputs": [ { @@ -570,7 +595,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "annual-penny", + "id": "accessory-cornell", "metadata": {}, "outputs": [ { @@ -662,7 +687,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "amended-singer", + "id": "roman-sector", "metadata": {}, "outputs": [ { @@ -801,7 +826,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "mysterious-adolescent", + "id": "split-basket", "metadata": {}, "outputs": [ { @@ -869,7 +894,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "reported-magnitude", + "id": "natural-thermal", "metadata": {}, "outputs": [ { @@ -930,7 +955,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "abandoned-chair", + "id": "innovative-rabbit", "metadata": {}, "outputs": [ { @@ -983,7 +1008,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "intellectual-musical", + "id": "romance-penalty", "metadata": {}, "outputs": 
[ { @@ -1254,13 +1279,13 @@ } ], "source": [ - "starwars >> group_by(f.species) >> tally()" + "starwars >> group_by(f.species) >> tally() >> display()" ] }, { "cell_type": "code", "execution_count": 10, - "id": "athletic-illinois", + "id": "emotional-illness", "metadata": {}, "outputs": [ { @@ -1335,7 +1360,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "choice-heavy", + "id": "raised-medline", "metadata": {}, "outputs": [ { @@ -1406,7 +1431,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "skilled-respondent", + "id": "deadly-treat", "metadata": {}, "outputs": [ { diff --git a/examples/cumall.ipynb b/docs/notebooks/cumall.ipynb similarity index 93% rename from examples/cumall.ipynb rename to docs/notebooks/cumall.ipynb index b4cae446..b6f6ef65 100644 --- a/examples/cumall.ipynb +++ b/docs/notebooks/cumall.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "apart-attraction", + "id": "technological-syntax", "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "outer-trauma", + "id": "political-principle", "metadata": {}, "outputs": [ { @@ -43,7 +43,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "caroline-internet", + "id": "interpreted-musician", "metadata": {}, "outputs": [ { @@ -69,7 +69,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "adjusted-transport", + "id": "wanted-reward", "metadata": {}, "outputs": [ { @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "failing-miami", + "id": "postal-motel", "metadata": {}, "outputs": [ { @@ -121,7 +121,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "religious-framing", + "id": "massive-apache", "metadata": {}, "outputs": [ { @@ -151,22 +151,22 @@ " \n", " \n", " \n", - " 3\n", + " 0\n", " 2020-01-04\n", " -25\n", " \n", " \n", - " 4\n", + " 1\n", " 2020-01-05\n", " -50\n", " \n", " \n", - " 5\n", + " 2\n", " 2020-01-06\n", " 30\n", " \n", " \n", - " 6\n", + " 3\n", " 2020-01-07\n", " 120\n", " \n", @@ -176,10 +176,10 @@ ], "text/plain": [ " date balance\n", - "3 2020-01-04 -25\n", - "4 2020-01-05 -50\n", - "5 2020-01-06 30\n", - "6 2020-01-07 120" + "0 2020-01-04 -25\n", + "1 2020-01-05 -50\n", + "2 2020-01-06 30\n", + "3 2020-01-07 120" ] }, "execution_count": 6, @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "alpha-marble", + "id": "light-nicaragua", "metadata": {}, "outputs": [ { diff --git a/docs/notebooks/desc.ipynb b/docs/notebooks/desc.ipynb new file mode 100644 index 00000000..9daa15a6 --- /dev/null +++ b/docs/notebooks/desc.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "imposed-afghanistan", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transform a vector into a format that will be sorted in descending order\n", + "\n", + " This is useful within arrange().\n", + "\n", + " The original API:\n", + " https://dplyr.tidyverse.org/reference/desc.html\n", + "\n", + " Args:\n", + " x: vector to transform\n", + "\n", + " Returns:\n", + " The descending order of x\n", + " \n" + ] + } + ], + "source": [ + "from datar.base import factor, letters\n", + "from datar.dplyr import desc\n", + "\n", + "print(desc.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "sealed-papua", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 -1\n", + "1 -2\n", + "2 -3\n", + "3 -4\n", + "4 -5\n", + "5 -6\n", + "6 -7\n", + "7 -8\n", + 
"8 -9\n", + "9 -10\n", + "dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "desc(range(1,11))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "swedish-divorce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 -0.0\n", + "1 -1.0\n", + "2 -2.0\n", + "3 -3.0\n", + "4 -4.0\n", + "5 -5.0\n", + "6 -6.0\n", + "7 -7.0\n", + "8 -8.0\n", + "9 -9.0\n", + "10 -10.0\n", + "11 -11.0\n", + "12 -12.0\n", + "13 -13.0\n", + "14 -14.0\n", + "15 -15.0\n", + "16 -16.0\n", + "17 -17.0\n", + "18 -18.0\n", + "19 -19.0\n", + "20 -20.0\n", + "21 -21.0\n", + "22 -22.0\n", + "23 -23.0\n", + "24 -24.0\n", + "25 -25.0\n", + "dtype: float64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "desc(factor(letters))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "limited-pricing", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/distinct.ipynb b/docs/notebooks/distinct.ipynb similarity index 84% rename from examples/distinct.ipynb rename to docs/notebooks/distinct.ipynb index 0c4e1e1c..b415b847 100644 --- a/examples/distinct.ipynb +++ b/docs/notebooks/distinct.ipynb @@ -3,19 +3,42 @@ { "cell_type": "code", "execution_count": 1, - "id": "dated-disaster", + "id": "pressed-fossil", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Select only unique/distinct rows from a data frame.\n", + "\n", + " The original API:\n", + " https://dplyr.tidyverse.org/reference/distinct.html\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " *columns, **mutates: Optional variables to use when determining\n", + " uniqueness.\n", + " _keep_all: If TRUE, keep all variables in _data\n", + "\n", + " Returns:\n", + " A dataframe without duplicated rows in _data\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/distinct.html\n", "from datar.datasets import starwars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(distinct.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "tough-consequence", + "id": "automotive-brother", "metadata": {}, "outputs": [ { @@ -40,7 +63,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "responsible-cooling", + "id": "expired-banner", "metadata": {}, "outputs": [ { @@ -61,7 +84,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "imported-little", + "id": "facial-motivation", "metadata": {}, "outputs": [ { @@ -82,7 +105,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "enormous-bubble", + "id": "governmental-serve", "metadata": {}, "outputs": [ { @@ -112,60 +135,60 @@ " \n", " \n", " 0\n", - " 8\n", + " 0\n", " \n", " \n", " 1\n", - " 5\n", + " 4\n", " \n", " \n", " 2\n", - " 1\n", - " \n", - " \n", - " 3\n", - " 9\n", + " 8\n", " \n", " \n", " 4\n", - " 0\n", - " \n", - " \n", - " 5\n", - " 4\n", + " 6\n", " \n", " \n", " 6\n", - " 7\n", + " 2\n", " \n", " \n", " 7\n", - " 2\n", + " 1\n", " \n", " \n", " 8\n", - " 
3\n", + " 9\n", " \n", " \n", " 9\n", - " 6\n", + " 3\n", + " \n", + " \n", + " 19\n", + " 5\n", + " \n", + " \n", + " 23\n", + " 7\n", " \n", " \n", "\n", "" ], "text/plain": [ - " x\n", - "0 8\n", - "1 5\n", - "2 1\n", - "3 9\n", - "4 0\n", - "5 4\n", - "6 7\n", - "7 2\n", - "8 3\n", - "9 6" + " x\n", + "0 0\n", + "1 4\n", + "2 8\n", + "4 6\n", + "6 2\n", + "7 1\n", + "8 9\n", + "9 3\n", + "19 5\n", + "23 7" ] }, "execution_count": 5, @@ -180,7 +203,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "fatty-breeding", + "id": "internal-jaguar", "metadata": {}, "outputs": [ { @@ -210,60 +233,60 @@ " \n", " \n", " 0\n", - " 1\n", + " 0\n", " \n", " \n", " 1\n", - " 3\n", + " 8\n", " \n", " \n", " 2\n", - " 5\n", + " 4\n", " \n", " \n", " 3\n", - " 0\n", + " 9\n", " \n", " \n", " 4\n", - " 4\n", + " 1\n", " \n", " \n", " 5\n", - " 2\n", - " \n", - " \n", - " 6\n", - " 9\n", + " 5\n", " \n", " \n", " 7\n", - " 8\n", + " 7\n", " \n", " \n", " 8\n", - " 7\n", + " 6\n", " \n", " \n", " 9\n", - " 6\n", + " 3\n", + " \n", + " \n", + " 19\n", + " 2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " y\n", - "0 1\n", - "1 3\n", - "2 5\n", - "3 0\n", - "4 4\n", - "5 2\n", - "6 9\n", - "7 8\n", - "8 7\n", - "9 6" + " y\n", + "0 0\n", + "1 8\n", + "2 4\n", + "3 9\n", + "4 1\n", + "5 5\n", + "7 7\n", + "8 6\n", + "9 3\n", + "19 2" ] }, "execution_count": 6, @@ -278,7 +301,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "aware-carroll", + "id": "handled-nowhere", "metadata": {}, "outputs": [ { @@ -309,70 +332,70 @@ " \n", " \n", " 0\n", - " 8\n", - " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 1\n", - " 5\n", - " 3\n", + " 4\n", + " 8\n", " \n", " \n", " 2\n", - " 1\n", - " 5\n", - " \n", - " \n", - " 3\n", - " 9\n", - " 2\n", + " 8\n", + " 4\n", " \n", " \n", " 4\n", - " 0\n", - " 3\n", - " \n", - " \n", - " 5\n", - " 4\n", - " 5\n", + " 6\n", + " 1\n", " \n", " \n", " 6\n", - " 7\n", " 2\n", + " 4\n", " \n", " \n", " 7\n", - " 2\n", + " 1\n", " 7\n", " \n", " \n", " 8\n", - " 3\n", + " 9\n", " 6\n", " \n", " \n", " 9\n", - " 6\n", - " 4\n", + " 3\n", + " 3\n", + " \n", + " \n", + " 19\n", + " 5\n", + " 2\n", + " \n", + " \n", + " 23\n", + " 7\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " x y\n", - "0 8 1\n", - "1 5 3\n", - "2 1 5\n", - "3 9 2\n", - "4 0 3\n", - "5 4 5\n", - "6 7 2\n", - "7 2 7\n", - "8 3 6\n", - "9 6 4" + " x y\n", + "0 0 0\n", + "1 4 8\n", + "2 8 4\n", + "4 6 1\n", + "6 2 4\n", + "7 1 7\n", + "8 9 6\n", + "9 3 3\n", + "19 5 2\n", + "23 7 1" ] }, "execution_count": 7, @@ -387,7 +410,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "excessive-plate", + "id": "developmental-stability", "metadata": {}, "outputs": [ { @@ -418,70 +441,70 @@ " \n", " \n", " 0\n", - " 8\n", - " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 1\n", - " 5\n", - " 3\n", + " 4\n", + " 8\n", " \n", " \n", " 2\n", - " 1\n", - " 5\n", + " 8\n", + " 4\n", " \n", " \n", " 3\n", - " 1\n", - " 0\n", + " 8\n", + " 9\n", " \n", " \n", " 4\n", - " 5\n", - " 4\n", + " 6\n", + " 1\n", " \n", " \n", " 5\n", - " 9\n", - " 2\n", - " \n", - " \n", - " 6\n", + " 8\n", " 5\n", - " 9\n", " \n", " \n", " 7\n", - " 8\n", - " 8\n", + " 1\n", + " 7\n", " \n", " \n", " 8\n", - " 2\n", - " 7\n", + " 9\n", + " 6\n", " \n", " \n", " 9\n", " 3\n", - " 6\n", + " 3\n", + " \n", + " \n", + " 19\n", + " 5\n", + " 2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " x y\n", - "0 8 1\n", - "1 5 3\n", - "2 1 5\n", - "3 1 0\n", - "4 5 4\n", - "5 9 2\n", - "6 5 9\n", - "7 8 8\n", - "8 2 7\n", - "9 3 6" + " x y\n", + "0 0 
0\n", + "1 4 8\n", + "2 8 4\n", + "3 8 9\n", + "4 6 1\n", + "5 8 5\n", + "7 1 7\n", + "8 9 6\n", + "9 3 3\n", + "19 5 2" ] }, "execution_count": 8, @@ -496,7 +519,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "stable-commercial", + "id": "macro-hygiene", "metadata": {}, "outputs": [ { @@ -526,14 +549,10 @@ " \n", " \n", " 0\n", - " 7\n", + " 0\n", " \n", " \n", " 1\n", - " 2\n", - " \n", - " \n", - " 2\n", " 4\n", " \n", " \n", @@ -542,44 +561,48 @@ " \n", " \n", " 4\n", - " 3\n", + " 5\n", " \n", " \n", " 5\n", - " 5\n", + " 3\n", " \n", " \n", " 6\n", - " 0\n", + " 2\n", " \n", " \n", " 7\n", - " 8\n", + " 6\n", " \n", " \n", - " 8\n", - " 9\n", + " 12\n", + " 7\n", " \n", " \n", - " 9\n", - " 6\n", + " 17\n", + " 8\n", + " \n", + " \n", + " 49\n", + " 9\n", " \n", " \n", "\n", "" ], "text/plain": [ - " diff\n", - "0 7\n", - "1 2\n", - "2 4\n", - "3 1\n", - "4 3\n", - "5 5\n", - "6 0\n", - "7 8\n", - "8 9\n", - "9 6" + " diff\n", + "0 0\n", + "1 4\n", + "3 1\n", + "4 5\n", + "5 3\n", + "6 2\n", + "7 6\n", + "12 7\n", + "17 8\n", + "49 9" ] }, "execution_count": 9, @@ -594,7 +617,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "prostate-version", + "id": "taken-behavior", "metadata": {}, "outputs": [ { @@ -661,31 +684,31 @@ " ...\n", " \n", " \n", - " 62\n", + " 79\n", " none\n", " pale\n", " white\n", " \n", " \n", - " 63\n", + " 81\n", " black\n", " dark\n", " dark\n", " \n", " \n", - " 64\n", + " 82\n", " brown\n", " light\n", " hazel\n", " \n", " \n", - " 65\n", + " 84\n", " none\n", " none\n", " black\n", " \n", " \n", - " 66\n", + " 85\n", " unknown\n", " unknown\n", " unknown\n", @@ -703,11 +726,11 @@ "3 none white yellow\n", "4 brown light brown\n", ".. ... ... ...\n", - "62 none pale white\n", - "63 black dark dark\n", - "64 brown light hazel\n", - "65 none none black\n", - "66 unknown unknown unknown\n", + "79 none pale white\n", + "81 black dark dark\n", + "82 brown light hazel\n", + "84 none none black\n", + "85 unknown unknown unknown\n", "\n", "[67 rows x 3 columns]" ] @@ -724,14 +747,14 @@ { "cell_type": "code", "execution_count": 12, - "id": "accepted-identification", + "id": "abstract-vegetable", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:31:48][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (2)\n" + "[2021-04-02 23:59:01][datar][ INFO] # [DataFrameGroupBy] Groups: ['g'] (2)\n" ] }, { @@ -797,7 +820,7 @@ " x=[1, 1, 2, 1]\n", ") >> group_by(f.g)\n", "\n", - "df >> distinct(f.x) >> showme()" + "df >> distinct(f.x) >> display()" ] } ], diff --git a/examples/drop_na.ipynb b/docs/notebooks/drop_na.ipynb similarity index 87% rename from examples/drop_na.ipynb rename to docs/notebooks/drop_na.ipynb index 5012de1a..ce0f7f14 100644 --- a/examples/drop_na.ipynb +++ b/docs/notebooks/drop_na.ipynb @@ -3,19 +3,38 @@ { "cell_type": "code", "execution_count": 1, - "id": "placed-federation", + "id": "active-massachusetts", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drop rows containing missing values\n", + "\n", + " See: https://tidyr.tidyverse.org/reference/drop_na.html\n", + "\n", + " Args:\n", + " data: A data frame.\n", + " *columns: Columns to inspect for missing values.\n", + "\n", + " Returns:\n", + " Dataframe with rows with NAs dropped\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/drop_na.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "print(drop_na.__doc__)" ] }, { 
"cell_type": "code", "execution_count": 2, - "id": "geographic-seminar", + "id": "affiliated-barcelona", "metadata": {}, "outputs": [ { @@ -71,7 +90,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "contrary-transaction", + "id": "aerial-colon", "metadata": {}, "outputs": [ { @@ -132,7 +151,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "ultimate-record", + "id": "stretch-indiana", "metadata": {}, "outputs": [ { diff --git a/examples/expand.ipynb b/docs/notebooks/expand.ipynb similarity index 90% rename from examples/expand.ipynb rename to docs/notebooks/expand.ipynb index 95aea220..eaa785d9 100644 --- a/examples/expand.ipynb +++ b/docs/notebooks/expand.ipynb @@ -3,19 +3,29 @@ { "cell_type": "code", "execution_count": 1, - "id": "subjective-bosnia", + "id": "indonesian-missouri", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "See: https://tidyr.tidyverse.org/reference/expand.html\n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/expand.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(expand.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "banner-modification", + "id": "global-cooling", "metadata": {}, "outputs": [ { @@ -51,42 +61,42 @@ " apple\n", " 2010\n", " XS\n", - " -2.453579\n", + " 0.181247\n", " \n", " \n", " 1\n", " orange\n", " 2010\n", " S\n", - " -0.645351\n", + " -0.429350\n", " \n", " \n", " 2\n", " apple\n", " 2012\n", " M\n", - " 1.380075\n", + " 0.370125\n", " \n", " \n", " 3\n", " orange\n", " 2010\n", " S\n", - " -0.383807\n", + " 0.618415\n", " \n", " \n", " 4\n", " orange\n", " 2010\n", " S\n", - " 1.320909\n", + " -0.861787\n", " \n", " \n", " 5\n", " orange\n", " 2012\n", " M\n", - " -0.689147\n", + " -0.001582\n", " \n", " \n", "\n", @@ -94,12 +104,12 @@ ], "text/plain": [ " type year size weights\n", - "0 apple 2010 XS -2.453579\n", - "1 orange 2010 S -0.645351\n", - "2 apple 2012 M 1.380075\n", - "3 orange 2010 S -0.383807\n", - "4 orange 2010 S 1.320909\n", - "5 orange 2012 M -0.689147" + "0 apple 2010 XS 0.181247\n", + "1 orange 2010 S -0.429350\n", + "2 apple 2012 M 0.370125\n", + "3 orange 2010 S 0.618415\n", + "4 orange 2010 S -0.861787\n", + "5 orange 2012 M -0.001582" ] }, "execution_count": 2, @@ -124,7 +134,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "natural-deviation", + "id": "scheduled-warrior", "metadata": {}, "outputs": [ { @@ -182,7 +192,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "attended-morocco", + "id": "determined-relief", "metadata": {}, "outputs": [ { @@ -279,7 +289,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "entitled-representative", + "id": "future-burning", "metadata": {}, "outputs": [ { @@ -316,91 +326,91 @@ " 2010\n", " \n", " \n", - " 1\n", + " 2\n", " apple\n", " XS\n", " 2012\n", " \n", " \n", - " 2\n", + " 6\n", " apple\n", " S\n", " 2010\n", " \n", " \n", - " 3\n", + " 8\n", " apple\n", " S\n", " 2012\n", " \n", " \n", - " 4\n", + " 12\n", " apple\n", " M\n", " 2010\n", " \n", " \n", - " 5\n", + " 14\n", " apple\n", " M\n", " 2012\n", " \n", " \n", - " 6\n", + " 18\n", " apple\n", " L\n", " 2010\n", " \n", " \n", - " 7\n", + " 20\n", " apple\n", " L\n", " 2012\n", " \n", " \n", - " 8\n", + " 24\n", " orange\n", " XS\n", " 2010\n", " \n", " \n", - " 9\n", + " 26\n", " orange\n", " XS\n", " 2012\n", " \n", " \n", - " 10\n", + " 30\n", " orange\n", " S\n", " 2010\n", " \n", " \n", - " 11\n", + " 32\n", " orange\n", " S\n", " 
2012\n", " \n", " \n", - " 12\n", + " 36\n", " orange\n", " M\n", " 2010\n", " \n", " \n", - " 13\n", + " 38\n", " orange\n", " M\n", " 2012\n", " \n", " \n", - " 14\n", + " 42\n", " orange\n", " L\n", " 2010\n", " \n", " \n", - " 15\n", + " 44\n", " orange\n", " L\n", " 2012\n", @@ -412,21 +422,21 @@ "text/plain": [ " type size year\n", "0 apple XS 2010\n", - "1 apple XS 2012\n", - "2 apple S 2010\n", - "3 apple S 2012\n", - "4 apple M 2010\n", - "5 apple M 2012\n", - "6 apple L 2010\n", - "7 apple L 2012\n", - "8 orange XS 2010\n", - "9 orange XS 2012\n", - "10 orange S 2010\n", - "11 orange S 2012\n", - "12 orange M 2010\n", - "13 orange M 2012\n", - "14 orange L 2010\n", - "15 orange L 2012" + "2 apple XS 2012\n", + "6 apple S 2010\n", + "8 apple S 2012\n", + "12 apple M 2010\n", + "14 apple M 2012\n", + "18 apple L 2010\n", + "20 apple L 2012\n", + "24 orange XS 2010\n", + "26 orange XS 2012\n", + "30 orange S 2010\n", + "32 orange S 2012\n", + "36 orange M 2010\n", + "38 orange M 2012\n", + "42 orange L 2010\n", + "44 orange L 2012" ] }, "execution_count": 5, @@ -441,7 +451,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "characteristic-annotation", + "id": "warming-tournament", "metadata": {}, "outputs": [ { @@ -499,7 +509,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "respiratory-noise", + "id": "republican-hardware", "metadata": {}, "outputs": [ { @@ -544,7 +554,7 @@ " M\n", " \n", " \n", - " 3\n", + " 5\n", " orange\n", " M\n", " \n", @@ -557,7 +567,7 @@ "0 apple XS\n", "1 orange S\n", "2 apple M\n", - "3 orange M" + "5 orange M" ] }, "execution_count": 7, @@ -572,7 +582,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "conscious-freedom", + "id": "norman-pathology", "metadata": {}, "outputs": [ { @@ -621,7 +631,7 @@ " 2012\n", " \n", " \n", - " 3\n", + " 5\n", " orange\n", " M\n", " 2012\n", @@ -635,7 +645,7 @@ "0 apple XS 2010\n", "1 orange S 2010\n", "2 apple M 2012\n", - "3 orange M 2012" + "5 orange M 2012" ] }, "execution_count": 8, @@ -650,7 +660,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "elegant-trustee", + "id": "social-alignment", "metadata": {}, "outputs": [ { @@ -687,91 +697,91 @@ " 2010\n", " \n", " \n", - " 1\n", + " 2\n", " apple\n", " XS\n", " 2012\n", " \n", " \n", - " 2\n", + " 6\n", " apple\n", " S\n", " 2010\n", " \n", " \n", - " 3\n", + " 8\n", " apple\n", " S\n", " 2012\n", " \n", " \n", - " 4\n", + " 12\n", " apple\n", " M\n", " 2010\n", " \n", " \n", - " 5\n", + " 14\n", " apple\n", " M\n", " 2012\n", " \n", " \n", - " 6\n", + " 18\n", " apple\n", " L\n", " 2010\n", " \n", " \n", - " 7\n", + " 20\n", " apple\n", " L\n", " 2012\n", " \n", " \n", - " 8\n", + " 24\n", " orange\n", " XS\n", " 2010\n", " \n", " \n", - " 9\n", + " 26\n", " orange\n", " XS\n", " 2012\n", " \n", " \n", - " 10\n", + " 30\n", " orange\n", " S\n", " 2010\n", " \n", " \n", - " 11\n", + " 32\n", " orange\n", " S\n", " 2012\n", " \n", " \n", - " 12\n", + " 36\n", " orange\n", " M\n", " 2010\n", " \n", " \n", - " 13\n", + " 38\n", " orange\n", " M\n", " 2012\n", " \n", " \n", - " 14\n", + " 42\n", " orange\n", " L\n", " 2010\n", " \n", " \n", - " 15\n", + " 44\n", " orange\n", " L\n", " 2012\n", @@ -783,21 +793,21 @@ "text/plain": [ " type size year\n", "0 apple XS 2010\n", - "1 apple XS 2012\n", - "2 apple S 2010\n", - "3 apple S 2012\n", - "4 apple M 2010\n", - "5 apple M 2012\n", - "6 apple L 2010\n", - "7 apple L 2012\n", - "8 orange XS 2010\n", - "9 orange XS 2012\n", - "10 orange S 2010\n", - "11 orange S 2012\n", - "12 orange M 
2010\n", - "13 orange M 2012\n", - "14 orange L 2010\n", - "15 orange L 2012" + "2 apple XS 2012\n", + "6 apple S 2010\n", + "8 apple S 2012\n", + "12 apple M 2010\n", + "14 apple M 2012\n", + "18 apple L 2010\n", + "20 apple L 2012\n", + "24 orange XS 2010\n", + "26 orange XS 2012\n", + "30 orange S 2010\n", + "32 orange S 2012\n", + "36 orange M 2010\n", + "38 orange M 2012\n", + "42 orange L 2010\n", + "44 orange L 2012" ] }, "execution_count": 9, @@ -811,15 +821,15 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "ancient-lewis", + "execution_count": 10, + "id": "manual-synthetic", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-15 12:19:27][datar][WARNING] Temporary name used. Use keyword argument to specify the key as column name.\n" + "[2021-04-03 00:26:41][datar][WARNING] Temporary name used. Use keyword argument to specify the key as column name.\n" ] }, { @@ -845,7 +855,7 @@ " \n", " type\n", " size\n", - " _tmp7f05_2\n", + " _tmp7f4d_2\n", " \n", " \n", " \n", @@ -998,7 +1008,7 @@ "" ], "text/plain": [ - " type size _tmp7f05_2\n", + " type size _tmp7f4d_2\n", "0 apple XS 2010\n", "1 apple XS 2011\n", "2 apple XS 2012\n", @@ -1025,7 +1035,7 @@ "23 orange L 2012" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1036,8 +1046,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "novel-chicago", + "execution_count": 11, + "id": "harmful-progressive", "metadata": {}, "outputs": [ { @@ -1243,7 +1253,7 @@ "23 orange L 2012" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1254,8 +1264,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "proud-woman", + "execution_count": 12, + "id": "referenced-exhaust", "metadata": {}, "outputs": [ { @@ -1292,91 +1302,91 @@ " 2010\n", " \n", " \n", - " 1\n", + " 2\n", " apple\n", " XS\n", " 2012\n", " \n", " \n", - " 2\n", + " 6\n", " apple\n", " S\n", " 2010\n", " \n", " \n", - " 3\n", + " 8\n", " apple\n", " S\n", " 2012\n", " \n", " \n", - " 4\n", + " 12\n", " apple\n", " M\n", " 2010\n", " \n", " \n", - " 5\n", + " 14\n", " apple\n", " M\n", " 2012\n", " \n", " \n", - " 6\n", + " 18\n", " apple\n", " L\n", " 2010\n", " \n", " \n", - " 7\n", + " 20\n", " apple\n", " L\n", " 2012\n", " \n", " \n", - " 8\n", + " 24\n", " orange\n", " XS\n", " 2010\n", " \n", " \n", - " 9\n", + " 26\n", " orange\n", " XS\n", " 2012\n", " \n", " \n", - " 10\n", + " 30\n", " orange\n", " S\n", " 2010\n", " \n", " \n", - " 11\n", + " 32\n", " orange\n", " S\n", " 2012\n", " \n", " \n", - " 12\n", + " 36\n", " orange\n", " M\n", " 2010\n", " \n", " \n", - " 13\n", + " 38\n", " orange\n", " M\n", " 2012\n", " \n", " \n", - " 14\n", + " 42\n", " orange\n", " L\n", " 2010\n", " \n", " \n", - " 15\n", + " 44\n", " orange\n", " L\n", " 2012\n", @@ -1388,24 +1398,24 @@ "text/plain": [ " type size year\n", "0 apple XS 2010\n", - "1 apple XS 2012\n", - "2 apple S 2010\n", - "3 apple S 2012\n", - "4 apple M 2010\n", - "5 apple M 2012\n", - "6 apple L 2010\n", - "7 apple L 2012\n", - "8 orange XS 2010\n", - "9 orange XS 2012\n", - "10 orange S 2010\n", - "11 orange S 2012\n", - "12 orange M 2010\n", - "13 orange M 2012\n", - "14 orange L 2010\n", - "15 orange L 2012" + "2 apple XS 2012\n", + "6 apple S 2010\n", + "8 apple S 2012\n", + "12 apple M 2010\n", + "14 apple M 2012\n", + "18 apple L 2010\n", + "20 apple L 2012\n", + "24 orange XS 2010\n", + "26 orange XS 2012\n", + "30 
orange S 2010\n", + "32 orange S 2012\n", + "36 orange M 2010\n", + "38 orange M 2012\n", + "42 orange L 2010\n", + "44 orange L 2012" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1417,8 +1427,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "respective-motorcycle", + "execution_count": 13, + "id": "little-insulation", "metadata": {}, "outputs": [ { @@ -1540,7 +1550,7 @@ "17 orange L 2012" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1551,8 +1561,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "surprising-filling", + "execution_count": 14, + "id": "regional-equilibrium", "metadata": {}, "outputs": [ { @@ -1588,7 +1598,7 @@ " apple\n", " 2010\n", " XS\n", - " -2.453579\n", + " 0.181247\n", " \n", " \n", " 1\n", @@ -1623,7 +1633,7 @@ " apple\n", " 2012\n", " M\n", - " 1.380075\n", + " 0.370125\n", " \n", " \n", " 6\n", @@ -1658,21 +1668,21 @@ " orange\n", " 2010\n", " S\n", - " -0.645351\n", + " -0.429350\n", " \n", " \n", " 11\n", " orange\n", " 2010\n", " S\n", - " -0.383807\n", + " 0.618415\n", " \n", " \n", " 12\n", " orange\n", " 2010\n", " S\n", - " 1.320909\n", + " -0.861787\n", " \n", " \n", " 13\n", @@ -1693,7 +1703,7 @@ " orange\n", " 2012\n", " M\n", - " -0.689147\n", + " -0.001582\n", " \n", " \n", " 16\n", @@ -1715,27 +1725,27 @@ ], "text/plain": [ " type year size weights\n", - "0 apple 2010 XS -2.453579\n", + "0 apple 2010 XS 0.181247\n", "1 apple 2012 XS NaN\n", "2 apple 2010 S NaN\n", "3 apple 2012 S NaN\n", "4 apple 2010 M NaN\n", - "5 apple 2012 M 1.380075\n", + "5 apple 2012 M 0.370125\n", "6 apple 2010 L NaN\n", "7 apple 2012 L NaN\n", "8 orange 2010 XS NaN\n", "9 orange 2012 XS NaN\n", - "10 orange 2010 S -0.645351\n", - "11 orange 2010 S -0.383807\n", - "12 orange 2010 S 1.320909\n", + "10 orange 2010 S -0.429350\n", + "11 orange 2010 S 0.618415\n", + "12 orange 2010 S -0.861787\n", "13 orange 2012 S NaN\n", "14 orange 2010 M NaN\n", - "15 orange 2012 M -0.689147\n", + "15 orange 2012 M -0.001582\n", "16 orange 2010 L NaN\n", "17 orange 2012 L NaN" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/expand_grid.ipynb b/docs/notebooks/expand_grid.ipynb similarity index 91% rename from examples/expand_grid.ipynb rename to docs/notebooks/expand_grid.ipynb index 33ef8611..6f67bfb4 100644 --- a/examples/expand_grid.ipynb +++ b/docs/notebooks/expand_grid.ipynb @@ -3,19 +3,41 @@ { "cell_type": "code", "execution_count": 1, - "id": "dated-hello", + "id": "proprietary-shape", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Expand elements into a new dataframe\n", + "\n", + " See: https://tidyr.tidyverse.org/reference/expand_grid.html\n", + "\n", + " Args:\n", + " _data, **kwargs: Name-value pairs. The name will become the column\n", + " name in the output.\n", + " For _data, will try to fetch name via `_data.__dfname__`. 
If failed\n", + " `_data` will be used.\n", + "\n", + " Returns:\n", + " The expanded dataframe\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/expand_grid.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(expand_grid.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "quiet-absolute", + "id": "german-holmes", "metadata": {}, "outputs": [ { @@ -100,7 +122,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "partial-hebrew", + "id": "linear-music", "metadata": {}, "outputs": [ { @@ -218,7 +240,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "moved-basic", + "id": "convertible-norman", "metadata": {}, "outputs": [ { @@ -310,7 +332,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "blind-aspect", + "id": "hourly-recorder", "metadata": {}, "outputs": [ { @@ -393,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dietary-plant", + "id": "under-novelty", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/extract.ipynb b/docs/notebooks/extract.ipynb similarity index 82% rename from examples/extract.ipynb rename to docs/notebooks/extract.ipynb index 957044b8..a31d5f02 100644 --- a/examples/extract.ipynb +++ b/docs/notebooks/extract.ipynb @@ -3,19 +3,48 @@ { "cell_type": "code", "execution_count": 1, - "id": "better-questionnaire", + "id": "mediterranean-amino", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Given a regular expression with capturing groups, extract() turns each\n", + " group into a new column. If the groups don't match, or the input is NA,\n", + " the output will be NA.\n", + "\n", + " See: https://tidyr.tidyverse.org/reference/extract.html\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " col: Column name or position.\n", + " into: Names of new variables to create as character vector.\n", + " Use None to omit the variable in the output.\n", + " regex: a regular expression used to extract the desired values.\n", + " There should be one group (defined by ()) for each element of into.\n", + " remove: If TRUE, remove input column from output data frame.\n", + " convert: The universal type for the extracted columns or a dict for\n", + " individual ones\n", + "\n", + " Returns:\n", + " Dataframe with extracted columns.\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/extract.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(extract.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "fifteen-break", + "id": "academic-command", "metadata": {}, "outputs": [ { @@ -89,7 +118,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "minute-remainder", + "id": "satisfactory-broad", "metadata": {}, "outputs": [ { @@ -168,7 +197,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "demonstrated-athens", + "id": "formed-liver", "metadata": {}, "outputs": [ { diff --git a/examples/fill.ipynb b/docs/notebooks/fill.ipynb similarity index 94% rename from examples/fill.ipynb rename to docs/notebooks/fill.ipynb index e51e42f7..a96a4a1e 100644 --- a/examples/fill.ipynb +++ b/docs/notebooks/fill.ipynb @@ -3,19 +3,43 @@ { "cell_type": "code", "execution_count": 1, - "id": "optimum-surgeon", + "id": "turkish-panic", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fills missing values in selected columns using the next or\n", + " 
previous entry.\n", + "\n", + " See: https://tidyr.tidyverse.org/reference/fill.html\n", + "\n", + " Args:\n", + " _data: A dataframe\n", + " *columns: Columns to fill\n", + " _direction: Direction in which to fill missing values.\n", + " Currently either \"down\" (the default), \"up\",\n", + " \"downup\" (i.e. first down and then up) or\n", + " \"updown\" (first up and then down).\n", + "\n", + " Returns:\n", + " The dataframe with NAs being replaced.\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/fill.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "print(fill.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "static-practitioner", + "id": "mathematical-rubber", "metadata": {}, "outputs": [ { @@ -196,7 +220,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "assumed-stereo", + "id": "eastern-balloon", "metadata": {}, "outputs": [ { @@ -358,7 +382,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "genuine-honolulu", + "id": "musical-export", "metadata": {}, "outputs": [ { @@ -506,15 +530,15 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "herbal-mistress", + "execution_count": 6, + "id": "dietary-shadow", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:33:03][datar][ INFO] # [DataFrameGroupBy] Groups: ['group'] (3)\n" + "[2021-04-03 00:28:43][datar][ INFO] # [DataFrameGroupBy] Groups: ['group'] (3)\n" ] }, { @@ -649,7 +673,7 @@ "11 3 Danielle Observer 9.0" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -674,13 +698,13 @@ "squirrels >> \\\n", " group_by(f.group) >> \\\n", " fill(f.n_squirrels, _direction = \"downup\") >> \\\n", - " showme()" + " display()" ] }, { "cell_type": "code", "execution_count": null, - "id": "appreciated-journalist", + "id": "composite-blame", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/filter-joins.ipynb b/docs/notebooks/filter-joins.ipynb similarity index 86% rename from examples/filter-joins.ipynb rename to docs/notebooks/filter-joins.ipynb index faac05a4..2fd4a25f 100644 --- a/examples/filter-joins.ipynb +++ b/docs/notebooks/filter-joins.ipynb @@ -3,20 +3,38 @@ { "cell_type": "code", "execution_count": 1, - "id": "upper-brake", + "id": "native-rabbit", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Returns all rows from x with a match in y.\n", + "\n", + " See inner_join()\n", + " \n", + "Returns all rows from x without a match in y.\n", + "\n", + " See inner_join()\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/filter-joins.html\n", "\n", "from datar.datasets import band_members, band_instruments\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(semi_join.__doc__)\n", + "print(anti_join.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "piano-emperor", + "id": "completed-letters", "metadata": {}, "outputs": [ { @@ -77,7 +95,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "active-burst", + "id": "sophisticated-composite", "metadata": {}, "outputs": [ { diff --git a/examples/full_seq.ipynb b/docs/notebooks/full_seq.ipynb similarity index 94% rename from examples/full_seq.ipynb rename to docs/notebooks/full_seq.ipynb index 72ae3bba..02bc25ce 100644 --- a/examples/full_seq.ipynb +++ b/docs/notebooks/full_seq.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - 
"id": "usual-monitoring", + "id": "greatest-trailer", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "taken-tamil", + "id": "molecular-antenna", "metadata": {}, "outputs": [ { diff --git a/examples/get.ipynb b/docs/notebooks/get.ipynb similarity index 89% rename from examples/get.ipynb rename to docs/notebooks/get.ipynb index 5a80df31..f891887b 100644 --- a/examples/get.ipynb +++ b/docs/notebooks/get.ipynb @@ -3,20 +3,42 @@ { "cell_type": "code", "execution_count": 1, - "id": "angry-angle", + "id": "loose-serum", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get a single element or a subset of a dataframe\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " rows: The rows to subset the dataframe\n", + " cols: The columns to subset the dataframe\n", + " If both rows and cols are scalar, then a single element will be\n", + " returned\n", + "\n", + " Returns:\n", + " A single element when both rows and cols are scalar, otherwise\n", + " a subset of _data\n", + " \n" + ] + } + ], "source": [ "# datar specific\n", "\n", "from datar.datasets import iris\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(get.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "developed-smoke", + "id": "athletic-wales", "metadata": {}, "outputs": [ { @@ -113,7 +135,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "quick-worse", + "id": "specialized-interface", "metadata": {}, "outputs": [ { @@ -219,7 +241,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "nonprofit-velvet", + "id": "divided-negative", "metadata": {}, "outputs": [ { @@ -241,7 +263,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "tired-conspiracy", + "id": "suited-philadelphia", "metadata": {}, "outputs": [ { @@ -264,7 +286,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "wooden-slope", + "id": "sticky-martin", "metadata": {}, "outputs": [ { diff --git a/examples/group_by.ipynb b/docs/notebooks/group_by.ipynb similarity index 71% rename from examples/group_by.ipynb rename to docs/notebooks/group_by.ipynb index 4899aff9..7282751f 100644 --- a/examples/group_by.ipynb +++ b/docs/notebooks/group_by.ipynb @@ -4,14 +4,34 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Takes an existing tbl and converts it into a grouped tbl where\n", + " operations are performed \"by group\"\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " *args: variables or computations to group by.\n", + " **kwargs: Extra variables to group the dataframe\n", + "\n", + " Return:\n", + " A DataFrameGroupBy object\n", + " \n" + ] + } + ], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "\n", "# https://dplyr.tidyverse.org/reference/group_by.html\n", "from datar.datasets import mtcars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(group_by.__doc__)" ] }, { @@ -23,7 +43,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 16:07:20][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" + "[2021-04-03 00:30:33][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" ] }, { @@ -62,7 +82,7 @@ " \n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -76,7 +96,7 @@ " 4\n", " \n", " \n", - " 
Mazda RX4 Wag\n", + " 1\n", " 21.0\n", " 6\n", " 160.0\n", @@ -90,7 +110,7 @@ " 4\n", " \n", " \n", - " Datsun 710\n", + " 2\n", " 22.8\n", " 4\n", " 108.0\n", @@ -104,7 +124,7 @@ " 1\n", " \n", " \n", - " Hornet 4 Drive\n", + " 3\n", " 21.4\n", " 6\n", " 258.0\n", @@ -118,7 +138,7 @@ " 1\n", " \n", " \n", - " Hornet Sportabout\n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -132,7 +152,7 @@ " 2\n", " \n", " \n", - " Valiant\n", + " 5\n", " 18.1\n", " 6\n", " 225.0\n", @@ -146,7 +166,7 @@ " 1\n", " \n", " \n", - " Duster 360\n", + " 6\n", " 14.3\n", " 8\n", " 360.0\n", @@ -160,7 +180,7 @@ " 4\n", " \n", " \n", - " Merc 240D\n", + " 7\n", " 24.4\n", " 4\n", " 146.7\n", @@ -174,7 +194,7 @@ " 2\n", " \n", " \n", - " Merc 230\n", + " 8\n", " 22.8\n", " 4\n", " 140.8\n", @@ -188,7 +208,7 @@ " 2\n", " \n", " \n", - " Merc 280\n", + " 9\n", " 19.2\n", " 6\n", " 167.6\n", @@ -202,7 +222,7 @@ " 4\n", " \n", " \n", - " Merc 280C\n", + " 10\n", " 17.8\n", " 6\n", " 167.6\n", @@ -216,7 +236,7 @@ " 4\n", " \n", " \n", - " Merc 450SE\n", + " 11\n", " 16.4\n", " 8\n", " 275.8\n", @@ -230,7 +250,7 @@ " 3\n", " \n", " \n", - " Merc 450SL\n", + " 12\n", " 17.3\n", " 8\n", " 275.8\n", @@ -244,7 +264,7 @@ " 3\n", " \n", " \n", - " Merc 450SLC\n", + " 13\n", " 15.2\n", " 8\n", " 275.8\n", @@ -258,7 +278,7 @@ " 3\n", " \n", " \n", - " Cadillac Fleetwood\n", + " 14\n", " 10.4\n", " 8\n", " 472.0\n", @@ -272,7 +292,7 @@ " 4\n", " \n", " \n", - " Lincoln Continental\n", + " 15\n", " 10.4\n", " 8\n", " 460.0\n", @@ -286,7 +306,7 @@ " 4\n", " \n", " \n", - " Chrysler Imperial\n", + " 16\n", " 14.7\n", " 8\n", " 440.0\n", @@ -300,7 +320,7 @@ " 4\n", " \n", " \n", - " Fiat 128\n", + " 17\n", " 32.4\n", " 4\n", " 78.7\n", @@ -314,7 +334,7 @@ " 1\n", " \n", " \n", - " Honda Civic\n", + " 18\n", " 30.4\n", " 4\n", " 75.7\n", @@ -328,7 +348,7 @@ " 2\n", " \n", " \n", - " Toyota Corolla\n", + " 19\n", " 33.9\n", " 4\n", " 71.1\n", @@ -342,7 +362,7 @@ " 1\n", " \n", " \n", - " Toyota Corona\n", + " 20\n", " 21.5\n", " 4\n", " 120.1\n", @@ -356,7 +376,7 @@ " 1\n", " \n", " \n", - " Dodge Challenger\n", + " 21\n", " 15.5\n", " 8\n", " 318.0\n", @@ -370,7 +390,7 @@ " 2\n", " \n", " \n", - " AMC Javelin\n", + " 22\n", " 15.2\n", " 8\n", " 304.0\n", @@ -384,7 +404,7 @@ " 2\n", " \n", " \n", - " Camaro Z28\n", + " 23\n", " 13.3\n", " 8\n", " 350.0\n", @@ -398,7 +418,7 @@ " 4\n", " \n", " \n", - " Pontiac Firebird\n", + " 24\n", " 19.2\n", " 8\n", " 400.0\n", @@ -412,7 +432,7 @@ " 2\n", " \n", " \n", - " Fiat X1-9\n", + " 25\n", " 27.3\n", " 4\n", " 79.0\n", @@ -426,7 +446,7 @@ " 1\n", " \n", " \n", - " Porsche 914-2\n", + " 26\n", " 26.0\n", " 4\n", " 120.3\n", @@ -440,7 +460,7 @@ " 2\n", " \n", " \n", - " Lotus Europa\n", + " 27\n", " 30.4\n", " 4\n", " 95.1\n", @@ -454,7 +474,7 @@ " 2\n", " \n", " \n", - " Ford Pantera L\n", + " 28\n", " 15.8\n", " 8\n", " 351.0\n", @@ -468,7 +488,7 @@ " 4\n", " \n", " \n", - " Ferrari Dino\n", + " 29\n", " 19.7\n", " 6\n", " 145.0\n", @@ -482,7 +502,7 @@ " 6\n", " \n", " \n", - " Maserati Bora\n", + " 30\n", " 15.0\n", " 8\n", " 301.0\n", @@ -496,7 +516,7 @@ " 8\n", " \n", " \n", - " Volvo 142E\n", + " 31\n", " 21.4\n", " 4\n", " 121.0\n", @@ -514,73 +534,39 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Hornet Sportabout 
18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "\n", - " carb \n", - "Mazda RX4 4 \n", - "Mazda RX4 Wag 4 \n", - "Datsun 710 1 \n", - "Hornet 4 Drive 1 \n", - "Hornet Sportabout 2 \n", - "Valiant 1 \n", - "Duster 360 4 \n", - "Merc 240D 2 \n", - "Merc 230 2 \n", - "Merc 280 4 \n", - "Merc 280C 4 \n", - "Merc 450SE 3 \n", - "Merc 450SL 3 \n", - "Merc 450SLC 3 \n", - "Cadillac Fleetwood 4 \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Fiat 128 1 \n", - "Honda Civic 2 \n", - "Toyota Corolla 1 \n", - "Toyota Corona 1 \n", - "Dodge Challenger 2 \n", - "AMC Javelin 2 \n", - "Camaro Z28 4 \n", - "Pontiac Firebird 2 \n", - "Fiat X1-9 1 \n", - "Porsche 914-2 2 \n", - "Lotus Europa 2 \n", - "Ford Pantera L 4 \n", - "Ferrari Dino 6 \n", - "Maserati Bora 8 \n", - "Volvo 142E 2 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", + "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", + 
"21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", + "22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", + "23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", + "25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", + "26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", + "27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", + "28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", + "29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", + "30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", + "31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2" ] }, "execution_count": 2, @@ -590,7 +576,7 @@ ], "source": [ "by_cyl = mtcars >> group_by(f.cyl) \n", - "by_cyl >> showme()" + "by_cyl >> display()" ] }, { @@ -622,63 +608,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 16:07:20][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-03 00:30:33][datar][ INFO] `summarise()` regrouping output by ['cyl']. You can override using the `.groups` argument.\n" ] }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cyldisphp
06183.314286122.285714
04105.13636482.636364
08353.100000209.214286
\n", - "
" - ], "text/plain": [ - " cyl disp hp\n", - "0 6 183.314286 122.285714\n", - "0 4 105.136364 82.636364\n", - "0 8 353.100000 209.214286" + "" ] }, "execution_count": 4, @@ -702,7 +638,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 16:07:20][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" + "[2021-04-03 00:30:33][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" ] }, { @@ -742,20 +678,6 @@ " \n", " \n", " 0\n", - " 21.4\n", - " 6\n", - " 258.0\n", - " 110\n", - " 3.08\n", - " 3.215\n", - " 19.44\n", - " 1\n", - " 0\n", - " 3\n", - " 1\n", - " \n", - " \n", - " 1\n", " 24.4\n", " 4\n", " 146.7\n", @@ -769,6 +691,20 @@ " 2\n", " \n", " \n", + " 1\n", + " 21.4\n", + " 6\n", + " 258.0\n", + " 110\n", + " 3.08\n", + " 3.215\n", + " 19.44\n", + " 1\n", + " 0\n", + " 3\n", + " 1\n", + " \n", + " \n", " 2\n", " 10.4\n", " 8\n", @@ -788,8 +724,8 @@ ], "text/plain": [ " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "0 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", - "1 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "0 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "1 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", "2 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4" ] }, @@ -799,7 +735,7 @@ } ], "source": [ - "by_cyl >> filter(f.disp == max(f.disp)) >> showme()" + "by_cyl >> filter(f.disp == max(f.disp)) >> display()" ] }, { @@ -811,8 +747,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 16:07:20][datar][ INFO] `summarise()` regrouping output by ['vs'] (override with `_groups` argument)\n", - "[2021-03-12 16:07:20][datar][ INFO] # [DataFrameGroupBy] Groups: ['vs'] (2)\n" + "[2021-04-03 00:30:33][datar][ INFO] `summarise()` regrouping output by ['vs', 'am']. You can override using the `.groups` argument.\n", + "[2021-04-03 00:30:33][datar][ INFO] # [DataFrameGroupBy] Groups: ['vs', 'am'] (4)\n" ] }, { @@ -845,26 +781,26 @@ " \n", " 0\n", " 0\n", - " 1\n", - " 6\n", + " 0\n", + " 12\n", " \n", " \n", - " 0\n", - " 1\n", + " 1\n", + " 0\n", " 1\n", - " 7\n", + " 6\n", " \n", " \n", - " 0\n", + " 2\n", " 1\n", " 0\n", " 7\n", " \n", " \n", - " 0\n", - " 0\n", - " 0\n", - " 12\n", + " 3\n", + " 1\n", + " 1\n", + " 7\n", " \n", " \n", "\n", @@ -872,10 +808,10 @@ ], "text/plain": [ " vs am n\n", - "0 0 1 6\n", - "0 1 1 7\n", - "0 1 0 7\n", - "0 0 0 12" + "0 0 0 12\n", + "1 0 1 6\n", + "2 1 0 7\n", + "3 1 1 7" ] }, "execution_count": 6, @@ -886,7 +822,7 @@ "source": [ "by_vs_am = mtcars >> group_by(f.vs, f.am)\n", "by_vs = by_vs_am >> summarise(n=n())\n", - "by_vs >> showme()" + "by_vs >> display()" ] }, { @@ -898,53 +834,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 16:07:20][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-03 00:30:33][datar][ INFO] `summarise()` regrouping output by ['vs'] (override with `_groups` argument)\n" ] }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vsn
0018
0114
\n", - "
" - ], "text/plain": [ - " vs n\n", - "0 0 18\n", - "0 1 14" + "" ] }, "execution_count": 7, @@ -1029,7 +925,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 16:07:20][datar][ INFO] # [DataFrameGroupBy] Groups: ['vsam'] (3)\n" + "[2021-04-03 00:30:33][datar][ INFO] # [DataFrameGroupBy] Groups: ['vsam'] (3)\n" ] }, { @@ -1069,7 +965,7 @@ " \n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -1084,7 +980,7 @@ " 1\n", " \n", " \n", - " Mazda RX4 Wag\n", + " 1\n", " 21.0\n", " 6\n", " 160.0\n", @@ -1099,7 +995,7 @@ " 1\n", " \n", " \n", - " Datsun 710\n", + " 2\n", " 22.8\n", " 4\n", " 108.0\n", @@ -1114,7 +1010,7 @@ " 2\n", " \n", " \n", - " Hornet 4 Drive\n", + " 3\n", " 21.4\n", " 6\n", " 258.0\n", @@ -1129,7 +1025,7 @@ " 1\n", " \n", " \n", - " Hornet Sportabout\n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -1144,7 +1040,7 @@ " 0\n", " \n", " \n", - " Valiant\n", + " 5\n", " 18.1\n", " 6\n", " 225.0\n", @@ -1159,7 +1055,7 @@ " 1\n", " \n", " \n", - " Duster 360\n", + " 6\n", " 14.3\n", " 8\n", " 360.0\n", @@ -1174,7 +1070,7 @@ " 0\n", " \n", " \n", - " Merc 240D\n", + " 7\n", " 24.4\n", " 4\n", " 146.7\n", @@ -1189,7 +1085,7 @@ " 1\n", " \n", " \n", - " Merc 230\n", + " 8\n", " 22.8\n", " 4\n", " 140.8\n", @@ -1204,7 +1100,7 @@ " 1\n", " \n", " \n", - " Merc 280\n", + " 9\n", " 19.2\n", " 6\n", " 167.6\n", @@ -1219,7 +1115,7 @@ " 1\n", " \n", " \n", - " Merc 280C\n", + " 10\n", " 17.8\n", " 6\n", " 167.6\n", @@ -1234,7 +1130,7 @@ " 1\n", " \n", " \n", - " Merc 450SE\n", + " 11\n", " 16.4\n", " 8\n", " 275.8\n", @@ -1249,7 +1145,7 @@ " 0\n", " \n", " \n", - " Merc 450SL\n", + " 12\n", " 17.3\n", " 8\n", " 275.8\n", @@ -1264,7 +1160,7 @@ " 0\n", " \n", " \n", - " Merc 450SLC\n", + " 13\n", " 15.2\n", " 8\n", " 275.8\n", @@ -1279,7 +1175,7 @@ " 0\n", " \n", " \n", - " Cadillac Fleetwood\n", + " 14\n", " 10.4\n", " 8\n", " 472.0\n", @@ -1294,7 +1190,7 @@ " 0\n", " \n", " \n", - " Lincoln Continental\n", + " 15\n", " 10.4\n", " 8\n", " 460.0\n", @@ -1309,7 +1205,7 @@ " 0\n", " \n", " \n", - " Chrysler Imperial\n", + " 16\n", " 14.7\n", " 8\n", " 440.0\n", @@ -1324,7 +1220,7 @@ " 0\n", " \n", " \n", - " Fiat 128\n", + " 17\n", " 32.4\n", " 4\n", " 78.7\n", @@ -1339,7 +1235,7 @@ " 2\n", " \n", " \n", - " Honda Civic\n", + " 18\n", " 30.4\n", " 4\n", " 75.7\n", @@ -1354,7 +1250,7 @@ " 2\n", " \n", " \n", - " Toyota Corolla\n", + " 19\n", " 33.9\n", " 4\n", " 71.1\n", @@ -1369,7 +1265,7 @@ " 2\n", " \n", " \n", - " Toyota Corona\n", + " 20\n", " 21.5\n", " 4\n", " 120.1\n", @@ -1384,7 +1280,7 @@ " 1\n", " \n", " \n", - " Dodge Challenger\n", + " 21\n", " 15.5\n", " 8\n", " 318.0\n", @@ -1399,7 +1295,7 @@ " 0\n", " \n", " \n", - " AMC Javelin\n", + " 22\n", " 15.2\n", " 8\n", " 304.0\n", @@ -1414,7 +1310,7 @@ " 0\n", " \n", " \n", - " Camaro Z28\n", + " 23\n", " 13.3\n", " 8\n", " 350.0\n", @@ -1429,7 +1325,7 @@ " 0\n", " \n", " \n", - " Pontiac Firebird\n", + " 24\n", " 19.2\n", " 8\n", " 400.0\n", @@ -1444,7 +1340,7 @@ " 0\n", " \n", " \n", - " Fiat X1-9\n", + " 25\n", " 27.3\n", " 4\n", " 79.0\n", @@ -1459,7 +1355,7 @@ " 2\n", " \n", " \n", - " Porsche 914-2\n", + " 26\n", " 26.0\n", " 4\n", " 120.3\n", @@ -1474,7 +1370,7 @@ " 1\n", " \n", " \n", - " Lotus Europa\n", + " 27\n", " 30.4\n", " 4\n", " 95.1\n", @@ -1489,7 +1385,7 @@ " 2\n", " \n", " \n", - " Ford Pantera L\n", + " 28\n", " 15.8\n", " 8\n", " 351.0\n", @@ -1504,7 +1400,7 @@ " 1\n", " \n", " \n", - " Ferrari Dino\n", + " 29\n", " 19.7\n", " 6\n", " 145.0\n", @@ -1519,7 +1415,7 @@ " 
1\n", " \n", " \n", - " Maserati Bora\n", + " 30\n", " 15.0\n", " 8\n", " 301.0\n", @@ -1534,7 +1430,7 @@ " 1\n", " \n", " \n", - " Volvo 142E\n", + " 31\n", " 21.4\n", " 4\n", " 121.0\n", @@ -1553,73 +1449,39 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "\n", - " carb vsam \n", - "Mazda RX4 4 1 \n", - "Mazda RX4 Wag 4 1 \n", - "Datsun 710 1 2 \n", - "Hornet 4 Drive 1 1 \n", - "Hornet Sportabout 2 0 \n", - "Valiant 1 1 \n", - "Duster 360 4 0 \n", - "Merc 240D 2 1 \n", - "Merc 230 2 1 \n", - "Merc 280 4 1 \n", - "Merc 280C 4 1 \n", - "Merc 450SE 3 0 \n", - "Merc 450SL 3 0 \n", - "Merc 450SLC 3 0 \n", - "Cadillac Fleetwood 4 0 \n", - "Lincoln Continental 4 0 \n", - "Chrysler Imperial 4 0 \n", - "Fiat 128 1 2 \n", - "Honda Civic 2 2 \n", - "Toyota Corolla 1 2 \n", - "Toyota Corona 1 1 \n", - "Dodge Challenger 2 0 \n", - "AMC Javelin 2 0 \n", - "Camaro Z28 4 0 \n", - "Pontiac Firebird 2 0 \n", - "Fiat X1-9 1 2 \n", - "Porsche 914-2 2 1 \n", - "Lotus Europa 2 2 \n", - "Ford Pantera L 4 1 \n", - "Ferrari Dino 6 1 \n", - "Maserati Bora 8 1 \n", - "Volvo 142E 2 2 " + " mpg cyl disp hp drat wt qsec vs am gear carb vsam\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 1\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 1\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 2\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 1\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 0\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 1\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 0\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 1\n", + "8 22.8 4 140.8 95 3.92 3.150 
22.90 1 0 4 2 1\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 1\n", + "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 1\n", + "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 0\n", + "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 0\n", + "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 0\n", + "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 0\n", + "15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 0\n", + "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 0\n", + "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 2\n", + "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 2\n", + "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 2\n", + "20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 1\n", + "21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 0\n", + "22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 0\n", + "23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 0\n", + "24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 0\n", + "25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 2\n", + "26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 1\n", + "27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 2\n", + "28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 1\n", + "29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 1\n", + "30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 1\n", + "31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 2" ] }, "execution_count": 9, @@ -1630,7 +1492,7 @@ "source": [ "mtcars_vsam = mtcars >> group_by(vsam=f.vs + f.am) \n", "mtcars_vsam >> group_vars()\n", - "mtcars_vsam >> showme()" + "mtcars_vsam >> display()" ] }, { diff --git a/examples/group_map.ipynb b/docs/notebooks/group_map.ipynb similarity index 78% rename from examples/group_map.ipynb rename to docs/notebooks/group_map.ipynb index 1861b835..bcf81ae2 100644 --- a/examples/group_map.ipynb +++ b/docs/notebooks/group_map.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "packed-macintosh", + "id": "wanted-cooking", "metadata": {}, "outputs": [], "source": [ @@ -15,25 +15,21 @@ { "cell_type": "code", "execution_count": 2, - "id": "established-fight", + "id": "banner-czech", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[ mpg cyl disp hp drat wt qsec vs am gear carb\n", - " Mazda RX4 21.0 6 160.0 110 3.9 2.620 16.46 0 1 4 4\n", - " Mazda RX4 Wag 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4,\n", - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - " Datsun 710 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1\n", - " Merc 240D 24.4 4 146.7 62 3.69 3.19 20.00 1 0 4 2,\n", - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - " Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 \n", - " Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 \n", - " \n", - " carb \n", - " Hornet Sportabout 2 \n", - " Duster 360 4 ]" + "[ mpg cyl disp hp drat wt qsec vs am gear carb\n", + " 2 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1\n", + " 7 24.4 4 146.7 62 3.69 3.19 20.00 1 0 4 2,\n", + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + " 0 21.0 6 160.0 110 3.9 2.620 16.46 0 1 4 4\n", + " 1 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4,\n", + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + " 4 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2\n", + " 6 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4]" ] }, "execution_count": 2, @@ -50,7 +46,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "removable-contest", + "id": "harmful-magnitude", "metadata": {}, "outputs": [ { @@ -89,35 +85,7 @@ " \n", " \n", " \n", - " Mazda RX4\n", - " 21.0\n", - " 6\n", - " 160.0\n", - " 110\n", - " 3.90\n", - " 2.620\n", - " 16.46\n", - " 0\n", - " 1\n", - " 4\n", - " 4\n", - " \n", - " \n", - " Mazda RX4 Wag\n", - " 21.0\n", - " 6\n", 
- " 160.0\n", - " 110\n", - " 3.90\n", - " 2.875\n", - " 17.02\n", - " 0\n", - " 1\n", - " 4\n", - " 4\n", - " \n", - " \n", - " Datsun 710\n", + " 0\n", " 22.8\n", " 4\n", " 108.0\n", @@ -131,7 +99,7 @@ " 1\n", " \n", " \n", - " Merc 240D\n", + " 1\n", " 24.4\n", " 4\n", " 146.7\n", @@ -145,7 +113,35 @@ " 2\n", " \n", " \n", - " Hornet Sportabout\n", + " 2\n", + " 21.0\n", + " 6\n", + " 160.0\n", + " 110\n", + " 3.90\n", + " 2.620\n", + " 16.46\n", + " 0\n", + " 1\n", + " 4\n", + " 4\n", + " \n", + " \n", + " 3\n", + " 21.0\n", + " 6\n", + " 160.0\n", + " 110\n", + " 3.90\n", + " 2.875\n", + " 17.02\n", + " 0\n", + " 1\n", + " 4\n", + " 4\n", + " \n", + " \n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -159,7 +155,7 @@ " 2\n", " \n", " \n", - " Duster 360\n", + " 5\n", " 14.3\n", " 8\n", " 360.0\n", @@ -177,21 +173,13 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "\n", - " carb \n", - "Mazda RX4 4 \n", - "Mazda RX4 Wag 4 \n", - "Datsun 710 1 \n", - "Merc 240D 2 \n", - "Hornet Sportabout 2 \n", - "Duster 360 4 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "1 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "2 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "3 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "5 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4" ] }, "execution_count": 3, @@ -208,15 +196,15 @@ { "cell_type": "code", "execution_count": 4, - "id": "neural-slovenia", + "id": "perfect-algeria", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Function(func='quantile'),\n", - " Function(func='quantile'),\n", - " Function(func='quantile')]" + "[array([1.4 , 1.5 , 1.575]),\n", + " array([4. 
, 4.35, 4.6 ]),\n", + " array([5.1 , 5.55 , 5.875])]" ] }, "execution_count": 4, @@ -233,7 +221,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "higher-slide", + "id": "returning-factory", "metadata": {}, "outputs": [ { @@ -261,7 +249,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "sudden-surveillance", + "id": "determined-bosnia", "metadata": {}, "outputs": [ { diff --git a/examples/group_split.ipynb b/docs/notebooks/group_split.ipynb similarity index 98% rename from examples/group_split.ipynb rename to docs/notebooks/group_split.ipynb index 135d1434..2830dae3 100644 --- a/examples/group_split.ipynb +++ b/docs/notebooks/group_split.ipynb @@ -3,20 +3,30 @@ { "cell_type": "code", "execution_count": 1, - "id": "olympic-completion", + "id": "straight-sight", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get a list of data in each group\n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/group_split.html\n", "\n", "from datar.datasets import iris\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(group_split.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "treated-stevens", + "id": "advised-convention", "metadata": {}, "outputs": [ { @@ -191,7 +201,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "excited-behalf", + "id": "alike-serum", "metadata": {}, "outputs": [ { @@ -254,7 +264,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "driving-freeware", + "id": "conservative-zimbabwe", "metadata": {}, "outputs": [ { @@ -348,7 +358,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "dress-tournament", + "id": "electronic-reminder", "metadata": {}, "outputs": [ { @@ -411,7 +421,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "danish-measure", + "id": "aquatic-panama", "metadata": {}, "outputs": [ { @@ -584,7 +594,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "specialized-stanley", + "id": "decent-hearing", "metadata": {}, "outputs": [ { @@ -647,7 +657,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "general-vietnam", + "id": "handmade-diversity", "metadata": {}, "outputs": [ { @@ -673,7 +683,7 @@ { "cell_type": "code", "execution_count": null, - "id": "valuable-flash", + "id": "considered-spoke", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/group_trim.ipynb b/docs/notebooks/group_trim.ipynb similarity index 88% rename from examples/group_trim.ipynb rename to docs/notebooks/group_trim.ipynb index 984d9065..2d250c05 100644 --- a/examples/group_trim.ipynb +++ b/docs/notebooks/group_trim.ipynb @@ -3,20 +3,30 @@ { "cell_type": "code", "execution_count": 1, - "id": "dominican-election", + "id": "czech-federation", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trim the unused group levels\n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/group_trim.html\n", "\n", "from datar.datasets import iris\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(group_trim.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "decent-protein", + "id": "official-estimate", "metadata": {}, "outputs": [ { @@ -40,7 +50,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "considerable-submission", + "id": "looking-wyoming", "metadata": {}, "outputs": [ { diff --git a/examples/lead-lag.ipynb b/docs/notebooks/lead-lag.ipynb similarity index 88% rename from 
examples/lead-lag.ipynb rename to docs/notebooks/lead-lag.ipynb index c72857fc..27f9e8e5 100644 --- a/examples/lead-lag.ipynb +++ b/docs/notebooks/lead-lag.ipynb @@ -3,20 +3,46 @@ { "cell_type": "code", "execution_count": 1, - "id": "acting-fleet", + "id": "adjacent-given", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Find next values in a vector\n", + "\n", + " Args:\n", + " series: Vector of values\n", + " n: Positive integer of length 1, giving the number of positions to\n", + " lead or lag by\n", + " default: Value used for non-existent rows.\n", + " order_by: Override the default ordering to use another vector or column\n", + "\n", + " Returns:\n", + " Lead or lag values with default values filled to series.\n", + " \n", + "Find previous values in a vector\n", + "\n", + " See lead()\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/lead-lag.html\n", "\n", "from datar import f\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(lead.__doc__)\n", + "print(lag.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "running-artwork", + "id": "flexible-personal", "metadata": {}, "outputs": [ { @@ -44,7 +70,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "arctic-raleigh", + "id": "liked-swedish", "metadata": {}, "outputs": [ { @@ -70,7 +96,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "surprising-static", + "id": "prompt-closing", "metadata": {}, "outputs": [ { @@ -155,7 +181,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "about-clinton", + "id": "varying-stress", "metadata": {}, "outputs": [ { @@ -181,7 +207,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "upset-fundamentals", + "id": "novel-ballet", "metadata": {}, "outputs": [ { @@ -207,7 +233,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "impaired-newton", + "id": "choice-retailer", "metadata": {}, "outputs": [ { @@ -233,7 +259,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "widespread-vancouver", + "id": "weird-former", "metadata": {}, "outputs": [ { @@ -259,7 +285,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "still-punch", + "id": "agreed-treasurer", "metadata": {}, "outputs": [ { @@ -285,7 +311,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "critical-baking", + "id": "isolated-tampa", "metadata": {}, "outputs": [ { @@ -311,7 +337,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "guilty-vatican", + "id": "broadband-portal", "metadata": {}, "outputs": [ { @@ -345,13 +371,13 @@ " 0\n", " 2000\n", " 0\n", - " 1.0\n", + " 16.0\n", " \n", " \n", " 1\n", " 2001\n", " 1\n", - " 4.0\n", + " 25.0\n", " \n", " \n", " 2\n", @@ -363,19 +389,19 @@ " 3\n", " 2003\n", " 9\n", - " 16.0\n", + " 4.0\n", " \n", " \n", " 4\n", " 2004\n", " 16\n", - " 25.0\n", + " 1.0\n", " \n", " \n", " 5\n", " 2005\n", " 25\n", - " 0.0\n", + " 9.0\n", " \n", " \n", "\n", @@ -383,12 +409,12 @@ ], "text/plain": [ " year value previous_year_value\n", - "0 2000 0 1.0\n", - "1 2001 1 4.0\n", + "0 2000 0 16.0\n", + "1 2001 1 25.0\n", "2 2002 4 NaN\n", - "3 2003 9 16.0\n", - "4 2004 16 25.0\n", - "5 2005 25 0.0" + "3 2003 9 4.0\n", + "4 2004 16 1.0\n", + "5 2005 25 9.0" ] }, "execution_count": 11, @@ -409,7 +435,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "mathematical-chambers", + "id": "ambient-compression", "metadata": {}, "outputs": [ { @@ -502,7 +528,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
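
The lead()/lag() docstring printed above, together with the previous_year_value cell, covers the n, default, and order_by arguments; a minimal sketch, assuming the two functions behave as documented (the scrambled row order in the notebook is exactly what makes order_by matter):

    # Sketch only: assumes lead()/lag() behave as the docstring describes.
    from datar.all import *

    lead([1, 2, 3, 4, 5])            # -> [2, 3, 4, 5, NA]
    lag([1, 2, 3, 4, 5])             # -> [NA, 1, 2, 3, 4]
    lag([1, 2, 3, 4, 5], default=0)  # non-existent row filled with 0

    # order_by: "previous" is defined by year, not by row position
    df = tibble(year=[2000, 2001, 2002, 2003, 2004, 2005],
                value=[0, 1, 4, 9, 16, 25])
    df >> mutate(previous_year_value=lag(f.value, order_by=f.year))
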
"welcome-defeat", + "id": "italic-hebrew", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/mutate-joins.ipynb b/docs/notebooks/mutate-joins.ipynb similarity index 94% rename from examples/mutate-joins.ipynb rename to docs/notebooks/mutate-joins.ipynb index f3ce1e4c..063c90b5 100644 --- a/examples/mutate-joins.ipynb +++ b/docs/notebooks/mutate-joins.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "favorite-terminal", + "id": "sacred-thirty", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "expanded-transportation", + "id": "charming-locking", "metadata": {}, "outputs": [ { @@ -79,7 +79,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "afraid-peoples", + "id": "early-titanium", "metadata": {}, "outputs": [ { @@ -150,7 +150,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "polar-heritage", + "id": "introductory-values", "metadata": {}, "outputs": [ { @@ -221,7 +221,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "prompt-postcard", + "id": "fresh-undergraduate", "metadata": {}, "outputs": [ { @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "caring-dining", + "id": "parental-simon", "metadata": {}, "outputs": [ { @@ -323,32 +323,47 @@ " \n", " \n", " \n", - " name\n", + " key_0\n", + " name_x\n", " band\n", + " name_y\n", " plays\n", " \n", " \n", " \n", " \n", " 0\n", + " Mick\n", + " Mick\n", + " Stones\n", " John\n", - " Beatles\n", " guitar\n", " \n", " \n", " 1\n", - " Paul\n", + " John\n", + " John\n", " Beatles\n", + " Paul\n", " bass\n", " \n", + " \n", + " 2\n", + " Paul\n", + " Paul\n", + " Beatles\n", + " Keith\n", + " guitar\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " name band plays\n", - "0 John Beatles guitar\n", - "1 Paul Beatles bass" + " key_0 name_x band name_y plays\n", + "0 Mick Mick Stones John guitar\n", + "1 John John Beatles Paul bass\n", + "2 Paul Paul Beatles Keith guitar" ] }, "execution_count": 6, @@ -363,7 +378,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "democratic-provision", + "id": "arabic-probe", "metadata": {}, "outputs": [ { @@ -441,7 +456,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "finnish-siemens", + "id": "broad-feeling", "metadata": {}, "outputs": [ { @@ -524,7 +539,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "european-shoulder", + "id": "impressive-camcorder", "metadata": {}, "outputs": [ { @@ -599,7 +614,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "approved-delhi", + "id": "removed-atlanta", "metadata": {}, "outputs": [ { diff --git a/examples/mutate.ipynb b/docs/notebooks/mutate.ipynb similarity index 89% rename from examples/mutate.ipynb rename to docs/notebooks/mutate.ipynb index 96aa6af8..50d78ced 100644 --- a/examples/mutate.ipynb +++ b/docs/notebooks/mutate.ipynb @@ -2,16 +2,65 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adds new variables and preserves existing ones\n", + "\n", + " The original API:\n", + " https://dplyr.tidyverse.org/reference/summarise.html\n", + "\n", + " Args:\n", + " _data: A data frame\n", + " _keep: allows you to control which columns from _data are retained\n", + " in the output:\n", + " - \"all\", the default, retains all variables.\n", + " - \"used\" keeps any variables used to make new variables;\n", + " it's useful for checking your work 
as it displays inputs and\n", + " outputs side-by-side.\n", + " - \"unused\" keeps only existing variables not used to make new\n", + " variables.\n", + " - \"none\", only keeps grouping keys (like transmute()).\n", + " _before, _after: Optionally, control where new columns should appear\n", + " (the default is to add to the right hand side).\n", + " See relocate() for more details.\n", + " *args, **kwargs: Name-value pairs. The name gives the name of the column\n", + " in the output. The value can be:\n", + " - A vector of length 1, which will be recycled to the correct\n", + " length.\n", + " - A vector the same length as the current group (or the whole\n", + " data frame if ungrouped).\n", + " - None to remove the column\n", + "\n", + " Returns:\n", + " An object of the same type as _data. The output has the following\n", + " properties:\n", + " - Rows are not affected.\n", + " - Existing columns will be preserved according to the _keep\n", + " argument. New columns will be placed according to the\n", + " _before and _after arguments. If _keep = \"none\"\n", + " (as in transmute()), the output order is determined only\n", + " by ..., not the order of existing columns.\n", + " - Columns given value None will be removed\n", + " - Groups will be recomputed if a grouping variable is mutated.\n", + " - Data frame attributes are preserved.\n", + " \n" + ] + } + ], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "\n", "# https://dplyr.tidyverse.org/reference/mutate.html\n", "from datar.datasets import starwars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(mutate.__doc__)" ] }, { @@ -573,14 +622,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:38:01][datar][ INFO] # [DataFrameGroupBy] Groups: ['homeworld'] (49)\n" + "[2021-04-03 00:32:34][datar][ INFO] # [DataFrameGroupBy] Groups: ['homeworld'] (49)\n" ] }, { @@ -613,38 +662,38 @@ " \n", " \n", " 0\n", - " Luke Skywalker\n", - " 77.0\n", - " Tatooine\n", - " 5.0\n", + " Leia Organa\n", + " 49.0\n", + " Alderaan\n", + " 2.0\n", " \n", " \n", " 1\n", - " C-3PO\n", - " 75.0\n", - " Tatooine\n", - " 6.0\n", + " Bail Prestor Organa\n", + " NaN\n", + " Alderaan\n", + " NaN\n", " \n", " \n", " 2\n", - " Darth Vader\n", - " 136.0\n", - " Tatooine\n", + " Raymus Antilles\n", + " 79.0\n", + " Alderaan\n", " 1.0\n", " \n", " \n", " 3\n", - " Owen Lars\n", - " 120.0\n", - " Tatooine\n", - " 2.0\n", + " Ratts Tyerell\n", + " 15.0\n", + " Aleen Minor\n", + " 1.0\n", " \n", " \n", " 4\n", - " Beru Whitesun lars\n", - " 75.0\n", - " Tatooine\n", - " 6.0\n", + " Lobot\n", + " 79.0\n", + " Bespin\n", + " 1.0\n", " \n", " \n", " ...\n", @@ -694,23 +743,23 @@ "" ], "text/plain": [ - " name mass homeworld rank\n", - "0 Luke Skywalker 77.0 Tatooine 5.0\n", - "1 C-3PO 75.0 Tatooine 6.0\n", - "2 Darth Vader 136.0 Tatooine 1.0\n", - "3 Owen Lars 120.0 Tatooine 2.0\n", - "4 Beru Whitesun lars 75.0 Tatooine 6.0\n", - ".. ... ... ... ...\n", - "82 Finn NaN NaN NaN\n", - "83 Rey NaN NaN NaN\n", - "84 Poe Dameron NaN NaN NaN\n", - "85 BB8 NaN NaN NaN\n", - "86 Captain Phasma NaN NaN NaN\n", + " name mass homeworld rank\n", + "0 Leia Organa 49.0 Alderaan 2.0\n", + "1 Bail Prestor Organa NaN Alderaan NaN\n", + "2 Raymus Antilles 79.0 Alderaan 1.0\n", + "3 Ratts Tyerell 15.0 Aleen Minor 1.0\n", + "4 Lobot 79.0 Bespin 1.0\n", + ".. 
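
The mutate() docstring above is dense; a minimal sketch of a plain name-value pair plus the _keep knob, assuming both behave as documented (the bmi column is hypothetical, not taken from the notebook):

    # Sketch only: assumes mutate()'s _keep behaves as documented above.
    from datar.datasets import starwars
    from datar.all import *

    starwars >> mutate(
        bmi=f.mass / (f.height / 100) ** 2,  # vectorized name-value pair
        _keep="used",  # keep only the inputs of the new column, for checking
    )
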
... ... ... ...\n", + "82 Finn NaN NaN NaN\n", + "83 Rey NaN NaN NaN\n", + "84 Poe Dameron NaN NaN NaN\n", + "85 BB8 NaN NaN NaN\n", + "86 Captain Phasma NaN NaN NaN\n", "\n", "[87 rows x 4 columns]" ] }, - "execution_count": 17, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -720,12 +769,12 @@ " select(f.name, f.mass, f.homeworld) >> \\\n", " group_by(f.homeworld) >> \\\n", " mutate(rank=min_rank(desc(f.mass))) >> \\\n", - " showme()" + " display()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -770,7 +819,7 @@ "0 1 2 3" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -783,7 +832,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -828,7 +877,7 @@ "0 3 1 2" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -839,7 +888,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -884,7 +933,7 @@ "0 1 3 2" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -895,7 +944,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -944,7 +993,7 @@ "0 1 2 a b 3" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +1005,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1001,7 +1050,7 @@ "0 1 2 3" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1012,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1057,7 +1106,7 @@ "0 a b 3" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1068,7 +1117,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1109,7 +1158,7 @@ "0 3" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1120,7 +1169,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1250,7 +1299,7 @@ "[87 rows x 4 columns]" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1263,14 +1312,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/pwwang/github/datar/datar/base/funcs.py:116: RuntimeWarning: Mean of empty slice\n", + "/home/pwwang/github/datar/datar/base/funcs.py:118: RuntimeWarning: Mean of empty slice\n", " return numpy.nanmean(series) if na_rm else numpy.mean(series)\n" ] }, @@ -1304,38 +1353,38 @@ " \n", " \n", " 0\n", - " Luke Skywalker\n", - " 77.0\n", - " Human\n", - " 0.930156\n", + " Ratts Tyerell\n", + " 15.0\n", + " Aleena\n", + " 1.0\n", " \n", " \n", " 1\n", - " Darth Vader\n", - " 136.0\n", - " Human\n", - " 1.642873\n", + " Dexter Jettster\n", + " 102.0\n", + " Besalisk\n", + " 1.0\n", " \n", " \n", " 2\n", - " Leia Organa\n", - " 49.0\n", - " Human\n", - " 0.591917\n", + " Ki-Adi-Mundi\n", + " 82.0\n", + " Cerean\n", + " 1.0\n", " \n", " \n", " 3\n", - " Owen Lars\n", - " 
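
The small cells above (outputs such as "x y z" reordered to "z x y") exercise column placement and removal; their sources are not shown in this diff, so here is a minimal sketch, assuming _before/_after and None-removal behave as the mutate() docstring describes (df is a hypothetical one-row frame):

    # Sketch only: assumes _before/_after and None behave as documented.
    df = tibble(x=1, y=2)

    df >> mutate(z=f.x + f.y)               # appended on the right: x y z
    df >> mutate(z=f.x + f.y, _before=f.x)  # placed first:          z x y
    df >> mutate(z=f.x + f.y, _after=f.x)   # placed after x:        x z y
    df >> mutate(y=None)                    # None removes column y
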
120.0\n", - " Human\n", - " 1.449594\n", + " Mas Amedda\n", + " NaN\n", + " Chagrian\n", + " NaN\n", " \n", " \n", " 4\n", - " Beru Whitesun lars\n", - " 75.0\n", - " Human\n", - " 0.905996\n", + " Zam Wesell\n", + " 55.0\n", + " Clawdite\n", + " 1.0\n", " \n", " \n", " ...\n", @@ -1346,10 +1395,10 @@ " \n", " \n", " 82\n", - " Tion Medon\n", - " 80.0\n", - " Pau'an\n", - " 1.000000\n", + " Eeth Koth\n", + " NaN\n", + " Zabrak\n", + " NaN\n", " \n", " \n", " 83\n", @@ -1370,7 +1419,7 @@ " Sly Moore\n", " 48.0\n", " NaN\n", - " 1.000000\n", + " 1.0\n", " \n", " \n", " 86\n", @@ -1385,23 +1434,23 @@ "" ], "text/plain": [ - " name mass species mass_norm\n", - "0 Luke Skywalker 77.0 Human 0.930156\n", - "1 Darth Vader 136.0 Human 1.642873\n", - "2 Leia Organa 49.0 Human 0.591917\n", - "3 Owen Lars 120.0 Human 1.449594\n", - "4 Beru Whitesun lars 75.0 Human 0.905996\n", - ".. ... ... ... ...\n", - "82 Tion Medon 80.0 Pau'an 1.000000\n", - "83 Ric Olié NaN NaN NaN\n", - "84 Quarsh Panaka NaN NaN NaN\n", - "85 Sly Moore 48.0 NaN 1.000000\n", - "86 Captain Phasma NaN NaN NaN\n", + " name mass species mass_norm\n", + "0 Ratts Tyerell 15.0 Aleena 1.0\n", + "1 Dexter Jettster 102.0 Besalisk 1.0\n", + "2 Ki-Adi-Mundi 82.0 Cerean 1.0\n", + "3 Mas Amedda NaN Chagrian NaN\n", + "4 Zam Wesell 55.0 Clawdite 1.0\n", + ".. ... ... ... ...\n", + "82 Eeth Koth NaN Zabrak NaN\n", + "83 Ric Olié NaN NaN NaN\n", + "84 Quarsh Panaka NaN NaN NaN\n", + "85 Sly Moore 48.0 NaN 1.0\n", + "86 Captain Phasma NaN NaN NaN\n", "\n", "[87 rows x 4 columns]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1416,7 +1465,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1655,7 +1704,7 @@ "[87 rows x 12 columns]" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/n_distinct.ipynb b/docs/notebooks/n_distinct.ipynb similarity index 93% rename from examples/n_distinct.ipynb rename to docs/notebooks/n_distinct.ipynb index fdc9f954..52b6c88b 100644 --- a/examples/n_distinct.ipynb +++ b/docs/notebooks/n_distinct.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "unexpected-rabbit", + "id": "liked-upgrade", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "official-bibliography", + "id": "hawaiian-extreme", "metadata": {}, "outputs": [ { @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "allied-newport", + "id": "unlimited-apparatus", "metadata": {}, "outputs": [ { diff --git a/examples/na_if.ipynb b/docs/notebooks/na_if.ipynb similarity index 95% rename from examples/na_if.ipynb rename to docs/notebooks/na_if.ipynb index 76d44fe9..5e26d86f 100644 --- a/examples/na_if.ipynb +++ b/docs/notebooks/na_if.ipynb @@ -3,19 +3,37 @@ { "cell_type": "code", "execution_count": 1, - "id": "considerable-contractor", + "id": "pointed-crisis", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Convert an annoying value to NA\n", + "\n", + " Args:\n", + " x: Vector to modify\n", + " y: Value to replace with NA\n", + "\n", + " Returns:\n", + " A vector with values replaced.\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/na_if.html\n", "from datar.datasets import starwars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + 
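
A minimal sketch of na_if(), whose docstring appears just above, assuming it mirrors dplyr's na_if() (the eye_color example follows the pattern of the starwars cells in that notebook):

    # Sketch only: assumes na_if() behaves as its docstring describes.
    from datar.datasets import starwars
    from datar.all import *

    na_if([1, 2, 3, 4, 5], 5)  # -> [1, 2, 3, 4, NA]

    # Treat the placeholder string "unknown" as missing
    starwars >> select(f.name, f.eye_color) >> mutate(
        eye_color=na_if(f.eye_color, "unknown")
    )
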
"print(na_if.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "joint-termination", + "id": "elegant-globe", "metadata": {}, "outputs": [ { @@ -41,7 +59,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "discrete-uniform", + "id": "defensive-anchor", "metadata": {}, "outputs": [ { @@ -67,7 +85,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "interpreted-switzerland", + "id": "extreme-intensity", "metadata": {}, "outputs": [ { @@ -92,7 +110,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "cooperative-timber", + "id": "charming-forth", "metadata": {}, "outputs": [ { @@ -118,7 +136,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "satisfied-watts", + "id": "complex-preparation", "metadata": {}, "outputs": [ { @@ -238,7 +256,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "directed-cable", + "id": "choice-olympus", "metadata": {}, "outputs": [ { diff --git a/examples/near.ipynb b/docs/notebooks/near.ipynb similarity index 93% rename from examples/near.ipynb rename to docs/notebooks/near.ipynb index 8b083bc7..610a46f6 100644 --- a/examples/near.ipynb +++ b/docs/notebooks/near.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "heard-extraction", + "id": "acknowledged-secretariat", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "fitted-victorian", + "id": "upset-former", "metadata": {}, "outputs": [ { @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "hairy-receiver", + "id": "fallen-repeat", "metadata": {}, "outputs": [ { diff --git a/examples/nest-join.ipynb b/docs/notebooks/nest-join.ipynb similarity index 94% rename from examples/nest-join.ipynb rename to docs/notebooks/nest-join.ipynb index b1dc1d99..b5cea48d 100644 --- a/examples/nest-join.ipynb +++ b/docs/notebooks/nest-join.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "abandoned-toyota", + "id": "authorized-starter", "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "atlantic-chemical", + "id": "photographic-continent", "metadata": {}, "outputs": [ { @@ -66,7 +66,7 @@ " Paul\n", " Beatles\n", " plays\n", - "1 bass\n", + "0 bass\n", " \n", " \n", "\n", @@ -80,7 +80,7 @@ "1 John Beatles plays\n", "0 guitar\n", "2 Paul Beatles plays\n", - "1 bass" + "0 bass" ] }, "execution_count": 2, @@ -95,8 +95,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "broad-andrew", + "execution_count": 3, + "id": "handled-tampa", "metadata": {}, "outputs": [ { @@ -109,7 +109,7 @@ " 0 guitar]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/nth.ipynb b/docs/notebooks/nth.ipynb similarity index 92% rename from examples/nth.ipynb rename to docs/notebooks/nth.ipynb index 6b510972..f63f3248 100644 --- a/examples/nth.ipynb +++ b/docs/notebooks/nth.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "alpha-qualification", + "id": "apart-purchase", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "advanced-korean", + "id": "digital-lambda", "metadata": {}, "outputs": [ { @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "manual-insider", + "id": "searching-brisbane", "metadata": {}, "outputs": [ { @@ -60,7 +60,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "aging-writer", + "id": "fallen-access", 
"metadata": {}, "outputs": [ { @@ -81,7 +81,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "amber-norwegian", + "id": "bottom-boards", "metadata": {}, "outputs": [ { @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "useful-michael", + "id": "faced-briefs", "metadata": {}, "outputs": [ { @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "flush-infrared", + "id": "coastal-correction", "metadata": {}, "outputs": [ { @@ -144,7 +144,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "collectible-joyce", + "id": "precise-secretariat", "metadata": {}, "outputs": [ { @@ -165,7 +165,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "acoustic-gothic", + "id": "prescribed-argument", "metadata": {}, "outputs": [ { @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "enabling-armenia", + "id": "perceived-rebel", "metadata": {}, "outputs": [ { diff --git a/examples/pivot_longer.ipynb b/docs/notebooks/pivot_longer.ipynb similarity index 92% rename from examples/pivot_longer.ipynb rename to docs/notebooks/pivot_longer.ipynb index f28b62ea..3389288b 100644 --- a/examples/pivot_longer.ipynb +++ b/docs/notebooks/pivot_longer.ipynb @@ -2,21 +2,84 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "thermal-revolution", + "execution_count": 9, + "id": "awful-albania", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\"lengthens\" data, increasing the number of rows and\n", + " decreasing the number of columns.\n", + "\n", + " Args:\n", + " _data: A data frame to pivot.\n", + " cols: Columns to pivot into longer format.\n", + " names_to: A string specifying the name of the column to create from\n", + " the data stored in the column names of data.\n", + " Can be a character vector, creating multiple columns, if names_sep\n", + " or names_pattern is provided. In this case, there are two special\n", + " values you can take advantage of:\n", + " - None will discard that component of the name.\n", + " - .value indicates that component of the name defines the name of\n", + " the column containing the cell values, overriding values_to.\n", + " names_prefix: A regular expression used to remove matching text from\n", + " the start of each variable name.\n", + " names_sep, names_pattern: If names_to contains multiple values,\n", + " these arguments control how the column name is broken up.\n", + " names_sep takes the same specification as separate(), and\n", + " can either be a numeric vector (specifying positions to break on),\n", + " or a single string (specifying a regular expression to split on).\n", + " names_pattern: takes the same specification as extract(),\n", + " a regular expression containing matching groups (()).\n", + " names_ptypes, values_ptypes: A list of column name-prototype pairs.\n", + " A prototype (or ptype for short) is a zero-length vector\n", + " (like integer() or numeric()) that defines the type, class, and\n", + " attributes of a vector. Use these arguments if you want to confirm\n", + " that the created columns are the types that you expect.\n", + " Note that if you want to change (instead of confirm) the types\n", + " of specific columns, you should use names_transform or\n", + " values_transform instead.\n", + " names_transform, values_transform: A list of column name-function pairs.\n", + " Use these arguments if you need to change the types of\n", + " specific columns. 
For example,\n", + " names_transform = dict(week = as.integer) would convert a\n", + " character variable called week to an integer.\n", + " If not specified, the type of the columns generated from names_to\n", + " will be character, and the type of the variables generated from\n", + " values_to will be the common type of the input columns used to\n", + " generate them.\n", + " names_repair: Not supported yet.\n", + " values_to: A string specifying the name of the column to create from\n", + " the data stored in cell values. If names_to is a character\n", + " containing the special .value sentinel, this value will be ignored,\n", + " and the name of the value column will be derived from part of\n", + " the existing column names.\n", + " values_drop_na: If TRUE, will drop rows that contain only NAs in\n", + " the value_to column. This effectively converts explicit missing\n", + " values to implicit missing values, and should generally be used\n", + " only when missing values in data were created by its structure.\n", + "\n", + " Returns:\n", + " The pivoted dataframe.\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/pivot_longer.html\n", - "from datar import f\n", + "\n", "from datar.datasets import relig_income, billboard, who, anscombe\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(pivot_longer.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "consolidated-encyclopedia", + "id": "seventh-deficit", "metadata": {}, "outputs": [ { @@ -364,7 +427,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "promising-blues", + "id": "effective-notification", "metadata": {}, "outputs": [ { @@ -495,7 +558,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "compatible-rescue", + "id": "committed-painting", "metadata": {}, "outputs": [ { @@ -867,7 +930,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "straight-cycling", + "id": "legendary-murder", "metadata": {}, "outputs": [ { @@ -1028,7 +1091,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "improved-jason", + "id": "adopted-macro", "metadata": {}, "outputs": [ { @@ -1223,7 +1286,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "national-berry", + "id": "static-toner", "metadata": {}, "outputs": [ { @@ -1410,7 +1473,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "entertaining-python", + "id": "mexican-narrative", "metadata": {}, "outputs": [ { @@ -1772,7 +1835,7 @@ { "cell_type": "code", "execution_count": null, - "id": "connected-title", + "id": "ignored-panel", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/pivot_wider.ipynb b/docs/notebooks/pivot_wider.ipynb similarity index 97% rename from examples/pivot_wider.ipynb rename to docs/notebooks/pivot_wider.ipynb index 5f0f56a2..3777af61 100644 --- a/examples/pivot_wider.ipynb +++ b/docs/notebooks/pivot_wider.ipynb @@ -3,20 +3,63 @@ { "cell_type": "code", "execution_count": 1, - "id": "pleasant-lawyer", + "id": "hungry-circus", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\"widens\" data, increasing the number of columns and decreasing\n", + " the number of rows.\n", + "\n", + " Args:\n", + " _data: A data frame to pivot.\n", + " id_cols: A set of columns that uniquely identifies each observation.\n", + " Defaults to all columns in data except for the columns specified\n", + " in names_from and values_from.\n", + " names_from, values_from: A pair of arguments describing which column\n", + 
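
The long pivot_longer() docstring above is easier to read next to one concrete call; a minimal sketch on the billboard data the notebook imports, assuming the tidyselect helper starts_with() and these arguments work as documented:

    # Sketch only: assumes pivot_longer()/starts_with() mirror tidyr's.
    from datar.datasets import billboard
    from datar.all import *

    billboard >> pivot_longer(
        cols=starts_with("wk"),  # the weekly chart-position columns
        names_to="week",
        names_prefix="wk",       # strip the literal "wk" from the names
        values_to="rank",
        values_drop_na=True,     # drop weeks a track was off the chart
    )
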
" (or columns) to get the name of the output column (names_from),\n", + " and which column (or columns) to get the cell values from\n", + " (values_from).\n", + " names_prefix: String added to the start of every variable name.\n", + " names_sep: If names_from or values_from contains multiple variables,\n", + " this will be used to join their values together into a single\n", + " string to use as a column name.\n", + " names_glue: Instead of names_sep and names_prefix, you can supply\n", + " a glue specification that uses the names_from columns\n", + " (and special _value) to create custom column names.\n", + " names_sort: Should the column names be sorted? If FALSE, the default,\n", + " column names are ordered by first appearance.\n", + " names_repair: todo\n", + " values_fill: Optionally, a (scalar) value that specifies what\n", + " each value should be filled in with when missing.\n", + " values_fn: Optionally, a function applied to the value in each cell\n", + " in the output. You will typically use this when the combination\n", + " of id_cols and value column does not uniquely identify\n", + " an observation.\n", + " This can be a dict you want to apply different aggregations to\n", + " different value columns.\n", + "\n", + " Returns:\n", + " The pivoted dataframe.\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/pivot_wider.html\n", "\n", "from datar.datasets import fish_encounters, us_rent_income, warpbreaks \n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(pivot_wider.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "acknowledged-milan", + "id": "lonely-eating", "metadata": {}, "outputs": [ { @@ -146,7 +189,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "processed-crime", + "id": "worldwide-firewall", "metadata": {}, "outputs": [ { @@ -505,7 +548,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "arbitrary-steam", + "id": "several-functionality", "metadata": {}, "outputs": [ { @@ -864,7 +907,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "confident-association", + "id": "thousand-gossip", "metadata": {}, "outputs": [ { @@ -1018,7 +1061,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "advised-phenomenon", + "id": "turned-pressure", "metadata": {}, "outputs": [ { @@ -1603,7 +1646,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "quiet-amplifier", + "id": "tutorial-jurisdiction", "metadata": {}, "outputs": [ { @@ -2192,7 +2235,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "isolated-integral", + "id": "respected-federal", "metadata": {}, "outputs": [ { @@ -2781,7 +2824,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "announced-glenn", + "id": "moved-costa", "metadata": {}, "outputs": [ { @@ -3209,7 +3252,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "agreed-briefing", + "id": "friendly-excellence", "metadata": {}, "outputs": [ { diff --git a/examples/pull.ipynb b/docs/notebooks/pull.ipynb similarity index 88% rename from examples/pull.ipynb rename to docs/notebooks/pull.ipynb index f0c19e2e..e3a7f894 100644 --- a/examples/pull.ipynb +++ b/docs/notebooks/pull.ipynb @@ -3,20 +3,48 @@ { "cell_type": "code", "execution_count": 1, - "id": "stretch-modern", + "id": "essential-primary", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pull a series or a dataframe from a dataframe\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " var: The column to pull\n", + 
" name: If specified, a zip object will be return with the name-value\n", + " pairs. It can be a column name or a list of strs with the same\n", + " length as the series\n", + " Only works when pulling `a` for name `a$b`\n", + " to: Type of data to return.\n", + " Only works when pulling `a` for name `a$b`\n", + " - series: Return a pandas Series object\n", + " Group information will be lost\n", + " - array: Return a numpy.ndarray object\n", + " - list: Return a python list\n", + "\n", + " Returns:\n", + " The series data.\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/pull.html\n", "\n", "from datar.datasets import starwars, mtcars\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(pull.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "israeli-darwin", + "id": "several-session", "metadata": {}, "outputs": [ { @@ -69,7 +97,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "corrected-fault", + "id": "impressive-encyclopedia", "metadata": {}, "outputs": [ { @@ -121,7 +149,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "powered-tennessee", + "id": "specific-london", "metadata": {}, "outputs": [ { @@ -174,7 +202,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "exact-height", + "id": "closing-terminology", "metadata": {}, "outputs": [ { @@ -227,7 +255,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "developmental-monitor", + "id": "consecutive-grounds", "metadata": {}, "outputs": [ { @@ -259,7 +287,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "certain-tongue", + "id": "complete-lighting", "metadata": {}, "outputs": [ { diff --git a/examples/ranking.ipynb b/docs/notebooks/ranking.ipynb similarity index 94% rename from examples/ranking.ipynb rename to docs/notebooks/ranking.ipynb index 89b624d3..333b93a3 100644 --- a/examples/ranking.ipynb +++ b/docs/notebooks/ranking.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "retained-comparison", + "id": "duplicate-plain", "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "ranging-hybrid", + "id": "russian-neighbor", "metadata": {}, "outputs": [ { @@ -146,7 +146,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "fifteen-cable", + "id": "cosmetic-columbus", "metadata": {}, "outputs": [ { @@ -243,7 +243,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "textile-fluid", + "id": "metallic-excuse", "metadata": {}, "outputs": [ { @@ -848,7 +848,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "radical-arthritis", + "id": "former-think", "metadata": {}, "outputs": [ { @@ -887,7 +887,7 @@ " \n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -901,7 +901,7 @@ " 4\n", " \n", " \n", - " Mazda RX4 Wag\n", + " 1\n", " 21.0\n", " 6\n", " 160.0\n", @@ -915,7 +915,7 @@ " 4\n", " \n", " \n", - " Datsun 710\n", + " 2\n", " 22.8\n", " 4\n", " 108.0\n", @@ -929,7 +929,7 @@ " 1\n", " \n", " \n", - " Hornet 4 Drive\n", + " 3\n", " 21.4\n", " 6\n", " 258.0\n", @@ -943,7 +943,7 @@ " 1\n", " \n", " \n", - " Hornet Sportabout\n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -957,7 +957,7 @@ " 2\n", " \n", " \n", - " Valiant\n", + " 5\n", " 18.1\n", " 6\n", " 225.0\n", @@ -971,7 +971,7 @@ " 1\n", " \n", " \n", - " Duster 360\n", + " 6\n", " 14.3\n", " 8\n", " 360.0\n", @@ -985,7 +985,7 @@ " 4\n", " \n", " \n", - " Merc 240D\n", + " 7\n", " 24.4\n", " 4\n", " 146.7\n", @@ -999,7 +999,7 @@ " 2\n", " \n", " \n", - 
" Merc 230\n", + " 8\n", " 22.8\n", " 4\n", " 140.8\n", @@ -1013,7 +1013,7 @@ " 2\n", " \n", " \n", - " Merc 280\n", + " 9\n", " 19.2\n", " 6\n", " 167.6\n", @@ -1031,29 +1031,17 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "\n", - " carb \n", - "Mazda RX4 4 \n", - "Mazda RX4 Wag 4 \n", - "Datsun 710 1 \n", - "Hornet 4 Drive 1 \n", - "Hornet Sportabout 2 \n", - "Valiant 1 \n", - "Duster 360 4 \n", - "Merc 240D 2 \n", - "Merc 230 2 \n", - "Merc 280 4 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4" ] }, "execution_count": 5, diff --git a/examples/readme.ipynb b/docs/notebooks/readme.ipynb similarity index 99% rename from examples/readme.ipynb rename to docs/notebooks/readme.ipynb index 1ffe8e27..39e16ea6 100644 --- a/examples/readme.ipynb +++ b/docs/notebooks/readme.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "improving-color", + "id": "comparative-duration", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "verbal-liechtenstein", + "id": "understanding-focus", "metadata": {}, "outputs": [ { @@ -41,7 +41,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "expired-battle", + "id": "living-convertible", "metadata": {}, "outputs": [ { @@ -63,7 +63,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "middle-shield", + "id": "peaceful-stanley", "metadata": {}, "outputs": [ { @@ -71,8 +71,8 @@ "output_type": "stream", "text": [ " x y\n", - "2 2 two\n", - "3 3 three\n" + "0 2 two\n", + "1 3 three\n" ] } ], @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "genetic-ultimate", + "id": "greenhouse-supervision", "metadata": {}, "outputs": [ { @@ -91,8 +91,8 @@ "output_type": "stream", "text": [ " x y z\n", - "2 2 two 1\n", - "3 3 three 1\n" + "0 2 two 1\n", + "1 3 three 1\n" ] } ], @@ -103,7 +103,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "dress-record", + "id": "pharmaceutical-threshold", "metadata": {}, "outputs": [ { @@ -119,7 +119,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -144,7 +144,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "pressed-parking", + "id": "sufficient-guitar", "metadata": {}, "outputs": [ { @@ -175,10 +175,11 @@ "# for example: klib\n", "import klib\n", "from pipda import register_verb\n", + "from datar.core.contexts import Context\n", "from datar.datasets import iris\n", "from 
datar.dplyr import pull\n", "\n", - "dist_plot = register_verb(func=klib.dist_plot)\n", + "dist_plot = register_verb(context=Context.EVAL)(klib.dist_plot)\n", "iris >> pull(f.Sepal_Length) >> dist_plot()" ] } diff --git a/examples/recode.ipynb b/docs/notebooks/recode.ipynb similarity index 79% rename from examples/recode.ipynb rename to docs/notebooks/recode.ipynb index e39874ab..d44a06bf 100644 --- a/examples/recode.ipynb +++ b/docs/notebooks/recode.ipynb @@ -3,25 +3,49 @@ { "cell_type": "code", "execution_count": 1, - "id": "naval-mitchell", + "id": "identified-twelve", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recode a vector, replacing elements in it\n", + "\n", + " Args:\n", + " series: A vector to modify\n", + " *args, **kwargs: replacements\n", + " _default: If supplied, all values not otherwise matched will be\n", + " given this value. If not supplied and if the replacements are\n", + " the same type as the original values in series, unmatched values\n", + " are not changed. If not supplied and if the replacements are\n", + " not compatible, unmatched values are replaced with NA.\n", + " _missing: If supplied, any missing values in .x will be replaced\n", + " by this value.\n", + "\n", + " Returns:\n", + " The vector with values replaced\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/recode.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "print(recode.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "blessed-ottawa", + "id": "appreciated-aerospace", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Apple', 'Apple', 'b', 'Apple', 'c', 'c', 'b', 'c', 'c', 'Apple']" + "['b', 'c', 'Apple', 'c', 'c', 'b', 'b', 'Apple', 'c', 'c']" ] }, "execution_count": 2, @@ -37,13 +61,13 @@ { "cell_type": "code", "execution_count": 3, - "id": "exempt-religion", + "id": "crucial-version", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Apple', 'Apple', 'Banana', 'Apple', 'c', 'c', 'Banana', 'c', 'c', 'Apple']" + "['Banana', 'c', 'Apple', 'c', 'c', 'Banana', 'Banana', 'Apple', 'c', 'c']" ] }, "execution_count": 3, @@ -58,13 +82,13 @@ { "cell_type": "code", "execution_count": 4, - "id": "sonic-sampling", + "id": "latest-render", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Apple', 'Apple', 'Banana', 'Apple', nan, nan, 'Banana', nan, nan, 'Apple']" + "['Banana', nan, 'Apple', nan, nan, 'Banana', 'Banana', 'Apple', nan, nan]" ] }, "execution_count": 4, @@ -79,22 +103,22 @@ { "cell_type": "code", "execution_count": 5, - "id": "inappropriate-potter", + "id": "renewable-worry", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['apple',\n", - " 'apple',\n", - " 'banana',\n", + "['banana',\n", + " 'carrot',\n", " 'apple',\n", " 'carrot',\n", " 'carrot',\n", " 'banana',\n", + " 'banana',\n", + " 'apple',\n", " 'carrot',\n", - " 'carrot',\n", - " 'apple']" + " 'carrot']" ] }, "execution_count": 5, @@ -110,7 +134,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "possible-physiology", + "id": "unlike-pathology", "metadata": {}, "outputs": [ { @@ -132,7 +156,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "established-table", + "id": "authentic-reynolds", "metadata": {}, "outputs": [ { @@ -153,7 +177,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "bridal-estate", + "id": "tutorial-diagnosis", "metadata": {}, "outputs": [ { @@ -174,7 +198,7 @@ { "cell_type": "code", "execution_count": 9, 
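
A minimal sketch of recode() and its _default argument, assuming both behave as the docstring above describes; char_vec is spelled out to match the sampled vector whose recoded outputs appear in the cells above:

    # Sketch only: assumes recode()/_default behave as documented above.
    char_vec = ["b", "c", "a", "c", "c", "b", "b", "a", "c", "c"]

    recode(char_vec, a="Apple")              # only "a" is replaced
    recode(char_vec, a="Apple", b="Banana")  # unmatched "c" kept as-is
    recode(char_vec, a="Apple", b="Banana",
           _default="carrot")                # unmatched values -> "carrot"
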
- "id": "superior-franchise", + "id": "complete-maker", "metadata": {}, "outputs": [ { @@ -195,7 +219,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "sunset-output", + "id": "authorized-wallet", "metadata": {}, "outputs": [ { @@ -216,7 +240,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "medical-hurricane", + "id": "finite-bahamas", "metadata": {}, "outputs": [ { @@ -237,7 +261,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "engaged-investor", + "id": "laughing-permit", "metadata": {}, "outputs": [ { @@ -260,7 +284,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "elementary-venezuela", + "id": "grateful-intervention", "metadata": {}, "outputs": [ { @@ -282,7 +306,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "small-willow", + "id": "consistent-number", "metadata": {}, "outputs": [ { @@ -304,7 +328,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "figured-martin", + "id": "collective-mixture", "metadata": {}, "outputs": [ { @@ -326,7 +350,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "further-track", + "id": "measured-bronze", "metadata": {}, "outputs": [ { @@ -348,7 +372,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "injured-seeker", + "id": "turkish-ballet", "metadata": {}, "outputs": [ { @@ -370,7 +394,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "environmental-iceland", + "id": "particular-stationery", "metadata": {}, "outputs": [ { @@ -392,14 +416,14 @@ { "cell_type": "code", "execution_count": 19, - "id": "convinced-requirement", + "id": "comic-authentication", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['apple', 'apple', 'banana', 'apple', 'carrot', 'carrot', 'banana', 'carrot', 'carrot', 'apple']\n", - "Categories (3, object): ['apple', 'banana', 'carrot']" + "['banana', 'carrot', 'apple', 'carrot', 'carrot', 'banana', 'banana', 'apple', 'carrot', 'carrot']\n", + "Categories (3, object): ['banana', 'carrot', 'apple']" ] }, "execution_count": 19, diff --git a/examples/relocate.ipynb b/docs/notebooks/relocate.ipynb similarity index 94% rename from examples/relocate.ipynb rename to docs/notebooks/relocate.ipynb index 160bffeb..32aa4915 100644 --- a/examples/relocate.ipynb +++ b/docs/notebooks/relocate.ipynb @@ -4,11 +4,36 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "change column positions\n", + "\n", + " Args:\n", + " _data: A data frame\n", + " column, *columns: Columns to move\n", + " _before, _after: Destination. Supplying neither will move columns to\n", + " the left-hand side; specifying both is an error.\n", + "\n", + " Returns:\n", + " An object of the same type as .data. 
The output has the following\n", + " properties:\n", + " - Rows are not affected.\n", + " - The same columns appear in the output, but (usually) in a\n", + " different place.\n", + " - Data frame attributes are preserved.\n", + " - Groups are not affected\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/relocate.html\n", "from pandas import DataFrame\n", - "from datar.all import *" + "from datar.all import *\n", + "print(relocate.__doc__)" ] }, { diff --git a/examples/rename.ipynb b/docs/notebooks/rename.ipynb similarity index 96% rename from examples/rename.ipynb rename to docs/notebooks/rename.ipynb index c5d28557..99aa7eeb 100644 --- a/examples/rename.ipynb +++ b/docs/notebooks/rename.ipynb @@ -3,20 +3,39 @@ { "cell_type": "code", "execution_count": 1, - "id": "above-circle", + "id": "separate-barrel", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changes the names of individual variables using new_name = old_name\n", + " syntax\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " **kwargs: The new_name = old_name pairs\n", + "\n", + " Returns:\n", + " The dataframe with new names\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/rename.html\n", "\n", "from datar.datasets import iris\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(rename.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "handled-logic", + "id": "convinced-roommate", "metadata": {}, "outputs": [ { @@ -170,7 +189,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "national-dress", + "id": "urban-sellers", "metadata": {}, "outputs": [ { @@ -324,7 +343,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "electric-blank", + "id": "mathematical-dictionary", "metadata": {}, "outputs": [ { @@ -478,7 +497,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "universal-ending", + "id": "answering-fundamentals", "metadata": {}, "outputs": [ { diff --git a/examples/replace_na.ipynb b/docs/notebooks/replace_na.ipynb similarity index 82% rename from examples/replace_na.ipynb rename to docs/notebooks/replace_na.ipynb index 8cf574c4..ba6cfbae 100644 --- a/examples/replace_na.ipynb +++ b/docs/notebooks/replace_na.ipynb @@ -3,19 +3,43 @@ { "cell_type": "code", "execution_count": 1, - "id": "invalid-guidance", + "id": "overhead-stone", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Replace NA with a value\n", + "\n", + " This function can be also used not as a verb. As a function called as\n", + " an argument in a verb, _data is passed implicitly. Then one could\n", + " pass series_or_replace as the data to replace.\n", + "\n", + " Args:\n", + " _data: The data piped in\n", + " series_or_replace: When called as argument of a verb, this is the\n", + " data to replace. 
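
A minimal sketch of relocate() and rename(), whose docstrings are printed above, assuming both behave as documented (df is a hypothetical frame; the lower-case target name in rename() is illustrative):

    # Sketch only: assumes relocate()/rename() behave as documented above.
    from datar.datasets import iris
    from datar.all import *

    df = tibble(a=1, b=1, c=1, d="a", e="a")
    df >> relocate(f.e)              # move e to the left-hand side
    df >> relocate(f.a, _after=f.c)  # move a to sit after c

    iris >> rename(petal_length=f.Petal_Length)  # new_name = old_name
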
Otherwise this is the replacement.\n", + " replace: The value to replace with\n", + "\n", + " Returns:\n", + " Corresponding data with NAs replaced\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/replace_na.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(replace_na.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "together-newfoundland", + "id": "moved-rugby", "metadata": {}, "outputs": [ { @@ -83,7 +107,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "colonial-lincoln", + "id": "smoking-municipality", "metadata": {}, "outputs": [ { @@ -150,7 +174,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "framed-release", + "id": "recorded-syria", "metadata": {}, "outputs": [ { @@ -174,7 +198,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "indoor-tiger", + "id": "fatty-accused", "metadata": {}, "outputs": [ { @@ -198,7 +222,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "heavy-denial", + "id": "capital-clarity", "metadata": {}, "outputs": [], "source": [ diff --git a/examples/rowwise.ipynb b/docs/notebooks/rowwise.ipynb similarity index 57% rename from examples/rowwise.ipynb rename to docs/notebooks/rowwise.ipynb index 19eb09cd..b5b391cd 100644 --- a/examples/rowwise.ipynb +++ b/docs/notebooks/rowwise.ipynb @@ -4,12 +4,32 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compute on a data frame a row-at-a-time\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " *columns: Variables to be preserved when calling summarise().\n", + " This is typically a set of variables whose combination\n", + " uniquely identify each row.\n", + "\n", + " Returns:\n", + " A row-wise data frame\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/rowwise.html\n", "\n", "from datar.datasets import iris\n", - "from datar.all import *\n" + "from datar.all import *\n", + "\n", + "print(rowwise.__doc__)" ] }, { @@ -47,45 +67,45 @@ " \n", " \n", " 0\n", - " 0.654043\n", - " 0.164943\n", - " 0.573693\n", - " 0.464226\n", + " 0.888912\n", + " 0.317292\n", + " 0.826542\n", + " 0.677582\n", " \n", " \n", " 1\n", - " 0.689119\n", - " 0.590817\n", - " 0.355858\n", - " 0.545265\n", + " 0.683771\n", + " 0.228575\n", + " 0.344580\n", + " 0.418975\n", " \n", " \n", " 2\n", - " 0.996729\n", - " 0.455562\n", - " 0.556582\n", - " 0.669624\n", + " 0.641990\n", + " 0.626599\n", + " 0.560846\n", + " 0.609811\n", " \n", " \n", " 3\n", - " 0.560269\n", - " 0.139997\n", - " 0.541517\n", - " 0.413927\n", + " 0.260390\n", + " 0.351010\n", + " 0.143023\n", + " 0.251475\n", " \n", " \n", " 4\n", - " 0.349431\n", - " 0.301431\n", - " 0.317143\n", - " 0.322668\n", + " 0.012773\n", + " 0.541104\n", + " 0.523489\n", + " 0.359122\n", " \n", " \n", " 5\n", - " 0.333165\n", - " 0.549818\n", - " 0.618501\n", - " 0.500495\n", + " 0.598446\n", + " 0.525574\n", + " 0.431912\n", + " 0.518644\n", " \n", " \n", "\n", @@ -93,12 +113,12 @@ ], "text/plain": [ " x y z m\n", - "0 0.654043 0.164943 0.573693 0.464226\n", - "1 0.689119 0.590817 0.355858 0.545265\n", - "2 0.996729 0.455562 0.556582 0.669624\n", - "3 0.560269 0.139997 0.541517 0.413927\n", - "4 0.349431 0.301431 0.317143 0.322668\n", - "5 0.333165 0.549818 0.618501 0.500495" + "0 0.888912 0.317292 0.826542 0.677582\n", + "1 0.683771 0.228575 0.344580 0.418975\n", + "2 0.641990 0.626599 0.560846 0.609811\n", + "3 0.260390 0.351010 
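
A minimal sketch of replace_na() in both roles its docstring above describes, as a verb and as a plain function inside a verb; the dict-of-replacements form is an assumption modeled on tidyr's list() form, not confirmed by this diff:

    # Sketch only: the dict form below is an assumption.
    from datar.all import *  # NA as exported by datar.all

    df = tibble(x=[1, 2, NA], y=["a", NA, "b"])
    df >> replace_na({"x": 0, "y": "unknown"})  # as a verb over columns
    df >> mutate(x=replace_na(f.x, 0))          # as a function inside a verb
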
0.143023 0.251475\n", + "4 0.012773 0.541104 0.523489 0.359122\n", + "5 0.598446 0.525574 0.431912 0.518644" ] }, "execution_count": 2, @@ -108,8 +128,8 @@ ], "source": [ "df = tibble(x=runif(6), y=runif(6), z=runif(6))\n", - "# func applied to rowwise df has to be called with c_across\n", - "df >> rowwise() >> mutate(m=c_across([f.x, f.y, f.z], mean)) " + "\n", + "df >> rowwise() >> mutate(m=mean(c_across([f.x, f.y, f.z]))) " ] }, { @@ -147,45 +167,45 @@ " \n", " \n", " 0\n", - " 0.654043\n", - " 0.164943\n", - " 0.573693\n", - " 0.464226\n", + " 0.888912\n", + " 0.317292\n", + " 0.826542\n", + " 0.677582\n", " \n", " \n", " 1\n", - " 0.689119\n", - " 0.590817\n", - " 0.355858\n", - " 0.545265\n", + " 0.683771\n", + " 0.228575\n", + " 0.344580\n", + " 0.418975\n", " \n", " \n", " 2\n", - " 0.996729\n", - " 0.455562\n", - " 0.556582\n", - " 0.669624\n", + " 0.641990\n", + " 0.626599\n", + " 0.560846\n", + " 0.609811\n", " \n", " \n", " 3\n", - " 0.560269\n", - " 0.139997\n", - " 0.541517\n", - " 0.413927\n", + " 0.260390\n", + " 0.351010\n", + " 0.143023\n", + " 0.251475\n", " \n", " \n", " 4\n", - " 0.349431\n", - " 0.301431\n", - " 0.317143\n", - " 0.322668\n", + " 0.012773\n", + " 0.541104\n", + " 0.523489\n", + " 0.359122\n", " \n", " \n", " 5\n", - " 0.333165\n", - " 0.549818\n", - " 0.618501\n", - " 0.500495\n", + " 0.598446\n", + " 0.525574\n", + " 0.431912\n", + " 0.518644\n", " \n", " \n", "\n", @@ -193,12 +213,12 @@ ], "text/plain": [ " x y z m\n", - "0 0.654043 0.164943 0.573693 0.464226\n", - "1 0.689119 0.590817 0.355858 0.545265\n", - "2 0.996729 0.455562 0.556582 0.669624\n", - "3 0.560269 0.139997 0.541517 0.413927\n", - "4 0.349431 0.301431 0.317143 0.322668\n", - "5 0.333165 0.549818 0.618501 0.500495" + "0 0.888912 0.317292 0.826542 0.677582\n", + "1 0.683771 0.228575 0.344580 0.418975\n", + "2 0.641990 0.626599 0.560846 0.609811\n", + "3 0.260390 0.351010 0.143023 0.251475\n", + "4 0.012773 0.541104 0.523489 0.359122\n", + "5 0.598446 0.525574 0.431912 0.518644" ] }, "execution_count": 3, @@ -207,7 +227,7 @@ } ], "source": [ - "df >> rowwise() >> mutate(m=c_across(f[f.x:f.z], mean)) " + "df >> rowwise() >> mutate(m=mean(c_across(f[f.x:f.z])))" ] }, { @@ -245,45 +265,45 @@ " \n", " \n", " 0\n", - " 0.654043\n", - " 0.164943\n", - " 0.573693\n", - " 0.164943\n", + " 0.888912\n", + " 0.317292\n", + " 0.826542\n", + " 0.317292\n", " \n", " \n", " 1\n", - " 0.689119\n", - " 0.590817\n", - " 0.355858\n", - " 0.355858\n", + " 0.683771\n", + " 0.228575\n", + " 0.344580\n", + " 0.228575\n", " \n", " \n", " 2\n", - " 0.996729\n", - " 0.455562\n", - " 0.556582\n", - " 0.455562\n", + " 0.641990\n", + " 0.626599\n", + " 0.560846\n", + " 0.560846\n", " \n", " \n", " 3\n", - " 0.560269\n", - " 0.139997\n", - " 0.541517\n", - " 0.139997\n", + " 0.260390\n", + " 0.351010\n", + " 0.143023\n", + " 0.143023\n", " \n", " \n", " 4\n", - " 0.349431\n", - " 0.301431\n", - " 0.317143\n", - " 0.301431\n", + " 0.012773\n", + " 0.541104\n", + " 0.523489\n", + " 0.012773\n", " \n", " \n", " 5\n", - " 0.333165\n", - " 0.549818\n", - " 0.618501\n", - " 0.333165\n", + " 0.598446\n", + " 0.525574\n", + " 0.431912\n", + " 0.431912\n", " \n", " \n", "\n", @@ -291,12 +311,12 @@ ], "text/plain": [ " x y z m\n", - "0 0.654043 0.164943 0.573693 0.164943\n", - "1 0.689119 0.590817 0.355858 0.355858\n", - "2 0.996729 0.455562 0.556582 0.455562\n", - "3 0.560269 0.139997 0.541517 0.139997\n", - "4 0.349431 0.301431 0.317143 0.301431\n", - "5 0.333165 0.549818 0.618501 0.333165" + "0 0.888912 0.317292 0.826542 
0.317292\n", + "1 0.683771 0.228575 0.344580 0.228575\n", + "2 0.641990 0.626599 0.560846 0.560846\n", + "3 0.260390 0.351010 0.143023 0.143023\n", + "4 0.012773 0.541104 0.523489 0.012773\n", + "5 0.598446 0.525574 0.431912 0.431912" ] }, "execution_count": 4, @@ -305,7 +325,7 @@ } ], "source": [ - "df >> rowwise() >> mutate(m=c_across([f.x, f.y, f.z], min)) " + "df >> rowwise() >> mutate(m=min(c_across([f.x, f.y, f.z]))) " ] }, { @@ -343,45 +363,45 @@ " \n", " \n", " 0\n", - " 0.654043\n", - " 0.164943\n", - " 0.573693\n", - " 0.164943\n", + " 0.888912\n", + " 0.317292\n", + " 0.826542\n", + " 0.317292\n", " \n", " \n", " 1\n", - " 0.689119\n", - " 0.590817\n", - " 0.355858\n", - " 0.355858\n", + " 0.683771\n", + " 0.228575\n", + " 0.344580\n", + " 0.228575\n", " \n", " \n", " 2\n", - " 0.996729\n", - " 0.455562\n", - " 0.556582\n", - " 0.455562\n", + " 0.641990\n", + " 0.626599\n", + " 0.560846\n", + " 0.560846\n", " \n", " \n", " 3\n", - " 0.560269\n", - " 0.139997\n", - " 0.541517\n", - " 0.139997\n", + " 0.260390\n", + " 0.351010\n", + " 0.143023\n", + " 0.143023\n", " \n", " \n", " 4\n", - " 0.349431\n", - " 0.301431\n", - " 0.317143\n", - " 0.301431\n", + " 0.012773\n", + " 0.541104\n", + " 0.523489\n", + " 0.012773\n", " \n", " \n", " 5\n", - " 0.333165\n", - " 0.549818\n", - " 0.618501\n", - " 0.333165\n", + " 0.598446\n", + " 0.525574\n", + " 0.431912\n", + " 0.431912\n", " \n", " \n", "\n", @@ -389,12 +409,12 @@ ], "text/plain": [ " x y z m\n", - "0 0.654043 0.164943 0.573693 0.164943\n", - "1 0.689119 0.590817 0.355858 0.355858\n", - "2 0.996729 0.455562 0.556582 0.455562\n", - "3 0.560269 0.139997 0.541517 0.139997\n", - "4 0.349431 0.301431 0.317143 0.301431\n", - "5 0.333165 0.549818 0.618501 0.333165" + "0 0.888912 0.317292 0.826542 0.317292\n", + "1 0.683771 0.228575 0.344580 0.228575\n", + "2 0.641990 0.626599 0.560846 0.560846\n", + "3 0.260390 0.351010 0.143023 0.143023\n", + "4 0.012773 0.541104 0.523489 0.012773\n", + "5 0.598446 0.525574 0.431912 0.431912" ] }, "execution_count": 5, @@ -403,7 +423,7 @@ } ], "source": [ - "df >> rowwise() >> mutate(m=c_across(f[f.x:f.z], min)) " + "df >> rowwise() >> mutate(m=min(c_across(f[f.x:f.z]))) " ] }, { @@ -441,45 +461,45 @@ " \n", " \n", " 0\n", - " 0.654043\n", - " 0.164943\n", - " 0.573693\n", - " 0.164943\n", + " 0.888912\n", + " 0.317292\n", + " 0.826542\n", + " 0.317292\n", " \n", " \n", " 1\n", - " 0.689119\n", - " 0.590817\n", - " 0.355858\n", - " 0.355858\n", + " 0.683771\n", + " 0.228575\n", + " 0.344580\n", + " 0.228575\n", " \n", " \n", " 2\n", - " 0.996729\n", - " 0.455562\n", - " 0.556582\n", - " 0.455562\n", + " 0.641990\n", + " 0.626599\n", + " 0.560846\n", + " 0.560846\n", " \n", " \n", " 3\n", - " 0.560269\n", - " 0.139997\n", - " 0.541517\n", - " 0.139997\n", + " 0.260390\n", + " 0.351010\n", + " 0.143023\n", + " 0.143023\n", " \n", " \n", " 4\n", - " 0.349431\n", - " 0.301431\n", - " 0.317143\n", - " 0.301431\n", + " 0.012773\n", + " 0.541104\n", + " 0.523489\n", + " 0.012773\n", " \n", " \n", " 5\n", - " 0.333165\n", - " 0.549818\n", - " 0.618501\n", - " 0.333165\n", + " 0.598446\n", + " 0.525574\n", + " 0.431912\n", + " 0.431912\n", " \n", " \n", "\n", @@ -487,12 +507,12 @@ ], "text/plain": [ " x y z m\n", - "0 0.654043 0.164943 0.573693 0.164943\n", - "1 0.689119 0.590817 0.355858 0.355858\n", - "2 0.996729 0.455562 0.556582 0.455562\n", - "3 0.560269 0.139997 0.541517 0.139997\n", - "4 0.349431 0.301431 0.317143 0.301431\n", - "5 0.333165 0.549818 0.618501 0.333165" + "0 0.888912 0.317292 0.826542 
0.317292\n", + "1 0.683771 0.228575 0.344580 0.228575\n", + "2 0.641990 0.626599 0.560846 0.560846\n", + "3 0.260390 0.351010 0.143023 0.143023\n", + "4 0.012773 0.541104 0.523489 0.012773\n", + "5 0.598446 0.525574 0.431912 0.431912" ] }, "execution_count": 6, @@ -506,9 +526,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-06 23:42:56][datar][ INFO] `summarise()` has grouped output by ['sim']. You can override using the `_groups` argument.\n", + "[2021-04-06 23:42:56][datar][ INFO] # [DataFrameGroupBy] Groups: ['sim'] (3)\n" + ] + }, { "data": { "text/html": [ @@ -538,17 +566,17 @@ " \n", " 0\n", " 1\n", - " [[[-0.333892507795308]]]\n", + " [0.9996658184501107]\n", " \n", " \n", - " 1\n", + " 0\n", " 2\n", - " [[[-0.3209192213552989, -0.3552761221338856, -...\n", + " [4.435399346334073, 1.7221520588327917]\n", " \n", " \n", - " 2\n", + " 0\n", " 3\n", - " [[[0.11869637609568977, -1.76631756384409]], [...\n", + " [0.9233950658821094, 1.3154177668657419, 0.873...\n", " \n", " \n", "\n", @@ -556,12 +584,12 @@ ], "text/plain": [ " sim z\n", - "0 1 [[[-0.333892507795308]]]\n", - "1 2 [[[-0.3209192213552989, -0.3552761221338856, -...\n", - "2 3 [[[0.11869637609568977, -1.76631756384409]], [..." + "0 1 [0.9996658184501107]\n", + "0 2 [4.435399346334073, 1.7221520588327917]\n", + "0 3 [0.9233950658821094, 1.3154177668657419, 0.873..." ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -574,7 +602,7 @@ " sd=[1, 4, 2]\n", ")\n", "\n", - "params >> rowwise(f.sim) >> summarise(z=c_across([f.n, f.mean, f.sd], rnorm))" + "params >> rowwise(f.sim) >> summarise(z=[rnorm(f.n, f.mean, f.sd)]) >> display()" ] }, { diff --git a/examples/select.ipynb b/docs/notebooks/select.ipynb similarity index 98% rename from examples/select.ipynb rename to docs/notebooks/select.ipynb index a678a5b0..7c52f8ca 100644 --- a/examples/select.ipynb +++ b/docs/notebooks/select.ipynb @@ -4,11 +4,29 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Select (and optionally rename) variables in a data frame\n", + "\n", + " Args:\n", + " *columns: The columns to select\n", + " **renamings: The columns to rename and select in new => old column way.\n", + "\n", + " Returns:\n", + " The dataframe with select columns\n", + " \n" + ] + } + ], "source": [ "# https://dplyr.tidyverse.org/reference/select.html\n", "from datar.datasets import starwars, iris\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(select.__doc__)" ] }, { diff --git a/examples/separate.ipynb b/docs/notebooks/separate.ipynb similarity index 89% rename from examples/separate.ipynb rename to docs/notebooks/separate.ipynb index 819e0426..429e96cd 100644 --- a/examples/separate.ipynb +++ b/docs/notebooks/separate.ipynb @@ -3,19 +3,55 @@ { "cell_type": "code", "execution_count": 1, - "id": "regulation-piano", + "id": "overhead-assets", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Given either a regular expression or a vector of character positions,\n", + " turns a single character column into multiple columns.\n", + "\n", + " Args:\n", + " _data: The dataframe\n", + " col: Column name or position.\n", + " into: Names of new variables to create as character vector.\n", + " Use None to 
omit the variable in the output.\n", + " sep: Separator between columns.\n", + " TODO: support index split (sep is an integer)\n", + " remove: If TRUE, remove input column from output data frame.\n", + " convert: The universal type for the extracted columns or a dict for\n", + " individual ones\n", + " extra: If sep is a character vector, this controls what happens when\n", + " there are too many pieces. There are three valid options:\n", + " - \"warn\" (the default): emit a warning and drop extra values.\n", + " - \"drop\": drop any extra values without a warning.\n", + " - \"merge\": only splits at most length(into) times\n", + " fill: If sep is a character vector, this controls what happens when\n", + " there are not enough pieces. There are three valid options:\n", + " - \"warn\" (the default): emit a warning and fill from the right\n", + " - \"right\": fill with missing values on the right\n", + " - \"left\": fill with missing values on the left\n", + "\n", + " Returns:\n", + " Dataframe with separated columns.\n", + " \n" + ] + } + ], "source": [ "# https://tidyr.tidyverse.org/reference/separate.html\n", "\n", - "from datar.all import *" + "from datar.all import *\n", + "\n", + "print(separate.__doc__)" ] }, { "cell_type": "code", "execution_count": 2, - "id": "comparative-consultation", + "id": "standard-syracuse", "metadata": {}, "outputs": [ { @@ -89,7 +125,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "technological-animal", + "id": "excessive-wings", "metadata": {}, "outputs": [ { @@ -157,15 +193,15 @@ { "cell_type": "code", "execution_count": 4, - "id": "likely-harbor", + "id": "intimate-sheriff", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:44:11][datar][WARNING] Expected 2 pieces. Additional pieces discarded in 1 rows [2].\n", - "[2021-03-13 00:44:11][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [0].\n" + "[2021-04-03 00:50:55][datar][WARNING] Expected 2 pieces. Additional pieces discarded in 1 rows [2].\n", + "[2021-04-03 00:50:55][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [0].\n" ] }, { @@ -239,7 +275,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "amino-daniel", + "id": "moved-burst", "metadata": {}, "outputs": [ { @@ -312,7 +348,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "informal-shoulder", + "id": "million-absorption", "metadata": {}, "outputs": [ { @@ -385,14 +421,14 @@ { "cell_type": "code", "execution_count": 7, - "id": "seventh-prospect", + "id": "distant-stations", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:44:11][datar][WARNING] Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [0, 1].\n" + "[2021-04-03 00:50:55][datar][WARNING] Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [0, 1].\n" ] }, { @@ -470,7 +506,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "experienced-conflict", + "id": "monetary-portuguese", "metadata": {}, "outputs": [ { @@ -532,7 +568,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "front-firmware", + "id": "desirable-generic", "metadata": {}, "outputs": [ { @@ -606,14 +642,14 @@ { "cell_type": "code", "execution_count": 10, - "id": "included-migration", + "id": "impressive-bulgarian", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:44:12][datar][WARNING] Expected 2 pieces. 
Missing pieces filled with `NA` in 1 rows [3].\n" + "[2021-04-03 00:50:55][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].\n" ] }, { @@ -693,14 +729,14 @@ { "cell_type": "code", "execution_count": 11, - "id": "mighty-instruction", + "id": "advisory-exemption", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-13 00:44:12][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].\n" + "[2021-04-03 00:50:55][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].\n" ] }, { @@ -779,7 +815,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "backed-upgrade", + "id": "returning-morrison", "metadata": {}, "outputs": [ { @@ -800,7 +836,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "assumed-stroke", + "id": "chemical-harvard", "metadata": {}, "outputs": [], "source": [ @@ -814,7 +850,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "streaming-thought", + "id": "incoming-result", "metadata": {}, "outputs": [ { @@ -906,7 +942,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "medium-colon", + "id": "registered-personal", "metadata": {}, "outputs": [ { diff --git a/docs/notebooks/setops.ipynb b/docs/notebooks/setops.ipynb new file mode 100644 index 00000000..52b87c84 --- /dev/null +++ b/docs/notebooks/setops.ipynb @@ -0,0 +1,1361 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "august-firmware", + "metadata": {}, + "outputs": [], + "source": [ + "# https://dplyr.tidyverse.org/reference/setops.html\n", + "\n", + "from datar.datasets import mtcars\n", + "from datar.all import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "smaller-peripheral", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0V1V2V3V4V5V6V7V8V9V10
0mpgcyldisphpdratwtqsecvsamgearcarb
\n", + "
" + ], + "text/plain": [ + " V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10\n", + "0 mpg cyl disp hp drat wt qsec vs am gear carb" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first = mtcars >> slice(f[:20])\n", + "second = mtcars >> slice(f[9:33])\n", + "\n", + "intersect(first, second) # or first >> intersect(second)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "second-lighter", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0V1V2V3V4V5V6V7V8V9V10
0mpgcyldisphpdratwtqsecvsamgearcarb
\n", + "
" + ], + "text/plain": [ + " V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10\n", + "0 mpg cyl disp hp drat wt qsec vs am gear carb" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "union(first, second)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "adjacent-debate", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0V1V2V3V4V5V6V7V8V9V10
0mpgcyldisphpdratwtqsecvsamgearcarb
\n", + "
" + ], + "text/plain": [ + " V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10\n", + "0 mpg cyl disp hp drat wt qsec vs am gear carb" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "setdiff(first, second)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "legal-spyware", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0V1V2V3V4V5V6V7V8V9V10
11mpgcyldisphpdratwtqsecvsamgearcarb
\n", + "
" + ], + "text/plain": [ + " V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10\n", + "11 mpg cyl disp hp drat wt qsec vs am gear carb" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "setdiff(second, first)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "gorgeous-scholar", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcyldisphpdratwtqsecvsamgearcarb
021.06160.01103.902.62016.460144
121.06160.01103.902.87517.020144
222.84108.0933.852.32018.611141
321.46258.01103.083.21519.441031
418.78360.01753.153.44017.020032
518.16225.01052.763.46020.221031
614.38360.02453.213.57015.840034
724.44146.7623.693.19020.001042
822.84140.8953.923.15022.901042
919.26167.61233.923.44018.301044
1017.86167.61233.923.44018.901044
1116.48275.81803.074.07017.400033
1217.38275.81803.073.73017.600033
1315.28275.81803.073.78018.000033
1410.48472.02052.935.25017.980034
1510.48460.02153.005.42417.820034
1614.78440.02303.235.34517.420034
1732.4478.7664.082.20019.471141
1830.4475.7524.931.61518.521142
1933.9471.1654.221.83519.901141
2019.26167.61233.923.44018.301044
2117.86167.61233.923.44018.901044
2216.48275.81803.074.07017.400033
2317.38275.81803.073.73017.600033
2415.28275.81803.073.78018.000033
2510.48472.02052.935.25017.980034
2610.48460.02153.005.42417.820034
2714.78440.02303.235.34517.420034
2832.4478.7664.082.20019.471141
2930.4475.7524.931.61518.521142
3033.9471.1654.221.83519.901141
3121.54120.1973.702.46520.011031
3215.58318.01502.763.52016.870032
3315.28304.01503.153.43517.300032
3413.38350.02453.733.84015.410034
3519.28400.01753.083.84517.050032
3627.3479.0664.081.93518.901141
3726.04120.3914.432.14016.700152
3830.4495.11133.771.51316.901152
3915.88351.02644.223.17014.500154
4019.76145.01753.622.77015.500156
4115.08301.03353.543.57014.600158
4221.44121.01094.112.78018.601142
\n", + "
" + ], + "text/plain": [ + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", + "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "20 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "21 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "22 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "23 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + "24 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "25 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "26 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", + "27 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "28 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "29 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "30 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "31 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", + "32 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", + "33 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", + "34 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "35 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", + "36 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", + "37 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", + "38 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", + "39 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", + "40 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", + "41 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", + "42 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "union_all(first, second)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "regular-allowance", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "setequal(mtcars, mtcars >> slice(f[::-1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "consecutive-russell", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0
0column
\n", + "
" + ], + "text/plain": [ + " V0\n", + "0 column" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = tibble(column=c(*range(11), 10))\n", + "b = tibble(column=c(*range(6), 5))\n", + "intersect(a, b)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "prospective-ribbon", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0
0column
\n", + "
" + ], + "text/plain": [ + " V0\n", + "0 column" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "union(a, b)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "powered-satisfaction", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V0
7column
\n", + "
" + ], + "text/plain": [ + " V0\n", + "7 column" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "setdiff(a, b)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "consecutive-atlantic", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column
00
11
22
33
44
55
66
77
88
99
1010
1110
120
131
142
153
164
175
185
\n", + "
" + ], + "text/plain": [ + " column\n", + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4\n", + "5 5\n", + "6 6\n", + "7 7\n", + "8 8\n", + "9 9\n", + "10 10\n", + "11 10\n", + "12 0\n", + "13 1\n", + "14 2\n", + "15 3\n", + "16 4\n", + "17 5\n", + "18 5" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "union_all(a, b)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/slice.ipynb b/docs/notebooks/slice.ipynb similarity index 90% rename from examples/slice.ipynb rename to docs/notebooks/slice.ipynb index 1a439a59..8808be58 100644 --- a/examples/slice.ipynb +++ b/docs/notebooks/slice.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "younger-transfer", + "id": "liberal-vampire", "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "animated-rating", + "id": "tropical-exclusion", "metadata": {}, "outputs": [ { @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "activated-prayer", + "id": "affected-sense", "metadata": {}, "outputs": [ { @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "virgin-sampling", + "id": "scheduled-tobago", "metadata": {}, "outputs": [ { @@ -236,7 +236,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "valued-vietnam", + "id": "north-battery", "metadata": {}, "outputs": [ { @@ -744,7 +744,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "early-store", + "id": "hired-method", "metadata": {}, "outputs": [ { @@ -1253,7 +1253,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "floppy-ministry", + "id": "synthetic-mobile", "metadata": {}, "outputs": [ { @@ -1761,7 +1761,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "sensitive-tragedy", + "id": "weird-dream", "metadata": {}, "outputs": [ { @@ -1879,7 +1879,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "organizational-disability", + "id": "starting-hardwood", "metadata": {}, "outputs": [ { @@ -1998,7 +1998,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "structural-mattress", + "id": "north-iraqi", "metadata": {}, "outputs": [ { @@ -2138,7 +2138,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "romantic-cardiff", + "id": "radical-sheet", "metadata": {}, "outputs": [ { @@ -2271,7 +2271,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "assigned-grenada", + "id": "about-bachelor", "metadata": {}, "outputs": [ { @@ -2411,7 +2411,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "promising-bishop", + "id": "neural-elimination", "metadata": {}, "outputs": [ { @@ -2544,7 +2544,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "biblical-sending", + "id": "prescription-chick", "metadata": {}, "outputs": [ { @@ -2767,7 +2767,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "conscious-nickel", + "id": "packed-attachment", "metadata": {}, "outputs": [ { @@ -2840,7 +2840,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "variable-floating", + "id": "humanitarian-interval", "metadata": {}, "outputs": [ { @@ -2913,7 +2913,7 @@ { "cell_type": 
"code", "execution_count": 17, - "id": "magnetic-exercise", + "id": "covered-array", "metadata": {}, "outputs": [ { @@ -2952,32 +2952,32 @@ " \n", " \n", " \n", - " Mazda RX4 Wag\n", - " 21.0\n", - " 6\n", - " 160.0\n", - " 110\n", - " 3.90\n", - " 2.875\n", - " 17.02\n", - " 0\n", - " 1\n", - " 4\n", + " Lotus Europa\n", + " 30.4\n", " 4\n", + " 95.1\n", + " 113\n", + " 3.77\n", + " 1.513\n", + " 16.90\n", + " 1\n", + " 1\n", + " 5\n", + " 2\n", " \n", " \n", - " Valiant\n", - " 18.1\n", - " 6\n", - " 225.0\n", - " 105\n", - " 2.76\n", - " 3.460\n", - " 20.22\n", + " Merc 230\n", + " 22.8\n", + " 4\n", + " 140.8\n", + " 95\n", + " 3.92\n", + " 3.150\n", + " 22.90\n", " 1\n", " 0\n", - " 3\n", - " 1\n", + " 4\n", + " 2\n", " \n", " \n", " Maserati Bora\n", @@ -2994,32 +2994,32 @@ " 8\n", " \n", " \n", - " Volvo 142E\n", - " 21.4\n", - " 4\n", - " 121.0\n", - " 109\n", - " 4.11\n", - " 2.780\n", - " 18.60\n", - " 1\n", - " 1\n", + " Camaro Z28\n", + " 13.3\n", + " 8\n", + " 350.0\n", + " 245\n", + " 3.73\n", + " 3.840\n", + " 15.41\n", + " 0\n", + " 0\n", + " 3\n", " 4\n", - " 2\n", " \n", " \n", - " Lotus Europa\n", - " 30.4\n", - " 4\n", - " 95.1\n", - " 113\n", - " 3.77\n", - " 1.513\n", - " 16.90\n", - " 1\n", + " Mazda RX4\n", + " 21.0\n", + " 6\n", + " 160.0\n", + " 110\n", + " 3.90\n", + " 2.620\n", + " 16.46\n", + " 0\n", " 1\n", - " 5\n", - " 2\n", + " 4\n", + " 4\n", " \n", " \n", "\n", @@ -3027,11 +3027,11 @@ ], "text/plain": [ " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", + "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2\n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2" + "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4" ] }, "execution_count": 17, @@ -3046,7 +3046,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "organized-representation", + "id": "classified-grammar", "metadata": {}, "outputs": [ { @@ -3179,7 +3179,7 @@ { "cell_type": "code", "execution_count": 19, - "id": "paperback-arthur", + "id": "breathing-persian", "metadata": {}, "outputs": [ { @@ -3312,7 +3312,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "warming-blind", + "id": "knowing-legislature", "metadata": {}, "outputs": [ { @@ -3351,74 +3351,74 @@ " \n", " \n", " \n", - " Hornet Sportabout\n", - " 18.7\n", + " Merc 450SE\n", + " 16.4\n", " 8\n", - " 360.0\n", - " 175\n", - " 3.15\n", - " 3.440\n", - " 17.02\n", + " 275.8\n", + " 180\n", + " 3.07\n", + " 4.070\n", + " 17.40\n", " 0\n", " 0\n", " 3\n", - " 2\n", + " 3\n", " \n", " \n", - " Camaro Z28\n", - " 13.3\n", + " Chrysler Imperial\n", + " 14.7\n", " 8\n", - " 350.0\n", - " 245\n", - " 3.73\n", - " 3.840\n", - " 15.41\n", + " 440.0\n", + " 230\n", + " 3.23\n", + " 5.345\n", + " 17.42\n", " 0\n", " 0\n", " 3\n", " 4\n", " \n", " \n", - " Valiant\n", - " 18.1\n", + " Merc 280C\n", + " 17.8\n", " 6\n", - " 225.0\n", - " 105\n", - " 2.76\n", - " 3.460\n", - " 20.22\n", + " 167.6\n", + " 123\n", + " 3.92\n", + " 3.440\n", + " 18.90\n", " 1\n", " 0\n", - " 3\n", - " 1\n", + " 4\n", + " 4\n", " \n", " \n", - " Toyota Corona\n", - " 21.5\n", + " Volvo 142E\n", + " 21.4\n", " 4\n", - " 120.1\n", - " 97\n", - " 3.70\n", - " 2.465\n", - " 20.01\n", + " 121.0\n", + 
" 109\n", + " 4.11\n", + " 2.780\n", + " 18.60\n", " 1\n", - " 0\n", - " 3\n", " 1\n", + " 4\n", + " 2\n", " \n", " \n", - " Merc 240D\n", - " 24.4\n", - " 4\n", - " 146.7\n", - " 62\n", - " 3.69\n", - " 3.190\n", - " 20.00\n", - " 1\n", + " Merc 450SLC\n", + " 15.2\n", + " 8\n", + " 275.8\n", + " 180\n", + " 3.07\n", + " 3.780\n", + " 18.00\n", " 0\n", - " 4\n", - " 2\n", + " 0\n", + " 3\n", + " 3\n", " \n", " \n", "\n", @@ -3426,18 +3426,18 @@ ], "text/plain": [ " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", + "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", + "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", + "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", + "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", + "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", "\n", " carb \n", - "Hornet Sportabout 2 \n", - "Camaro Z28 4 \n", - "Valiant 1 \n", - "Toyota Corona 1 \n", - "Merc 240D 2 " + "Merc 450SE 3 \n", + "Chrysler Imperial 4 \n", + "Merc 280C 4 \n", + "Volvo 142E 2 \n", + "Merc 450SLC 3 " ] }, "execution_count": 20, @@ -3452,7 +3452,7 @@ { "cell_type": "code", "execution_count": 21, - "id": "quiet-kernel", + "id": "tutorial-nightlife", "metadata": {}, "outputs": [ { @@ -3484,27 +3484,27 @@ " \n", " 0\n", " a\n", - " 0.955210\n", + " 0.468066\n", " \n", " \n", " 1\n", " b\n", - " 0.991565\n", + " 0.250791\n", " \n", " \n", " 2\n", " b\n", - " 0.801570\n", + " 0.999076\n", " \n", " \n", " 3\n", " c\n", - " 0.433407\n", + " 0.876503\n", " \n", " \n", " 4\n", " c\n", - " 0.914839\n", + " 0.908480\n", " \n", " \n", "\n", @@ -3512,11 +3512,11 @@ ], "text/plain": [ " group x\n", - "0 a 0.955210\n", - "1 b 0.991565\n", - "2 b 0.801570\n", - "3 c 0.433407\n", - "4 c 0.914839" + "0 a 0.468066\n", + "1 b 0.250791\n", + "2 b 0.999076\n", + "3 c 0.876503\n", + "4 c 0.908480" ] }, "execution_count": 21, @@ -3535,7 +3535,7 @@ { "cell_type": "code", "execution_count": 22, - "id": "discrete-george", + "id": "recognized-chart", "metadata": {}, "outputs": [ { @@ -3566,18 +3566,18 @@ " \n", " \n", " 0\n", - " a\n", - " 0.955210\n", + " b\n", + " 0.250791\n", " \n", " \n", " 1\n", - " b\n", - " 0.991565\n", + " c\n", + " 0.876503\n", " \n", " \n", - " 3\n", + " 2\n", " c\n", - " 0.433407\n", + " 0.908480\n", " \n", " \n", "\n", @@ -3585,9 +3585,9 @@ ], "text/plain": [ " group x\n", - "0 a 0.955210\n", - "1 b 0.991565\n", - "3 c 0.433407" + "0 b 0.250791\n", + "1 c 0.876503\n", + "2 c 0.908480" ] }, "execution_count": 22, @@ -3602,7 +3602,7 @@ { "cell_type": "code", "execution_count": 23, - "id": "portable-phenomenon", + "id": "mexican-indianapolis", "metadata": {}, "outputs": [ { @@ -3641,7 +3641,7 @@ " \n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -3659,8 +3659,8 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "Mazda RX4 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4" + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4" ] }, "execution_count": 23, @@ -3676,7 +3676,7 @@ { "cell_type": "code", "execution_count": 24, - "id": "suspended-alias", + "id": "pointed-gauge", "metadata": {}, "outputs": [ { @@ -3715,7 +3715,7 @@ " \n", " \n", " \n", - " Volvo 142E\n", + " 
0\n", " 21.4\n", " 4\n", " 121.0\n", @@ -3733,8 +3733,8 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2" + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2" ] }, "execution_count": 24, @@ -3749,7 +3749,7 @@ { "cell_type": "code", "execution_count": 25, - "id": "athletic-culture", + "id": "generic-stylus", "metadata": {}, "outputs": [ { @@ -3788,7 +3788,7 @@ " \n", " \n", " \n", - " Mazda RX4\n", + " 0\n", " 21.0\n", " 6\n", " 160.0\n", @@ -3802,7 +3802,7 @@ " 4\n", " \n", " \n", - " Mazda RX4 Wag\n", + " 1\n", " 21.0\n", " 6\n", " 160.0\n", @@ -3816,7 +3816,7 @@ " 4\n", " \n", " \n", - " Datsun 710\n", + " 2\n", " 22.8\n", " 4\n", " 108.0\n", @@ -3830,7 +3830,7 @@ " 1\n", " \n", " \n", - " Hornet 4 Drive\n", + " 3\n", " 21.4\n", " 6\n", " 258.0\n", @@ -3844,7 +3844,7 @@ " 1\n", " \n", " \n", - " Hornet Sportabout\n", + " 4\n", " 18.7\n", " 8\n", " 360.0\n", @@ -3858,7 +3858,7 @@ " 2\n", " \n", " \n", - " Valiant\n", + " 5\n", " 18.1\n", " 6\n", " 225.0\n", @@ -3872,7 +3872,7 @@ " 1\n", " \n", " \n", - " Duster 360\n", + " 6\n", " 14.3\n", " 8\n", " 360.0\n", @@ -3886,7 +3886,7 @@ " 4\n", " \n", " \n", - " Merc 240D\n", + " 7\n", " 24.4\n", " 4\n", " 146.7\n", @@ -3900,7 +3900,7 @@ " 2\n", " \n", " \n", - " Merc 230\n", + " 8\n", " 22.8\n", " 4\n", " 140.8\n", @@ -3914,7 +3914,7 @@ " 2\n", " \n", " \n", - " Merc 280\n", + " 9\n", " 19.2\n", " 6\n", " 167.6\n", @@ -3928,7 +3928,7 @@ " 4\n", " \n", " \n", - " Merc 280C\n", + " 10\n", " 17.8\n", " 6\n", " 167.6\n", @@ -3942,7 +3942,7 @@ " 4\n", " \n", " \n", - " Merc 450SE\n", + " 11\n", " 16.4\n", " 8\n", " 275.8\n", @@ -3956,7 +3956,7 @@ " 3\n", " \n", " \n", - " Merc 450SL\n", + " 12\n", " 17.3\n", " 8\n", " 275.8\n", @@ -3970,7 +3970,7 @@ " 3\n", " \n", " \n", - " Merc 450SLC\n", + " 13\n", " 15.2\n", " 8\n", " 275.8\n", @@ -3984,7 +3984,7 @@ " 3\n", " \n", " \n", - " Cadillac Fleetwood\n", + " 14\n", " 10.4\n", " 8\n", " 472.0\n", @@ -3998,7 +3998,7 @@ " 4\n", " \n", " \n", - " Lincoln Continental\n", + " 15\n", " 10.4\n", " 8\n", " 460.0\n", @@ -4012,7 +4012,7 @@ " 4\n", " \n", " \n", - " Chrysler Imperial\n", + " 16\n", " 14.7\n", " 8\n", " 440.0\n", @@ -4026,7 +4026,7 @@ " 4\n", " \n", " \n", - " Fiat 128\n", + " 17\n", " 32.4\n", " 4\n", " 78.7\n", @@ -4040,7 +4040,7 @@ " 1\n", " \n", " \n", - " Honda Civic\n", + " 18\n", " 30.4\n", " 4\n", " 75.7\n", @@ -4054,7 +4054,7 @@ " 2\n", " \n", " \n", - " Toyota Corolla\n", + " 19\n", " 33.9\n", " 4\n", " 71.1\n", @@ -4068,7 +4068,7 @@ " 1\n", " \n", " \n", - " Toyota Corona\n", + " 20\n", " 21.5\n", " 4\n", " 120.1\n", @@ -4082,7 +4082,7 @@ " 1\n", " \n", " \n", - " Dodge Challenger\n", + " 21\n", " 15.5\n", " 8\n", " 318.0\n", @@ -4096,7 +4096,7 @@ " 2\n", " \n", " \n", - " AMC Javelin\n", + " 22\n", " 15.2\n", " 8\n", " 304.0\n", @@ -4110,7 +4110,7 @@ " 2\n", " \n", " \n", - " Camaro Z28\n", + " 23\n", " 13.3\n", " 8\n", " 350.0\n", @@ -4124,7 +4124,7 @@ " 4\n", " \n", " \n", - " Pontiac Firebird\n", + " 24\n", " 19.2\n", " 8\n", " 400.0\n", @@ -4138,7 +4138,7 @@ " 2\n", " \n", " \n", - " Fiat X1-9\n", + " 25\n", " 27.3\n", " 4\n", " 79.0\n", @@ -4152,7 +4152,7 @@ " 1\n", " \n", " \n", - " Porsche 914-2\n", + " 26\n", " 26.0\n", " 4\n", " 120.3\n", @@ -4166,7 +4166,7 @@ " 2\n", " \n", " \n", - " Lotus Europa\n", + " 27\n", " 30.4\n", " 4\n", " 95.1\n", @@ -4180,7 +4180,7 @@ " 2\n", " \n", " \n", - " Ford Pantera L\n", + " 28\n", " 15.8\n", " 8\n", " 351.0\n", 
@@ -4194,7 +4194,7 @@ " 4\n", " \n", " \n", - " Ferrari Dino\n", + " 29\n", " 19.7\n", " 6\n", " 145.0\n", @@ -4208,7 +4208,7 @@ " 6\n", " \n", " \n", - " Maserati Bora\n", + " 30\n", " 15.0\n", " 8\n", " 301.0\n", @@ -4222,7 +4222,7 @@ " 8\n", " \n", " \n", - " Volvo 142E\n", + " 31\n", " 21.4\n", " 4\n", " 121.0\n", @@ -4240,73 +4240,39 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "\n", - " carb \n", - "Mazda RX4 4 \n", - "Mazda RX4 Wag 4 \n", - "Datsun 710 1 \n", - "Hornet 4 Drive 1 \n", - "Hornet Sportabout 2 \n", - "Valiant 1 \n", - "Duster 360 4 \n", - "Merc 240D 2 \n", - "Merc 230 2 \n", - "Merc 280 4 \n", - "Merc 280C 4 \n", - "Merc 450SE 3 \n", - "Merc 450SL 3 \n", - "Merc 450SLC 3 \n", - "Cadillac Fleetwood 4 \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Fiat 128 1 \n", - "Honda Civic 2 \n", - "Toyota Corolla 1 \n", - "Toyota Corona 1 \n", - "Dodge Challenger 2 \n", - "AMC Javelin 2 \n", - "Camaro Z28 4 \n", - "Pontiac Firebird 2 \n", - "Fiat X1-9 1 \n", - "Porsche 914-2 2 \n", - "Lotus Europa 2 \n", - "Ford Pantera L 4 \n", - "Ferrari Dino 6 \n", - "Maserati Bora 8 \n", - "Volvo 142E 2 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", + "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", + "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", + "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", + "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", + "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 
4 2\n", + "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", + "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", + "21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", + "22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", + "23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", + "25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", + "26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", + "27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", + "28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", + "29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", + "30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", + "31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2" ] }, "execution_count": 25, @@ -4321,7 +4287,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "artistic-chosen", + "id": "alpine-notion", "metadata": {}, "outputs": [ { @@ -4360,7 +4326,7 @@ " \n", " \n", " \n", - " Valiant\n", + " 0\n", " 18.1\n", " 6\n", " 225.0\n", @@ -4374,7 +4340,7 @@ " 1\n", " \n", " \n", - " Duster 360\n", + " 1\n", " 14.3\n", " 8\n", " 360.0\n", @@ -4388,7 +4354,7 @@ " 4\n", " \n", " \n", - " Merc 240D\n", + " 2\n", " 24.4\n", " 4\n", " 146.7\n", @@ -4402,7 +4368,7 @@ " 2\n", " \n", " \n", - " Merc 230\n", + " 3\n", " 22.8\n", " 4\n", " 140.8\n", @@ -4416,7 +4382,7 @@ " 2\n", " \n", " \n", - " Merc 280\n", + " 4\n", " 19.2\n", " 6\n", " 167.6\n", @@ -4430,7 +4396,7 @@ " 4\n", " \n", " \n", - " Merc 280C\n", + " 5\n", " 17.8\n", " 6\n", " 167.6\n", @@ -4444,7 +4410,7 @@ " 4\n", " \n", " \n", - " Merc 450SE\n", + " 6\n", " 16.4\n", " 8\n", " 275.8\n", @@ -4458,7 +4424,7 @@ " 3\n", " \n", " \n", - " Merc 450SL\n", + " 7\n", " 17.3\n", " 8\n", " 275.8\n", @@ -4472,7 +4438,7 @@ " 3\n", " \n", " \n", - " Merc 450SLC\n", + " 8\n", " 15.2\n", " 8\n", " 275.8\n", @@ -4486,7 +4452,7 @@ " 3\n", " \n", " \n", - " Cadillac Fleetwood\n", + " 9\n", " 10.4\n", " 8\n", " 472.0\n", @@ -4500,7 +4466,7 @@ " 4\n", " \n", " \n", - " Lincoln Continental\n", + " 10\n", " 10.4\n", " 8\n", " 460.0\n", @@ -4514,7 +4480,7 @@ " 4\n", " \n", " \n", - " Chrysler Imperial\n", + " 11\n", " 14.7\n", " 8\n", " 440.0\n", @@ -4528,7 +4494,7 @@ " 4\n", " \n", " \n", - " Fiat 128\n", + " 12\n", " 32.4\n", " 4\n", " 78.7\n", @@ -4542,7 +4508,7 @@ " 1\n", " \n", " \n", - " Honda Civic\n", + " 13\n", " 30.4\n", " 4\n", " 75.7\n", @@ -4556,7 +4522,7 @@ " 2\n", " \n", " \n", - " Toyota Corolla\n", + " 14\n", " 33.9\n", " 4\n", " 71.1\n", @@ -4570,7 +4536,7 @@ " 1\n", " \n", " \n", - " Toyota Corona\n", + " 15\n", " 21.5\n", " 4\n", " 120.1\n", @@ -4584,7 +4550,7 @@ " 1\n", " \n", " \n", - " Dodge Challenger\n", + " 16\n", " 15.5\n", " 8\n", " 318.0\n", @@ -4598,7 +4564,7 @@ " 2\n", " \n", " \n", - " AMC Javelin\n", + " 17\n", " 15.2\n", " 8\n", " 304.0\n", @@ -4612,7 +4578,7 @@ " 2\n", " \n", " \n", - " Camaro Z28\n", + " 18\n", " 13.3\n", " 8\n", " 350.0\n", @@ -4626,7 +4592,7 @@ " 4\n", " \n", " \n", - " Pontiac Firebird\n", + " 19\n", " 19.2\n", " 8\n", " 400.0\n", @@ -4640,7 
+4606,7 @@ " 2\n", " \n", " \n", - " Fiat X1-9\n", + " 20\n", " 27.3\n", " 4\n", " 79.0\n", @@ -4654,7 +4620,7 @@ " 1\n", " \n", " \n", - " Porsche 914-2\n", + " 21\n", " 26.0\n", " 4\n", " 120.3\n", @@ -4668,7 +4634,7 @@ " 2\n", " \n", " \n", - " Lotus Europa\n", + " 22\n", " 30.4\n", " 4\n", " 95.1\n", @@ -4682,7 +4648,7 @@ " 2\n", " \n", " \n", - " Ford Pantera L\n", + " 23\n", " 15.8\n", " 8\n", " 351.0\n", @@ -4696,7 +4662,7 @@ " 4\n", " \n", " \n", - " Ferrari Dino\n", + " 24\n", " 19.7\n", " 6\n", " 145.0\n", @@ -4710,7 +4676,7 @@ " 6\n", " \n", " \n", - " Maserati Bora\n", + " 25\n", " 15.0\n", " 8\n", " 301.0\n", @@ -4724,7 +4690,7 @@ " 8\n", " \n", " \n", - " Volvo 142E\n", + " 26\n", " 21.4\n", " 4\n", " 121.0\n", @@ -4742,63 +4708,34 @@ "" ], "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "\n", - " carb \n", - "Valiant 1 \n", - "Duster 360 4 \n", - "Merc 240D 2 \n", - "Merc 230 2 \n", - "Merc 280 4 \n", - "Merc 280C 4 \n", - "Merc 450SE 3 \n", - "Merc 450SL 3 \n", - "Merc 450SLC 3 \n", - "Cadillac Fleetwood 4 \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Fiat 128 1 \n", - "Honda Civic 2 \n", - "Toyota Corolla 1 \n", - "Toyota Corona 1 \n", - "Dodge Challenger 2 \n", - "AMC Javelin 2 \n", - "Camaro Z28 4 \n", - "Pontiac Firebird 2 \n", - "Fiat X1-9 1 \n", - "Porsche 914-2 2 \n", - "Lotus Europa 2 \n", - "Ford Pantera L 4 \n", - "Ferrari Dino 6 \n", - "Maserati Bora 8 \n", - "Volvo 142E 2 " + " mpg cyl disp hp drat wt qsec vs am gear carb\n", + "0 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", + "1 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", + "2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", + "3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", + "4 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", + "5 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", + "6 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", + "7 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", + 
"8 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", + "9 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", + "10 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", + "11 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", + "12 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", + "13 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", + "14 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", + "15 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", + "16 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", + "17 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", + "18 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", + "19 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", + "20 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", + "21 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", + "22 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", + "23 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", + "24 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", + "25 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", + "26 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2" ] }, "execution_count": 26, @@ -4812,8 +4749,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "inner-usage", + "execution_count": 27, + "id": "tropical-panama", "metadata": {}, "outputs": [ { @@ -4845,32 +4782,32 @@ " \n", " 0\n", " a\n", - " 0.030618\n", + " 0.173758\n", " \n", " \n", " 1\n", " a\n", - " 0.051196\n", + " 0.249367\n", " \n", " \n", " 2\n", " b\n", - " 0.034103\n", + " 0.047988\n", " \n", " \n", " 3\n", " b\n", - " 0.048701\n", + " 0.059135\n", " \n", " \n", " 4\n", " c\n", - " 0.003454\n", + " 0.011330\n", " \n", " \n", " 5\n", " c\n", - " 0.012797\n", + " 0.021035\n", " \n", " \n", "\n", @@ -4878,15 +4815,15 @@ ], "text/plain": [ " group x\n", - "0 a 0.030618\n", - "1 a 0.051196\n", - "2 b 0.034103\n", - "3 b 0.048701\n", - "4 c 0.003454\n", - "5 c 0.012797" + "0 a 0.173758\n", + "1 a 0.249367\n", + "2 b 0.047988\n", + "3 b 0.059135\n", + "4 c 0.011330\n", + "5 c 0.021035" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -4901,8 +4838,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "photographic-methodology", + "execution_count": 28, + "id": "irish-doctor", "metadata": {}, "outputs": [ { @@ -4934,32 +4871,32 @@ " \n", " 0\n", " a\n", - " 0.863627\n", + " 0.815562\n", " \n", " \n", " 1\n", " a\n", - " 0.819979\n", + " 0.697991\n", " \n", " \n", " 2\n", " b\n", - " 0.991147\n", + " 0.957060\n", " \n", " \n", " 3\n", " b\n", - " 0.932743\n", + " 0.857645\n", " \n", " \n", " 4\n", " c\n", - " 0.975293\n", + " 0.972660\n", " \n", " \n", " 5\n", " c\n", - " 0.948667\n", + " 0.878532\n", " \n", " \n", "\n", @@ -4967,15 +4904,15 @@ ], "text/plain": [ " group x\n", - "0 a 0.863627\n", - "1 a 0.819979\n", - "2 b 0.991147\n", - "3 b 0.932743\n", - "4 c 0.975293\n", - "5 c 0.948667" + "0 a 0.815562\n", + "1 a 0.697991\n", + "2 b 0.957060\n", + "3 b 0.857645\n", + "4 c 0.972660\n", + "5 c 0.878532" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -4986,8 +4923,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "coordinated-sleeping", + "execution_count": 29, + "id": "intense-arbitration", "metadata": {}, "outputs": [ { @@ -5019,32 +4956,32 @@ " \n", " 0\n", " a\n", - " 0.203219\n", + " 0.815562\n", " \n", " \n", " 1\n", " a\n", - " 0.292320\n", + " 0.258590\n", " \n", " \n", " 2\n", " b\n", - " 0.561585\n", + " 0.495959\n", " \n", " \n", " 3\n", " b\n", - " 0.638679\n", + " 0.957060\n", " \n", " \n", " 4\n", " c\n", - " 0.864610\n", + " 0.011330\n", " \n", " \n", " 5\n", " 
c\n", - " 0.369900\n", + " 0.217908\n", " \n", " \n", "\n", @@ -5052,15 +4989,15 @@ ], "text/plain": [ " group x\n", - "0 a 0.203219\n", - "1 a 0.292320\n", - "2 b 0.561585\n", - "3 b 0.638679\n", - "4 c 0.864610\n", - "5 c 0.369900" + "0 a 0.815562\n", + "1 a 0.258590\n", + "2 b 0.495959\n", + "3 b 0.957060\n", + "4 c 0.011330\n", + "5 c 0.217908" ] }, - "execution_count": 33, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/summarise.ipynb b/docs/notebooks/summarise.ipynb similarity index 85% rename from examples/summarise.ipynb rename to docs/notebooks/summarise.ipynb index 4eef2428..91f25394 100644 --- a/examples/summarise.ipynb +++ b/docs/notebooks/summarise.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "european-observer", + "id": "respective-female", "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "similar-school", + "id": "prospective-documentary", "metadata": {}, "outputs": [ { @@ -71,14 +71,15 @@ { "cell_type": "code", "execution_count": 3, - "id": "gorgeous-cleaners", + "id": "independent-xerox", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 18:31:33][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-03 00:52:56][datar][ INFO] `summarise()` regrouping output by ['cyl']. You can override using the `.groups` argument.\n", + "[2021-04-03 00:52:56][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" ] }, { @@ -110,17 +111,17 @@ " \n", " \n", " 0\n", - " 6\n", - " 183.314286\n", - " 7\n", - " \n", - " \n", - " 1\n", " 4\n", " 105.136364\n", " 11\n", " \n", " \n", + " 1\n", + " 6\n", + " 183.314286\n", + " 7\n", + " \n", + " \n", " 2\n", " 8\n", " 353.100000\n", @@ -132,8 +133,8 @@ ], "text/plain": [ " cyl mean n\n", - "0 6 183.314286 7\n", - "1 4 105.136364 11\n", + "0 4 105.136364 11\n", + "1 6 183.314286 7\n", "2 8 353.100000 14" ] }, @@ -145,15 +146,23 @@ "source": [ "mtcars >> \\\n", " group_by(f.cyl) >> \\\n", - " summarise(mean=mean(f.disp), n=n())" + " summarise(mean=mean(f.disp), n=n()) >> \\\n", + " display()" ] }, { "cell_type": "code", "execution_count": 4, - "id": "breeding-procurement", + "id": "subject-cambodia", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-03 00:52:56][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" + ] + }, { "data": { "text/html": [ @@ -183,17 +192,17 @@ " \n", " \n", " 0\n", - " 6\n", - " 183.314286\n", - " 7\n", - " \n", - " \n", - " 1\n", " 4\n", " 105.136364\n", " 11\n", " \n", " \n", + " 1\n", + " 6\n", + " 183.314286\n", + " 7\n", + " \n", + " \n", " 2\n", " 8\n", " 353.100000\n", @@ -205,8 +214,8 @@ ], "text/plain": [ " cyl mean n\n", - "0 6 183.314286 7\n", - "1 4 105.136364 11\n", + "0 4 105.136364 11\n", + "1 6 183.314286 7\n", "2 8 353.100000 14" ] }, @@ -219,15 +228,24 @@ "summarise.inform = False\n", "mtcars >> \\\n", " group_by(f.cyl) >> \\\n", - " summarise(mean=mean(f.disp), n=n())" + " summarise(mean=mean(f.disp), n=n()) >> \\\n", + " display()" ] }, { "cell_type": "code", "execution_count": 5, - "id": "inner-playlist", + "id": "preliminary-monthly", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-03 00:52:56][datar][ INFO] `summarise()` regrouping output by ['cyl']. 
You can override using the `.groups` argument.\n", + "[2021-04-03 00:52:56][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" + ] + }, { "data": { "text/html": [ @@ -257,26 +275,26 @@ " \n", " \n", " 0\n", - " 6\n", - " 160.00\n", + " 4\n", + " 78.85\n", " 0.25\n", " \n", " \n", " 1\n", - " 6\n", - " 196.30\n", + " 4\n", + " 120.65\n", " 0.75\n", " \n", " \n", " 2\n", - " 4\n", - " 78.85\n", + " 6\n", + " 160.00\n", " 0.25\n", " \n", " \n", " 3\n", - " 4\n", - " 120.65\n", + " 6\n", + " 196.30\n", " 0.75\n", " \n", " \n", @@ -297,10 +315,10 @@ ], "text/plain": [ " cyl qs prob\n", - "0 6 160.00 0.25\n", - "1 6 196.30 0.75\n", - "2 4 78.85 0.25\n", - "3 4 120.65 0.75\n", + "0 4 78.85 0.25\n", + "1 4 120.65 0.75\n", + "2 6 160.00 0.25\n", + "3 6 196.30 0.75\n", "4 8 301.75 0.25\n", "5 8 390.00 0.75" ] @@ -315,13 +333,14 @@ "\n", "mtcars >> \\\n", " group_by(f.cyl) >> \\\n", - " summarise(qs=quantile(f.disp, c(0.25, 0.75)), prob=c(0.25, 0.75)) " + " summarise(qs=quantile(f.disp, c(0.25, 0.75)), prob=c(0.25, 0.75)) >> \\\n", + " display()" ] }, { "cell_type": "code", "execution_count": 6, - "id": "optical-change", + "id": "metric-jimmy", "metadata": {}, "outputs": [ { @@ -353,26 +372,26 @@ " \n", " \n", " 0\n", - " 6\n", - " 160.00\n", + " 4\n", + " 78.85\n", " 0.25\n", " \n", " \n", " 1\n", - " 6\n", - " 196.30\n", + " 4\n", + " 120.65\n", " 0.75\n", " \n", " \n", " 2\n", - " 4\n", - " 78.85\n", + " 6\n", + " 160.00\n", " 0.25\n", " \n", " \n", " 3\n", - " 4\n", - " 120.65\n", + " 6\n", + " 196.30\n", " 0.75\n", " \n", " \n", @@ -393,10 +412,10 @@ ], "text/plain": [ " cyl qs prob\n", - "0 6 160.00 0.25\n", - "1 6 196.30 0.75\n", - "2 4 78.85 0.25\n", - "3 4 120.65 0.75\n", + "0 4 78.85 0.25\n", + "1 4 120.65 0.75\n", + "2 6 160.00 0.25\n", + "3 6 196.30 0.75\n", "4 8 301.75 0.25\n", "5 8 390.00 0.75" ] @@ -407,15 +426,23 @@ } ], "source": [ - "_ >> showme()" + "_ >> display()" ] }, { "cell_type": "code", "execution_count": 7, - "id": "starting-works", + "id": "emotional-negative", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-03 00:52:56][datar][ INFO] `summarise()` regrouping output by ['cyl']. 
You can override using the `.groups` argument.\n", + "[2021-04-03 00:52:56][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" + ] + }, { "data": { "text/html": [ @@ -445,26 +472,26 @@ " \n", " \n", " 0\n", - " 6\n", - " 160.00\n", + " 4\n", + " 78.85\n", " 0.25\n", " \n", " \n", " 1\n", - " 6\n", - " 196.30\n", + " 4\n", + " 120.65\n", " 0.75\n", " \n", " \n", " 2\n", - " 4\n", - " 78.85\n", + " 6\n", + " 160.00\n", " 0.25\n", " \n", " \n", " 3\n", - " 4\n", - " 120.65\n", + " 6\n", + " 196.30\n", " 0.75\n", " \n", " \n", @@ -485,10 +512,10 @@ ], "text/plain": [ " cyl x probs\n", - "0 6 160.00 0.25\n", - "1 6 196.30 0.75\n", - "2 4 78.85 0.25\n", - "3 4 120.65 0.75\n", + "0 4 78.85 0.25\n", + "1 4 120.65 0.75\n", + "2 6 160.00 0.25\n", + "3 6 196.30 0.75\n", "4 8 301.75 0.25\n", "5 8 390.00 0.75" ] @@ -508,26 +535,27 @@ "\n", "mtcars >> \\\n", " group_by(f.cyl) >> \\\n", - " summarise(my_quantile(f.disp, c(0.25, 0.75)))\n" + " summarise(my_quantile(f.disp, c(0.25, 0.75))) >> \\\n", + " display()\n" ] }, { "cell_type": "code", "execution_count": 8, - "id": "approved-interest", + "id": "efficient-fellowship", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 18:31:34][datar][ INFO] `summarise()` regrouping output by ['cyl'] (override with `_groups` argument)\n" + "[2021-04-03 00:52:56][datar][ INFO] `summarise()` regrouping output by ['cyl', 'vs']. You can override using the `.groups` argument.\n" ] }, { "data": { "text/plain": [ - "['cyl']" + "['cyl', 'vs']" ] }, "execution_count": 8, @@ -545,14 +573,15 @@ { "cell_type": "code", "execution_count": 9, - "id": "fourth-reservoir", + "id": "regulation-guidance", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2021-03-12 18:31:34][datar][ INFO] `summarise()` ungrouping output (override with `_groups` argument)\n" + "[2021-04-03 00:52:56][datar][ INFO] `summarise()` regrouping output by ['cyl']. 
You can override using the `.groups` argument.\n", + "[2021-04-03 00:52:56][datar][ INFO] # [DataFrameGroupBy] Groups: ['cyl'] (3)\n" ] }, { @@ -584,17 +613,17 @@ " \n", " \n", " 0\n", - " 6\n", - " 183.314286\n", - " 41.562460\n", - " \n", - " \n", - " 1\n", " 4\n", " 105.136364\n", " 26.871594\n", " \n", " \n", + " 1\n", + " 6\n", + " 183.314286\n", + " 41.562460\n", + " \n", + " \n", " 2\n", " 8\n", " 353.100000\n", @@ -606,8 +635,8 @@ ], "text/plain": [ " cyl disp sd\n", - "0 6 183.314286 41.562460\n", - "1 4 105.136364 26.871594\n", + "0 4 105.136364 26.871594\n", + "1 6 183.314286 41.562460\n", "2 8 353.100000 67.771324" ] }, @@ -620,13 +649,14 @@ "# Unlike dplyr's summarise, f.disp can be reused.\n", "mtcars >> \\\n", " group_by(f.cyl) >> \\\n", - " summarise(disp=mean(f.disp), sd=sd(f.disp))" + " summarise(disp=mean(f.disp), sd=sd(f.disp)) >> \\\n", + " display()" ] }, { "cell_type": "code", "execution_count": 10, - "id": "functional-demonstration", + "id": "sized-journalism", "metadata": {}, "outputs": [ { diff --git a/examples/tibble.ipynb b/docs/notebooks/tibble.ipynb similarity index 87% rename from examples/tibble.ipynb rename to docs/notebooks/tibble.ipynb index 94dfbbeb..f35ef2bf 100644 --- a/examples/tibble.ipynb +++ b/docs/notebooks/tibble.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "exempt-locking", + "id": "incident-trustee", "metadata": {}, "outputs": [], "source": [ @@ -18,8 +18,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "stunning-crest", + "execution_count": 2, + "id": "selective-apache", "metadata": {}, "outputs": [ { @@ -86,7 +86,7 @@ "4 4 8" ] }, - "execution_count": 22, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "bizarre-disability", + "id": "equipped-designation", "metadata": {}, "outputs": [ { @@ -184,7 +184,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "welsh-bones", + "id": "incredible-compiler", "metadata": {}, "outputs": [ { @@ -205,7 +205,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "herbal-navigator", + "id": "still-respect", "metadata": {}, "outputs": [ { @@ -236,53 +236,53 @@ " \n", " \n", " 0\n", - " 0.264847\n", - " 0.529694\n", + " 0.907413\n", + " 1.814827\n", " \n", " \n", " 1\n", - " 0.616953\n", - " 1.233906\n", + " 0.531231\n", + " 1.062462\n", " \n", " \n", " 2\n", - " 0.334713\n", - " 0.669425\n", + " 0.271175\n", + " 0.542350\n", " \n", " \n", " 3\n", - " 0.584531\n", - " 1.169061\n", + " 0.006298\n", + " 0.012596\n", " \n", " \n", " 4\n", - " 0.230368\n", - " 0.460736\n", + " 0.242987\n", + " 0.485974\n", " \n", " \n", " 5\n", - " 0.041434\n", - " 0.082868\n", + " 0.314747\n", + " 0.629494\n", " \n", " \n", " 6\n", - " 0.756360\n", - " 1.512721\n", + " 0.328753\n", + " 0.657506\n", " \n", " \n", " 7\n", - " 0.551289\n", - " 1.102578\n", + " 0.693651\n", + " 1.387301\n", " \n", " \n", " 8\n", - " 0.648180\n", - " 1.296360\n", + " 0.315403\n", + " 0.630806\n", " \n", " \n", " 9\n", - " 0.408500\n", - " 0.816999\n", + " 0.809689\n", + " 1.619377\n", " \n", " \n", "\n", @@ -290,16 +290,16 @@ ], "text/plain": [ " x y\n", - "0 0.264847 0.529694\n", - "1 0.616953 1.233906\n", - "2 0.334713 0.669425\n", - "3 0.584531 1.169061\n", - "4 0.230368 0.460736\n", - "5 0.041434 0.082868\n", - "6 0.756360 1.512721\n", - "7 0.551289 1.102578\n", - "8 0.648180 1.296360\n", - "9 0.408500 0.816999" + "0 0.907413 1.814827\n", + "1 0.531231 1.062462\n", + "2 0.271175 0.542350\n", + "3 
0.006298 0.012596\n", + "4 0.242987 0.485974\n", + "5 0.314747 0.629494\n", + "6 0.328753 0.657506\n", + "7 0.693651 1.387301\n", + "8 0.315403 0.630806\n", + "9 0.809689 1.619377" ] }, "execution_count": 5, @@ -314,31 +314,41 @@ { "cell_type": "code", "execution_count": 6, - "id": "experimental-april", + "id": "transsexual-merit", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Column name 'x' duplicated.\n" + "Names must be unique: x\n" ] } ], "source": [ + "from datar.core.exceptions import NameNonUniqueError\n", "x = 1\n", "try:\n", " tibble(x, x)\n", - "except ValueError as err:\n", + "except NameNonUniqueError as err:\n", " print(err)" ] }, { "cell_type": "code", "execution_count": 7, - "id": "accepting-tribune", + "id": "medieval-drove", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-03 01:01:05][datar][WARNING] New names:\n", + "[2021-04-03 01:01:05][datar][WARNING] * 'x' -> 'x__0'\n", + "[2021-04-03 01:01:05][datar][WARNING] * 'x' -> 'x__1'\n" + ] + }, { "data": { "text/html": [ @@ -360,8 +370,8 @@ " \n", " \n", " \n", - " x_1\n", - " x_2\n", + " x__0\n", + " x__1\n", " \n", " \n", " \n", @@ -375,8 +385,8 @@ "" ], "text/plain": [ - " x_1 x_2\n", - "0 1 1" + " x__0 x__1\n", + "0 1 1" ] }, "execution_count": 7, @@ -391,7 +401,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "talented-soviet", + "id": "quality-thursday", "metadata": {}, "outputs": [ { @@ -416,20 +426,22 @@ " \n", " \n", " x\n", + " x\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " x\n", - "0 1" + " x x\n", + "0 1 1" ] }, "execution_count": 8, @@ -444,9 +456,18 @@ { "cell_type": "code", "execution_count": 9, - "id": "pressing-vermont", + "id": "automatic-reservoir", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-03 01:01:05][datar][WARNING] New names:\n", + "[2021-04-03 01:01:05][datar][WARNING] * 'a * 1' -> 'a___1'\n", + "[2021-04-03 01:01:05][datar][WARNING] * 'a * 2' -> 'a___2'\n" + ] + }, { "data": { "text/html": [ @@ -468,8 +489,8 @@ " \n", " \n", " \n", - " a_1\n", - " a_2\n", + " a___1\n", + " a___2\n", " \n", " \n", " \n", @@ -483,8 +504,8 @@ "" ], "text/plain": [ - " a_1 a_2\n", - "0 1 2" + " a___1 a___2\n", + "0 1 2" ] }, "execution_count": 9, @@ -500,7 +521,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "angry-level", + "id": "minimal-victim", "metadata": {}, "outputs": [ { @@ -549,11 +570,16 @@ } ], "source": [ - "def make_unique(name, raw_names, new_names):\n", - " name_count = new_names.count(name)\n", - " if name_count == 0:\n", - " return name\n", - " return f'{name}_{name_count}'\n", + "from typing import Iterable\n", + "def make_unique(names: Iterable[str]):\n", + " new_names = []\n", + " for name in names:\n", + " name_count = new_names.count(name)\n", + " if name_count == 0:\n", + " new_names.append(name)\n", + " else:\n", + " new_names.append(f'{name}_{name_count}')\n", + " return new_names\n", "\n", "tibble(a, a, _name_repair=make_unique)" ] @@ -561,7 +587,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "aquatic-renaissance", + "id": "burning-speaking", "metadata": {}, "outputs": [ { @@ -621,7 +647,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "improving-exemption", + "id": "binding-trailer", "metadata": {}, "outputs": [ { @@ -676,7 +702,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "ranging-terrorism", + "id": "announced-valuation", 
"metadata": {}, "outputs": [ { @@ -758,7 +784,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "excess-climb", + "id": "younger-samba", "metadata": {}, "outputs": [ { @@ -783,12 +809,12 @@ " \n", " \n", " a\n", - " b[0]\n", - " b[1]\n", - " b[2]\n", - " b[3]\n", - " c['x']\n", - " c['y']\n", + " b$0\n", + " b$1\n", + " b$2\n", + " b$3\n", + " c$x$0\n", + " c$x$1\n", " \n", " \n", " \n", @@ -837,11 +863,11 @@ "" ], "text/plain": [ - " a b[0] b[1] b[2] b[3] c['x'] c['y']\n", - "0 0 1 0 0 0 1 0\n", - "1 1 0 1 0 0 0 1\n", - "2 2 0 0 1 0 0 0\n", - "3 3 0 0 0 1 0 0" + " a b$0 b$1 b$2 b$3 c$x$0 c$x$1\n", + "0 0 1 0 0 0 1 0\n", + "1 1 0 1 0 0 0 1\n", + "2 2 0 0 1 0 0 0\n", + "3 3 0 0 0 1 0 0" ] }, "execution_count": 14, @@ -862,7 +888,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "economic-arcade", + "id": "sealed-prisoner", "metadata": {}, "outputs": [ { @@ -883,7 +909,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "military-christopher", + "id": "wrong-genre", "metadata": {}, "outputs": [ { @@ -936,7 +962,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "continent-outline", + "id": "figured-knight", "metadata": {}, "outputs": [ { @@ -992,7 +1018,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "tribal-sessions", + "id": "cooperative-yemen", "metadata": {}, "outputs": [ { @@ -1047,7 +1073,7 @@ { "cell_type": "code", "execution_count": 19, - "id": "naughty-serum", + "id": "interesting-sodium", "metadata": {}, "outputs": [ { @@ -1119,7 +1145,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "forced-adolescent", + "id": "nervous-supplement", "metadata": {}, "outputs": [ { diff --git a/examples/uncount.ipynb b/docs/notebooks/uncount.ipynb similarity index 97% rename from examples/uncount.ipynb rename to docs/notebooks/uncount.ipynb index 92bbac69..9895b025 100644 --- a/examples/uncount.ipynb +++ b/docs/notebooks/uncount.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "coral-hampton", + "id": "maritime-remove", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "outdoor-factory", + "id": "revolutionary-reconstruction", "metadata": {}, "outputs": [ { @@ -79,7 +79,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "minor-friendship", + "id": "thousand-brass", "metadata": {}, "outputs": [ { @@ -146,7 +146,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "constitutional-genetics", + "id": "direct-denver", "metadata": {}, "outputs": [ { @@ -219,7 +219,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "accredited-charger", + "id": "colonial-detector", "metadata": {}, "outputs": [ { @@ -286,7 +286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "associate-tucson", + "id": "strange-complexity", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/unite.ipynb b/docs/notebooks/unite.ipynb similarity index 97% rename from examples/unite.ipynb rename to docs/notebooks/unite.ipynb index b68e803d..4fd983b1 100644 --- a/examples/unite.ipynb +++ b/docs/notebooks/unite.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "affecting-teens", + "id": "informal-tunisia", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "premium-butter", + "id": "virgin-iceland", "metadata": {}, "outputs": [ { @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "distant-whole", + "id": "adverse-basis", "metadata": {}, "outputs": [ { @@ 
-167,7 +167,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "proved-batman", + "id": "offensive-gathering", "metadata": {}, "outputs": [ { @@ -244,8 +244,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "exterior-marker", + "execution_count": 5, + "id": "consolidated-george", "metadata": {}, "outputs": [ { @@ -306,7 +306,7 @@ "3 nan nan" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } diff --git a/examples/with_groups.ipynb b/docs/notebooks/with_groups.ipynb similarity index 74% rename from examples/with_groups.ipynb rename to docs/notebooks/with_groups.ipynb index a91e60d7..c6a31507 100644 --- a/examples/with_groups.ipynb +++ b/docs/notebooks/with_groups.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "insured-factory", + "id": "collective-august", "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "linear-butter", + "id": "assumed-morgan", "metadata": {}, "outputs": [ { @@ -41,6 +41,7 @@ " \n", " g\n", " x\n", + " V0\n", " x_mean\n", " \n", " \n", @@ -48,44 +49,49 @@ " \n", " 0\n", " 1\n", - " 0.254807\n", - " 0.130432\n", + " 0.389578\n", + " g\n", + " 0.377124\n", " \n", " \n", " 1\n", " 1\n", - " 0.006056\n", - " 0.130432\n", + " 0.381427\n", + " g\n", + " 0.377124\n", " \n", " \n", " 2\n", " 2\n", - " 0.878056\n", - " 0.456723\n", + " 0.164776\n", + " g\n", + " 0.377124\n", " \n", " \n", " 3\n", " 2\n", - " 0.035390\n", - " 0.456723\n", + " 0.143215\n", + " g\n", + " 0.377124\n", " \n", " \n", " 4\n", " 3\n", - " 0.823560\n", - " 0.823560\n", + " 0.806624\n", + " g\n", + " 0.377124\n", " \n", " \n", "\n", "" ], "text/plain": [ - " g x x_mean\n", - "0 1 0.254807 0.130432\n", - "1 1 0.006056 0.130432\n", - "2 2 0.878056 0.456723\n", - "3 2 0.035390 0.456723\n", - "4 3 0.823560 0.823560" + " g x V0 x_mean\n", + "0 1 0.389578 g 0.377124\n", + "1 1 0.381427 g 0.377124\n", + "2 2 0.164776 g 0.377124\n", + "3 2 0.143215 g 0.377124\n", + "4 3 0.806624 g 0.377124" ] }, "execution_count": 2, @@ -101,7 +107,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "lonely-korean", + "id": "mechanical-territory", "metadata": {}, "outputs": [ { @@ -127,6 +133,7 @@ " \n", " g\n", " x\n", + " V0\n", " x1\n", " \n", " \n", @@ -134,44 +141,49 @@ " \n", " 0\n", " 1\n", - " 0.254807\n", - " 0.254807\n", + " 0.389578\n", + " g\n", + " 0.389578\n", " \n", " \n", " 1\n", " 1\n", - " 0.006056\n", - " 0.254807\n", + " 0.381427\n", + " g\n", + " 0.389578\n", " \n", " \n", " 2\n", " 2\n", - " 0.878056\n", - " 0.878056\n", + " 0.164776\n", + " g\n", + " 0.389578\n", " \n", " \n", " 3\n", " 2\n", - " 0.035390\n", - " 0.878056\n", + " 0.143215\n", + " g\n", + " 0.389578\n", " \n", " \n", " 4\n", " 3\n", - " 0.823560\n", - " 0.823560\n", + " 0.806624\n", + " g\n", + " 0.389578\n", " \n", " \n", "\n", "" ], "text/plain": [ - " g x x1\n", - "0 1 0.254807 0.254807\n", - "1 1 0.006056 0.254807\n", - "2 2 0.878056 0.878056\n", - "3 2 0.035390 0.878056\n", - "4 3 0.823560 0.823560" + " g x V0 x1\n", + "0 1 0.389578 g 0.389578\n", + "1 1 0.381427 g 0.389578\n", + "2 2 0.164776 g 0.389578\n", + "3 2 0.143215 g 0.389578\n", + "4 3 0.806624 g 0.389578" ] }, "execution_count": 3, @@ -186,7 +198,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "recognized-jaguar", + "id": "super-driver", "metadata": {}, "outputs": [ { @@ -219,32 +231,32 @@ " \n", " 0\n", " 1\n", - " 0.254807\n", - " 0.399574\n", + " 0.389578\n", + " 0.377124\n", " \n", " \n", " 1\n", 
" 1\n", - " 0.006056\n", - " 0.399574\n", + " 0.381427\n", + " 0.377124\n", " \n", " \n", " 2\n", " 2\n", - " 0.878056\n", - " 0.399574\n", + " 0.164776\n", + " 0.377124\n", " \n", " \n", " 3\n", " 2\n", - " 0.035390\n", - " 0.399574\n", + " 0.143215\n", + " 0.377124\n", " \n", " \n", " 4\n", " 3\n", - " 0.823560\n", - " 0.399574\n", + " 0.806624\n", + " 0.377124\n", " \n", " \n", "\n", @@ -252,11 +264,11 @@ ], "text/plain": [ " g x x_mean\n", - "0 1 0.254807 0.399574\n", - "1 1 0.006056 0.399574\n", - "2 2 0.878056 0.399574\n", - "3 2 0.035390 0.399574\n", - "4 3 0.823560 0.399574" + "0 1 0.389578 0.377124\n", + "1 1 0.381427 0.377124\n", + "2 2 0.164776 0.377124\n", + "3 2 0.143215 0.377124\n", + "4 3 0.806624 0.377124" ] }, "execution_count": 4, @@ -273,7 +285,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "printable-protection", + "id": "foreign-aviation", "metadata": {}, "outputs": [ { @@ -303,23 +315,23 @@ " \n", " \n", " 0\n", - " 0.254807\n", + " 0.389578\n", " \n", " \n", " 1\n", - " 0.006056\n", + " 0.381427\n", " \n", " \n", " 2\n", - " 0.878056\n", + " 0.164776\n", " \n", " \n", " 3\n", - " 0.035390\n", + " 0.143215\n", " \n", " \n", " 4\n", - " 0.823560\n", + " 0.806624\n", " \n", " \n", "\n", @@ -327,11 +339,11 @@ ], "text/plain": [ " x\n", - "0 0.254807\n", - "1 0.006056\n", - "2 0.878056\n", - "3 0.035390\n", - "4 0.823560" + "0 0.389578\n", + "1 0.381427\n", + "2 0.164776\n", + "3 0.143215\n", + "4 0.806624" ] }, "execution_count": 5, diff --git a/docs/requirements.txt b/docs/requirements.txt index 79e540df..110de8d8 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ mkdocs-material pymdown-extensions mkapi-fix +mkdocs-jupyter diff --git a/examples/setops.ipynb b/examples/setops.ipynb deleted file mode 100644 index 3efa4e16..00000000 --- a/examples/setops.ipynb +++ /dev/null @@ -1,2401 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "federal-truth", - "metadata": {}, - "outputs": [], - "source": [ - "# https://dplyr.tidyverse.org/reference/setops.html\n", - "\n", - "from datar.datasets import mtcars\n", - "from datar.all import *" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "universal-geography", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "0 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", - "1 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", - "2 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", - "3 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", - "4 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", - "5 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", - "6 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", - "7 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", - "8 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", - "9 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", - "10 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "first = mtcars >> slice(f[:20])\n", - "second = mtcars >> slice(f[9:33])\n", - "\n", - "intersect(first, second) # or first >> intersect(second)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "demanding-money", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", - "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", - "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", - "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", - "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", - "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", - "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", - "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", - "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2\n", - "9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4\n", - "10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4\n", - "11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", - "12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3\n", - "13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", - "14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4\n", - "15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4\n", - "16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", - "17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", - "18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n", - "19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", - "20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", - "21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", - "22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", - "23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", - "24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", - "25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", - "26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", - "27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", - "28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", - "29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", - "30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", - "31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "union(first, second)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "analyzed-thanksgiving", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", - "1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4\n", - "2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1\n", - "3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n", - "4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", - "5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1\n", - "6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4\n", - "7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2\n", - "8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "setdiff(first, second)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "defined-delivery", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear carb\n", - "0 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1\n", - "1 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", - "2 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2\n", - "3 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4\n", - "4 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2\n", - "5 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1\n", - "6 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2\n", - "7 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2\n", - "8 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4\n", - "9 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6\n", - "10 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8\n", - "11 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "setdiff(second, first)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fatal-diary", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " mpg cyl disp hp drat wt qsec vs am gear \\\n", - "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", - "Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", - "Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", - "Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", - "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", - "Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", - "Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", - "Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", - "Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 \n", - "Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 \n", - "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 \n", - "Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 \n", - "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 \n", - "Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 \n", - "Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 \n", - "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 \n", - "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", - "Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", - "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 \n", - "Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 \n", - "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 \n", - "AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 \n", - "Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 \n", - "Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 \n", - "Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 \n", - "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 \n", - "Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 \n", - "Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 \n", - "Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 \n", - "Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 \n", - "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 \n", - "\n", - " carb \n", - "Mazda RX4 4 \n", - "Mazda RX4 Wag 4 \n", - "Datsun 710 1 \n", - "Hornet 4 Drive 1 \n", - "Hornet Sportabout 2 \n", - "Valiant 1 \n", - "Duster 360 4 \n", - "Merc 240D 2 \n", - "Merc 230 2 \n", - "Merc 280 4 \n", - "Merc 280C 4 \n", - "Merc 450SE 3 \n", - "Merc 450SL 3 \n", - "Merc 450SLC 3 \n", - "Cadillac Fleetwood 4 \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Fiat 128 1 \n", - "Honda Civic 2 \n", - "Toyota Corolla 1 \n", - "Merc 280 4 \n", - "Merc 280C 4 \n", - "Merc 450SE 3 \n", - "Merc 450SL 3 \n", - "Merc 450SLC 3 \n", - "Cadillac Fleetwood 4 \n", - "Lincoln Continental 4 \n", - "Chrysler Imperial 4 \n", - "Fiat 128 1 \n", - "Honda Civic 2 \n", - "Toyota Corolla 1 \n", - "Toyota Corona 1 \n", - "Dodge Challenger 2 \n", - "AMC Javelin 2 \n", - "Camaro Z28 4 \n", - "Pontiac Firebird 2 \n", - "Fiat X1-9 1 \n", - 
"Porsche 914-2 2 \n", - "Lotus Europa 2 \n", - "Ford Pantera L 4 \n", - "Ferrari Dino 6 \n", - "Maserati Bora 8 \n", - "Volvo 142E 2 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "union_all(first, second)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "exciting-clothing", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "setequal(mtcars, mtcars >> slice(f[::-1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "statewide-story", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " column\n", - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4\n", - "5 5" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = tibble(column=c(*range(11), 10))\n", - "b = tibble(column=c(*range(6), 5))\n", - "intersect(a, b)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "sealed-comment", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " column\n", - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4\n", - "5 5\n", - "6 6\n", - "7 7\n", - "8 8\n", - "9 9\n", - "10 10" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "union(a, b)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "fifth-knife", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " column\n", - "0 6\n", - "1 7\n", - "2 8\n", - "3 9\n", - "4 10" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "setdiff(a, b)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "powered-drive", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " column\n", - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4\n", - "5 5\n", - "6 6\n", - "7 7\n", - "8 8\n", - "9 9\n", - "10 10\n", - "11 10\n", - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4\n", - "5 5\n", - "6 5" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "union_all(a, b)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/mkdocs.yml b/mkdocs.yml index 9d793163..d612ee77 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,9 +17,63 @@ markdown_extensions: plugins: - search # necessary for search to work - mkapi + - mkdocs-jupyter: + execute: True extra_css: - style.css nav: - 'Home': 'index.md' - 'API': 'mkapi/api/datar' + - 'Examples': + 'across': 'notebooks/across.ipynb' + 'arrange': 'notebooks/arrange.ipynb' + 'base': 'notebooks/base.ipynb' + 'between': 'notebooks/between.ipynb' + 'bind': 'notebooks/bind.ipynb' + 'case_when': 'notebooks/case_when.ipynb' + 'coalesce': 'notebooks/coalesce.ipynb' + 'context': 'notebooks/context.ipynb' + 'count': 'notebooks/count.ipynb' + 'cumall': 'notebooks/cumall.ipynb' + 'desc': 'notebooks/desc.ipynb' + 'distinct': 'notebooks/distinct.ipynb' + 'drop_na': 'notebooks/drop_na.ipynb' + 'expand': 'notebooks/expand.ipynb' + 'expand_grid': 'notebooks/expand_grid.ipynb' + 'extract': 'notebooks/extract.ipynb' + 'fill': 'notebooks/fill.ipynb' + 'filter-joins': 'notebooks/filter-joins.ipynb' + 'full_seq': 'notebooks/full_seq.ipynb' + 'get': 'notebooks/get.ipynb' + 'group_by': 'notebooks/group_by.ipynb' + 'group_map': 'notebooks/group_map.ipynb' + 'group_split': 'notebooks/group_split.ipynb' + 'group_trim': 'notebooks/group_trim.ipynb' + 'lead-lag': 'notebooks/lead-lag.ipynb' + 'mutate-joins': 'notebooks/mutate-joins.ipynb' + 'mutate': 'notebooks/mutate.ipynb' + 'n_distinct': 'notebooks/n_distinct.ipynb' + 'na_if': 'notebooks/na_if.ipynb' + 'near': 'notebooks/near.ipynb' + 'nest-join': 'notebooks/nest-join.ipynb' + 'nth': 'notebooks/nth.ipynb' + 'pivot_longer': 'notebooks/pivot_longer.ipynb' + 'pivot_wider': 'notebooks/pivot_wider.ipynb' + 'pull': 'notebooks/pull.ipynb' + 'ranking': 'notebooks/ranking.ipynb' + 'readme': 'notebooks/readme.ipynb' + 'recode': 'notebooks/recode.ipynb' + 'relocate': 'notebooks/relocate.ipynb' + 'rename': 'notebooks/rename.ipynb' + 'replace_na': 'notebooks/replace_na.ipynb' + 'rowwise': 'notebooks/rowwise.ipynb' + 'select': 'notebooks/select.ipynb' + 'separate': 'notebooks/separate.ipynb' + 'setops': 'notebooks/setops.ipynb' + 'slice': 'notebooks/slice.ipynb' + 'summarise': 'notebooks/summarise.ipynb' + 'tibble': 'notebooks/tibble.ipynb' + 'uncount': 'notebooks/uncount.ipynb' + 'unite': 'notebooks/unite.ipynb' + 'with_groups': 'notebooks/with_groups.ipynb' - 'Change Log': CHANGELOG.md diff --git a/pyproject.toml b/pyproject.toml index 526e31c1..16002fc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = ["pwwang "] license = "MIT" [tool.poetry.dependencies] -python = "^3.7.1" +python = "^3.7.1" # align with pandas pipda = "*" modkit = "*" pandas = "^1.2" diff --git a/setup.py b/setup.py index 444591c4..27c579a3 100644 --- a/setup.py 
+++ b/setup.py @@ -21,7 +21,7 @@ author='pwwang', author_email='pwwang@pwwang.com', license='MIT', - packages=['datar', 'datar.base', 'datar.datar', 'datar.datasets', + packages=['datar', 'datar.base', 'datar.core', 'datar.datar', 'datar.datasets', 'datar.dplyr', 'datar.stats', 'datar.tibble', 'datar.tidyr', 'datar.utils'], package_dir={"": "."}, package_data={"datar.datasets": ["*.gz"]}, diff --git a/tests/__init__.py b/tests/__init__.py index e69de29b..088f4fa5 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +from pipda import register_verb, register_func +register_verb.astnode_fail_warning = False +register_func.astnode_fail_warning = False diff --git a/tests/test_base.py b/tests/test_base.py index aae9e673..966adbe9 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -244,8 +244,8 @@ def test_table(): from datar import f from datar.datasets import warpbreaks, state_division, state_region, airquality z = stats.rpois(100, 5) - # x = table(z) - # assert sum(x.values.flatten()) == 100 + x = table(z) + assert sum(x.values.flatten()) == 100 #----------------- with context(warpbreaks) as _: diff --git a/tests/test_core_names.py b/tests/test_core_names.py new file mode 100644 index 00000000..54e7f8f7 --- /dev/null +++ b/tests/test_core_names.py @@ -0,0 +1,125 @@ +# https://github.com/r-lib/vctrs/blob/master/tests/testthat/test-names.R +from enum import unique +from string import ascii_letters + +import pytest +from datar.core.names import * + +@pytest.mark.parametrize("names,expect", [ + ([1,2,3], ["1","2","3"]), + (["", numpy.nan], ["", ""]), + (["", numpy.nan], ["", ""]), + (["", "", numpy.nan], ["", "", ""]), + (repair_names(["", "", numpy.nan], repair="minimal"), ["", "", ""]), +]) +def test_minimal(names, expect): + assert repair_names(names, repair="minimal") == expect + +@pytest.mark.parametrize("names,expect", [ + ([numpy.nan, numpy.nan], ["__0", "__1"]), + (["x", "x"], ["x__0", "x__1"]), + (["x", "y"], ["x", "y"]), + (["", "x", "y", "x"], ["__0", "x__1", "y", "x__3"]), + ([""], ["__0"]), + ([numpy.nan], ["__0"]), + (["__20", "a__33", "b", "", "a__2__34"], ["__0", "a__1", "b", "__3", "a__4"]), + (["a__1"], ["a"]), + (["a__2", "a"], ["a__0", "a__1"]), + (["a__3", "a", "a"], ["a__0", "a__1", "a__2"]), + (["a__2", "a", "a"], ["a__0", "a__1", "a__2"]), + (["a__2", "a__2", "a__2"], ["a__0", "a__1", "a__2"]), + (["__20", "a__1", "b", "", "a__2"], ["__0", "a__1", "b", "__3", "a__4"]), + (repair_names(["__20", "a__1", "b", "", "a__2"], repair="unique"), + ["__0", "a__1", "b", "__3", "a__4"]), + (["", "x", "", "y", "x", "_2", "__"], + ["__0", "x__1", "__2", "y", "x__4", "__5", "__6"]), +]) +def test_unique(names, expect): + assert repair_names(names, repair="unique") == expect + +def test_unique_algebraic_y(): + x = ["__20", "a__1", "b", "", "a__2", "d"] + y = ["", "a__3", "b", "__3", "e"] + ## fix names on each, catenate, fix the whole + z1 = repair_names( + repair_names(x, repair="unique") + + repair_names(y, repair="unique"), + repair="unique" + ) + z2 = repair_names( + repair_names(x, repair="unique") + y, + repair="unique" + ) + z3 = repair_names( + x + repair_names(y, repair="unique"), + repair="unique" + ) + z4 = repair_names( + x + y, + repair="unique" + ) + assert z1 == z2 == z3 == z4 + +@pytest.mark.parametrize("names,expect", [ + (list(ascii_letters), list(ascii_letters)), + ([numpy.nan, "", "x", "x", "a1:", "_x_y}"], + ["__0", "__1", "x__2", "x__3", "a1_", "_x_y_"]), + (repair_names([numpy.nan, "", "x", "x", "a1:", "_x_y}"], repair="universal"), + 
["__0", "__1", "x__2", "x__3", "a1_", "_x_y_"]), + (["a", "b", "a", "c", "b"], ["a__0", "b__1", "a__2", "c", "b__4"]), + ([""], ["__0"]), + ([numpy.nan], ["__0"]), + (["__"], ["__0"]), + (["_"], ["_"]), + (["_", "_"], ["___0", "___1"]), + (["", "_"], ["__0", "_"]), + (["", "", "_"], ["__0", "__1", "_"]), + (["_", "_", ""], ["___0", "___1", "__2"]), + (["_", "", "_"], ["___0", "__1", "___2"]), + (["", "_", ""], ["__0", "_", "__2"]), + (["__6", "__1__2"], ["__0", "__1"]), + (["if__2"], ["_if"]), + (["", "_", numpy.nan, "if__4", "if", "if__8", "for", "if){]1"], + ["__0", "_", "__2", "_if__3", "_if__4", "_if__5", "_for", "if___1"]), + (["a b", "b c"], ["a_b", "b_c"]), + (["", "_2", "_3", "__4", "___5", "____6", "_____7", "__"], + ["__0", "__1", "__2", "__3", "___5", "____6", "_____7", "__7"]), + (repair_names(["", "_2", "_3", "__4", "___5", "____6", "_____7", "__"], repair="unique"), + ["__0", "__1", "__2", "__3", "___5", "____6", "_____7", "__7"]), + ([7,4,3,6,5,1,2,8], + ["_7","_4","_3","_6","_5","_1","_2","_8"]), + (repair_names([7,4,3,6,5,1,2,8], repair="unique"), + ["_7","_4","_3","_6","_5","_1","_2","_8"]), +]) +def test_universal(names, expect): + assert repair_names(names, repair="universal") == expect + + +def test_check_unique(): + with pytest.raises(NameNonUniqueError): + repair_names([numpy.nan], repair="check_unique") + with pytest.raises(NameNonUniqueError): + repair_names([""], repair="check_unique") + with pytest.raises(NameNonUniqueError): + repair_names(["a", "a"], repair="check_unique") + with pytest.raises(NameNonUniqueError): + repair_names(["_1"], repair="check_unique") + with pytest.raises(NameNonUniqueError): + repair_names(["__"], repair="check_unique") + assert repair_names(["a", "b"], repair="check_unique") == ["a", "b"] + +def test_custom_repair(): + def replace(names: Iterable[str]): + return ["a", "b", "c"] + + out = repair_names([1,2,3], repair=replace) + assert out == ["a", "b", "c"] + + with pytest.raises(ValueError): + repair_names([1,2,3], repair=1) + + out = repair_names(['a', 'b', 'c'], repair=str.upper) + assert out == ['A', 'B', 'C'] + + out = repair_names(['a', 'b', 'c'], repair=['x', 'y', 'z']) + assert out == ['x', 'y', 'z'] diff --git a/tests/test_dplyr.py b/tests/test_dplyr.py index 674387ad..f36cec87 100644 --- a/tests/test_dplyr.py +++ b/tests/test_dplyr.py @@ -4,6 +4,7 @@ from pipda import Symbolic from datar.dplyr import * +from datar.base import * from datar.tibble import tibble from .conftest import assert_equal @@ -48,7 +49,7 @@ def test_mutate(): assert_equal(df.values.flatten(), ['a', 0, 2, 'b', 1, 3, 'c', 2, 4]) # df is still rowwise - df = df >> mutate(z=c_across([f.y, f.z], lambda row: str(row.y+row.z))) + df = df >> mutate(z=as_character(f.y+f.z)) assert_equal(df.values.flatten(), ['a', 0, '2', 'b', 1, '4', 'c', 2, '6']) df = df >> mutate(z=None) @@ -92,5 +93,3 @@ def test_group(): f = Symbolic() df = tibble(x=list('abc'), y=range(3)) df = df >> group_by(f.x) - - diff --git a/tests/test_dplyr_across.py b/tests/test_dplyr_across.py index ca4491a3..b1cc85ed 100644 --- a/tests/test_dplyr_across.py +++ b/tests/test_dplyr_across.py @@ -1,9 +1,11 @@ """Grabbed from https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-across.R""" import numpy +from pipda import register_func import pytest from datar.all import * +from datar.core.contexts import Context def test_on_one_column(): df = tibble(x=1) @@ -12,64 +14,64 @@ def test_on_one_column(): def test_not_selecting_grouping_var(): df = tibble(g = 1, x = 1) - out = df >> group_by(f.g) 
>> summarise(x = across(everything())) >> pull() + out = df >> group_by(f.g) >> summarise(x = across(everything())) >> pull(f.x) expected = tibble(x=1) - assert out.to_frame().equals(expected) + assert out.equals(expected) def test_names_output(): gf = tibble(x = 1, y = 2, z = 3, s = "") >> group_by(f.x) out = gf >> summarise(across()) - assert out.columns.tolist() == c("x", "y", "z", "s") + assert out.columns.tolist() == ["x", "y", "z", "s"] out = gf >> summarise(across(_names = "id_{_col}")) - assert out.columns.tolist() == c("x", "id_y", "id_z", "id_s") + assert out.columns.tolist() == ["x", "id_y", "id_z", "id_s"] out = gf >> summarise(across(where(is_numeric), mean)) - assert out.columns.tolist() == c("x", "y", "z") + assert out.columns.tolist() == ["x", "y", "z"] out = gf >> summarise(across(where(is_numeric), mean, _names="mean_{_col}")) - assert out.columns.tolist() == c("x", "mean_y", "mean_z") + assert out.columns.tolist() == ["x", "mean_y", "mean_z"] out = gf >> summarise(across( where(is_numeric), {'mean': mean, 'sum': sum} )) - assert out.columns.tolist() == c("x", "y_mean", "y_sum", "z_mean", "z_sum") + assert out.columns.tolist() == ["x", "y_mean", "y_sum", "z_mean", "z_sum"] # Different from R's list out = gf >> summarise(across( where(is_numeric), {'mean': mean, 1: sum} )) - assert out.columns.tolist() == c("x", "y_mean", "y_1", "z_mean", "z_1") + assert out.columns.tolist() == ["x", "y_mean", "y_1", "z_mean", "z_1"] # Different from R's list out = gf >> summarise(across( where(is_numeric), {0: mean, 'sum': sum} )) - assert out.columns.tolist() == c("x", "y_0", "y_sum", "z_0", "z_sum") + assert out.columns.tolist() == ["x", "y_0", "y_sum", "z_0", "z_sum"] out = gf >> summarise(across( where(is_numeric), [mean, sum] )) - assert out.columns.tolist() == c("x", "y_1", "y_2", "z_1", "z_2") + assert out.columns.tolist() == ["x", "y_0", "y_1", "z_0", "z_1"] out = gf >> summarise(across( where(is_numeric), [mean, sum], - _names='{_col}_{_fn0}' + _names='{_col}_{_fn1}' )) - assert out.columns.tolist() == c("x", "y_0", "y_1", "z_0", "z_1") + assert out.columns.tolist() == ["x", "y_1", "y_2", "z_1", "z_2"] out = gf >> summarise(across( where(is_numeric), {'mean': mean, 'sum': sum}, _names="{_fn}_{_col}" )) - assert out.columns.tolist() == c("x", "mean_y", "sum_y", "mean_z", "sum_z") + assert out.columns.tolist() == ["x", "mean_y", "sum_y", "mean_z", "sum_z"] def test_result_locations_aligned_with_column_names(): df = tibble(x=[1,2], y=['a', 'b']) @@ -112,21 +114,18 @@ def test_kwargs(): def test_works_sequentially(): from pipda import register_func, Context - n_col = register_func( - lambda data, acr: len(acr.evaluate(Context.EVAL, data)) - ) df = tibble(a = 1) out = df >> mutate( - x = n_col(across(where(is_numeric))), - y = n_col(across(where(is_numeric))) + x = ncol(across(where(is_numeric))), + y = ncol(across(where(is_numeric))) ) expect = tibble(a=1, x=1, y=2) assert out.equals(expect) out = df >> mutate( a = "x", - y = n_col(across(where(is_numeric))) + y = ncol(across(where(is_numeric))) ) expect = tibble(a="x", y=0) assert out.equals(expect) @@ -135,3 +134,210 @@ def test_original_ordering(): df = tibble(a=1, b=2) out = df >> mutate(a=2, x=across()) assert out.columns.tolist() == ['a', 'b', 'x$a', 'x$b'] + +def test_error_messages(): + with pytest.raises(ValueError, match='Argument `_fns` of across must be'): + tibble(x = 1) >> summarise(res=across(where(is_numeric), 42)) + with pytest.raises(ValueError, match="must only be used inside verbs"): + across() + with 
pytest.raises(ValueError, match="must only be used inside verbs"): + c_across() + +def test_used_twice(): + df = tibble(a = 1, b = 2) + out = df >> mutate(x = ncol(across(where(is_numeric))) + ncol(across(f.a))) + expect = tibble(a=1, b=2, x=3) + assert out.equals(expect) + +def test_used_separately(): + df = tibble(a = 1, b = 2) + out = df >> mutate(x=ncol(across(where(is_numeric))), y=ncol(across(f.a))) + expect = tibble(a=1, b=2, x=2, y=1) + assert out.equals(expect) + +def test_with_group_id(): + df = tibble(g=[1,2], a=[1,2], b=[3,4]) >> group_by(f.g) + + @register_func(context=None) + def switcher(data, group_id, across_a, across_b): + return across_a.a if group_id == 0 else across_b.b + + expect = df >> ungroup() >> mutate(x=[1,4]) + out = df >> mutate(x=switcher(cur_group_id(), across(f.a), across(f.b))) + assert out.obj.equals(expect) + +def test_cache_key(): + df = tibble(g=rep([1,2], each=2), a=range(1,5)) >> group_by(f.g) + tibble2 = register_func(None)(tibble) + + @register_func(context=Context.EVAL) + def across_col(data, acr, col): + return acr[col] + + out = df >> mutate( + tibble2( + x = across_col(across(where(is_numeric), mean), 'a'), + y = across_col(across(where(is_numeric), max), 'a') + ) + ) + expect = df >> mutate(x = mean(f.a), y = max(f.a)) + assert out.obj.equals(expect.obj) + +def test_reject_non_vectors(): + with pytest.raises(ValueError, match='Argument `_fns` of across must be'): + tibble(x = 1) >> summarise(across(where(is_numeric), object())) + +def test_recycling(): + df = tibble(x=1, y=2) + out = df >> summarise(across(everything(), lambda col: rep(42, col))) + expect = tibble(x=rep(42,2), y=rep(42,2)) + assert out.equals(expect) + + df = tibble(x=2, y=3) + with pytest.raises(ValueError): + df >> summarise(across(everything(), lambda col: rep(42, col))) + +def test_return_one_row(): + # not actually one row, but returns a corresponding series + df = tibble(x=range(1,43)) + out = df >> mutate(across(c(), as_factor)) + assert out.equals(df) + + out = df >> mutate(y=across(c(), as_factor)) + # empty column in pandas will be NAs + assert out.y.isna().all() + +def test_use_env_var(): + # not a problem, since we use f.y + df = tibble(x = 1.0, y = 2.4) + y = "x" + out = df >> summarise(across(all_of(y), mean)) + expect = tibble(x=1.0) + assert out.equals(expect) + + out = df >> mutate(across(all_of(y), mean)) + assert out.equals(df) + + out = df >> filter(if_all(all_of(y), lambda col: col < 2)) + assert out.equals(df) + +def test_empty_df(): + df = tibble() + out = df >> mutate(across()) + assert out.equals(df) + +def test_mutate_cols_inside_func(): + df = tibble(x = 2, y = 4, z = 8) + + @register_func(None, context=None) + def data_frame(**kwargs): + return tibble(**kwargs) + + out = df >> mutate(data_frame(x=f.x/f.y, y=f.y/f.y, z=f.z/f.y)) + # df.y does not work on grouped data + expect = df >> mutate(across(everything(), lambda col: col/df.y)) + assert out.equals(expect) + +def test_summarise_cols_inside_func(): + df = tibble(x = 2, y = 4, z = 8) + @register_func(None, context=None) + def data_frame(**kwargs): + return tibble(**kwargs) + + out = df >> summarise(data_frame(x=f.x/f.y, y=f.y/f.y, z=f.z/f.y)) + expect = df >> summarise(across(everything(), lambda col: col/df.y)) + assert out.equals(expect) + +def test_cols_in_lambda(): + df = tibble(x=1.0, y=2.0) + out = df >> mutate(across('x', lambda x: x/df.y)) >> pull(f.x, to='list') + assert out == [.5] + +def test_if_any_all_enforce_bool(): + d = tibble(x=10, y=10) + with pytest.raises(TypeError): + d >> 
filter(if_all(f[f.x:f.y], identity)) + + with pytest.raises(TypeError): + d >> filter(if_any(f[f.x:f.y], identity)) + + with pytest.raises(TypeError): + d >> mutate(ok=if_all(f[f.x:f.y], identity)) + with pytest.raises(TypeError): + d >> mutate(ok=if_any(f[f.x:f.y], identity)) + +def test_if_any_all_in_mutate(): + d = tibble(x = c(1, 5, 10, 10), y = c(0, 0, 0, 10), z = c(10, 5, 1, 10)) + res = d >> mutate( + any = if_any(f[f.x:f.z], lambda x: x > 8), + all = if_all(f[f.x:f.z], lambda x: x > 8) + ) + assert res['any'].eq([True, False, True, True]).all() + assert res['all'].eq([False, False, False, True]).all() + +def test_caching_not_confused(): + + df = tibble(x=[1,2,3]) + res = df >> mutate( + # evaluating in Python space because the '+' operator is not supported + # by numexpr for the bool dtype, use '|' instead + any = if_any(f.x, lambda x: x >= 2) | if_any(f.x, lambda x: x >= 3), + all = if_all(f.x, lambda x: x >= 2) | if_all(f.x, lambda x: x >= 3) + ) + # dtypes not changed + assert res['any'].eq([False, True, True]).all() + assert res['all'].eq([False, True, True]).all() + +def test_if_any_all_na_handling(): + df = expandgrid(x = c(TRUE, FALSE, NA), y = c(TRUE, FALSE, NA)) + + out = df >> filter(if_all(c(f.x,f.y), identity)) + expect = df >> filter(f.x & f.y) + assert out.equals(expect) + + out = df >> filter(if_any(c(f.x,f.y), identity)) + expect = df >> filter(f.x | f.y) + # Note that out has an extra row: + # x y + # 6 NaN True + # This is because df.fillna(False).any() + # is not the same as df.x | df.y + # see: https://pandas.pydata.org/pandas-docs/stable/user_guide/boolean.html + assert out.iloc[:4].equals(expect) + +# reset columns not supported + +def test_c_across(): + df = tibble(x=[1,2], y=[3,4]) + + out = df >> summarise(z=[c_across([f.x, f.y])]) >> pull(f.z, to='list') + assert out[0].tolist() == [1,2,3,4] + +def test_nb_fail(): + from datar.datasets import iris + out = iris >> mutate( + across( + where(is_double) & ~c(f['Petal_Length'], f['Petal_Width']), + round + ) + ) + rows = out >> nrow() + assert rows == 150 + +def test_nb_fail_c_across(): + df = tibble( + id=[1, 2, 3, 4], + k=['a', 'b', 'c', 'd'], + w=runif(4), + x=runif(4), + y=runif(4), + z=runif(4) + ) + out = df >> rowwise() >> mutate( + sum = sum(c_across(f[f.w:f.z])), + sd = sd(c_across(f[f.w:f.z])) + ) + + assert out.flags.rowwise + rows = nrow(out) + assert rows == 4 diff --git a/tests/test_dplyr_arrange.py b/tests/test_dplyr_arrange.py new file mode 100644 index 00000000..413d3e8b --- /dev/null +++ b/tests/test_dplyr_arrange.py @@ -0,0 +1,90 @@ +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-arrange.r + +from datar.core.exceptions import ColumnNotExistingError +from pandas.core.groupby.generic import DataFrameGroupBy +import pytest +from datar.all import * + +def test_empty_returns_self(): + df = tibble(x=range(1,11), y=range(1,11)) + gf = df >> group_by(f.x) + + out = df >> arrange() + assert out is df + + out = gf >> arrange() + assert out.obj.equals(gf.obj) + +def test_sort_empty_df(): + df = tibble() + out = df >> arrange() + assert out is df + +def test_na_end(): + df = tibble(x=c(2,1,NA)) # NA makes it float + out = df >> arrange(f.x) >> pull() + assert out.fillna(0.0).eq([1.0,2.0,0.0]).all() + out = df >> arrange(desc(f.x)) >> pull() + assert out.fillna(0.0).eq([2.0,1.0,0.0]).all() + +def test_errors(): + x = 1 + df = tibble(x, x, _name_repair="minimal") + + with pytest.raises(ValueError): + df >> arrange(f.x) + + df = tibble(x=x) + with pytest.raises(ColumnNotExistingError): 
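+        # f.y is not a column of df, so arrange raises instead of sorting silently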
+ df >> arrange(f.y) + + with pytest.raises(KeyError): + # rep(f.x, 2) is a list + df >> arrange(rep(f.x, 2)) + +def test_df_cols(): + df = tibble(x = [1,2,3], y = tibble(z = [3,2,1])) + out = df >> arrange(f['y$z']) + expect = tibble(x=[3,2,1], y=tibble(z=[1,2,3])) + assert out.reset_index(drop=True).equals(expect) + +def test_complex_cols(): + df = tibble(x = [1,2,3], y = [3+2j, 2+2j, 1+2j]) + out = df >> arrange(f.y) + assert out.equals(df.iloc[[2,1,0], :]) + +def test_ignores_group(): + df = tibble(g=[2,1]*2, x=[4,3,2,1]) + gf = df >> group_by(f.g) + + out = gf >> arrange(f.x) + assert out.obj.equals(df.iloc[[3,2,1,0], :]) + + out = gf >> arrange(f.x, _by_group=True) + assert out.obj.equals(df.iloc[[3,1,2,0], :]) + +def test_update_grouping(): + df = tibble(g = [2, 2, 1, 1], x = [1, 3, 2, 4]) + res = df >> group_by(f.g) >> arrange(f.x) + assert isinstance(res, DataFrameGroupBy) + + # grouping structure kept as index not reset + +def test_across(): + df = tibble(x = [1, 3, 2, 1], y = [4, 3, 2, 1]) + + out = df >> arrange(across()) + expect = df >> arrange(f.x, f.y) + assert out.equals(expect) + + out = df >> arrange(across(_fns=desc)) + expect = df >> arrange(desc(f.x), desc(f.y)) + assert out.equals(expect) + + out = df >> arrange(across(f.x)) + expect = df >> arrange(f.x) + assert out.equals(expect) + + out = df >> arrange(across(f.y)) + expect = df >> arrange(f.y) + assert out.equals(expect) diff --git a/tests/test_dplyr_bind.py b/tests/test_dplyr_bind.py new file mode 100644 index 00000000..e888e966 --- /dev/null +++ b/tests/test_dplyr_bind.py @@ -0,0 +1,256 @@ +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-bind.R +from pipda.utils import Expression +import pytest + +from datar.all import * + +def test_handle_dict(): + expect = tibble(x = 1, y = "a", z = 2) + d1 = dict(x=1, y="a") + d2 = dict(z=2) + + out = bind_cols(d1, d2) + assert out.equals(expect) + + out = bind_cols(dict(**d1, **d2)) + assert out.equals(expect) + +def test_empty(): + + out = bind_cols(tibble()) + expect = tibble() + assert out.equals(expect) + +def test_all_null(): + out = bind_cols(dict(a=NULL, b=NULL)) + expect = tibble() + assert out.equals(expect) + + out = bind_cols(NULL) + expect = tibble() + assert out.equals(expect) + +def test_bind_col_null(): + df1 = tibble(a = range(1,11), b = range(1,11)) + df2 = tibble(c = range(1,11), d = range(1,11)) + + res1 = bind_cols(df1, df2) + res2 = bind_cols(NULL, df1, df2) + res3 = bind_cols(df1, NULL, df2) + res4 = bind_cols(df1, df2, NULL) + + assert res1.equals(res2) + assert res1.equals(res3) + assert res1.equals(res4) + +def test_repair_names(): + df = tibble(a = 1, b = 2) + bound = bind_cols(df, df) + assert bound.columns.tolist() == ['a__0', 'b__1', 'a__2', 'b__3'] + + t1 = tibble(a=1) + t2 = tibble(a=2) + bound = bind_cols(t1, t2) + assert bound.columns.tolist() == ['a__0', 'a__1'] + +def test_incompatible_size_fill_with_NA(): + df1 = tibble(x=range(1,4)) + df2 = tibble(y=range(1,2)) + out = bind_cols(df1, df2).fillna(100) + assert out.x.tolist() == [1,2,3] + assert out.y.tolist() == [1,100,100] + + +# bind_rows + +def test_reorder_cols(): + df = tibble( + a=1, + b=2, + c=3, + d=4, + e=5, + f=6 + ) + df_scramble = df[sample(df.columns)] + out = df >> bind_rows(df_scramble) + assert out.columns.tolist() == list('abcdef') + +def test_ignores_null_empty(): + df = tibble(a=1) + out = df >> bind_rows(NULL) + assert out.equals(df) + + df0 = tibble() + out = df >> bind_rows(df0) + assert out.equals(df) + + # no rows + df_no_rows = df.iloc[[], :] 
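+    # .iloc[[], :] keeps every column (and its dtype) but selects zero rows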
+ out = df >> bind_rows(df_no_rows) + assert out.equals(df) + + # no cols + df_no_cols = df.iloc[:, []] + out = df >> bind_rows(df_no_cols) + rows = out >> nrow() + assert rows == 2 + + val = out.fillna(1234) >> get(1, f.a) + assert val == 1234 + + out = df_no_cols >> bind_rows(df) + rows = out >> nrow() + assert rows == 2 + + val = out.fillna(888) >> get(0, f.a) + assert val == 888 + +# column coercion +def test_int_to_float(): + df1 = tibble(a=1.0, b=2) + df2 = tibble(a=1, b=2) + out = df1 >> bind_rows(df2) + a_type = is_float(out.a) + assert a_type + b_type = is_int64(out.b) + assert b_type + +def test_factor_to_chars(): + # we don't have warnings + df1 = tibble(a = factor("a")) + df2 = tibble(a = "b") + + out = df1 >> bind_rows(df1, df2) + a_type = is_factor(out.a) + assert not a_type + +def test_bind_factors(): + df1 = tibble(a = factor("a")) + df2 = tibble(a = factor("b")) + + out = df1 >> bind_rows(df2) + assert out.a.cat.categories.tolist() == ["a", "b"] + + df1 = tibble(a = factor("a")) + df2 = tibble(a = factor(NA)) + + out = df1 >> bind_rows(df2) + assert out.a.cat.categories.tolist() == ["a"] + assert out.a.astype(object).fillna("NA").tolist() == ["a", "NA"] + +def test_bind_na_cols(): + df1 = tibble(x=factor(["foo", "bar"])) + df2 = tibble(x=NA) + + out = df1 >> bind_rows(df2) + res = out >> get(2, f.x) + y = is_na(res) + assert y + + out = df2 >> bind_rows(df1) + res = out >> get(0, f.x) + y = is_na(res) + assert y + + y = is_categorical(out.x) + assert y + +def test_complex(): + df1 = tibble(r=[1+1j, 2-1j]) + df2 = tibble(r=[1-1j, 2+1j]) + df3 = df1 >> bind_rows(df2) + out = df3 >> nrow() + assert out == 4 + assert df3.r.tolist() == df1.r.tolist() + df2.r.tolist() + +def test_cat_ordered(): + df = tibble(x=factor([1,2,3], ordered=True)) + y = bind_rows(df, df) + assert y.x.cat.ordered + +def test_create_id_col(): + df = tibble(x=range(1,11)) + df1 = df >> head(3) + df2 = df >> tail(2) + out = df1 >> bind_rows(df2, _id='col') + assert out.col.tolist() == [0,0,0,1,1] + + out = bind_rows([df1, df2], _id='col') + assert out.col.tolist() == [0,0,0,1,1] + + out = bind_rows(None, one=df1, two=df2, _id="col") + assert out.col.tolist() == ['one'] * 3 + ['two'] * 2 + +def test_non_existing_col(): + # fill with NA, but not convert whole column to NAs + df1 = tibble(x=letters) + df2 = tibble(x=letters[:10], y=letters[:10]) + out = df1 >> bind_rows(df2) + assert not out.y.isna().all() + +def test_empty_dict(): + df = bind_rows({}) + d = df >> dim() + assert d == (0, 0) + +def test_rowwise_vector(): + tbl = bind_rows( + tibble(a = "foo", b = "bar"), + dict(a = "A", b = "B") + ) + expect = tibble(a=["foo", "A"], b=["bar", "B"]) + assert tbl.equals(expect) + + id_tbl = bind_rows(None, a=dict(a=1, b=2), b=dict(a=3, b=4), _id="id") + expect = tibble(id=['a', 'b'], a=[1,3], b=[2,4]) + assert id_tbl.equals(expect) + +def test_list_as_first_argument(): + ll = tibble(a = 1, b = 2) + out = bind_rows([ll]) + assert out.equals(ll) + + out = bind_rows([ll, ll]) + expect = tibble(a=[1,1], b=[2,2]) + assert out.equals(expect) + +def test_hierachical_data(): + my_list = [dict(x = 1, y = "a"), dict(x = 2, y = "b")] + res = bind_rows(my_list) + rows = nrow(res) + assert rows == 2 + out = is_int(res.x) + assert out + out = is_character(res.y) + assert out + + res = bind_rows(dict(x = 1, y = "a"), dict(x = 2, y = "b")) + rows = nrow(res) + assert rows == 2 + out = is_int(res.x) + assert out + out = is_character(res.y) + assert out + +# vectors +# keyword arguments have to have dict/dataframe +# it is not 
working like tibble +# for example: bind_rows(a=[1,2]) is not working + +def test_wrong_first_argument(): + with pytest.raises(NotImplementedError): + 1 >> bind_rows() + +def test_errors(): + df1 = tibble(x = [1,2,3]) + df2 = tibble(x = [4,5,6]) + with pytest.raises(ValueError): + bind_rows(df1, df2, _id=5) + + df1 = tibble(a = factor("a")) + df2 = tibble(a = 1) + bind_rows(df1, df2) # no error, all converted to object + + with pytest.raises(TypeError): + bind_rows([1,2]) diff --git a/tests/test_dplyr_case_when.py b/tests/test_dplyr_case_when.py new file mode 100644 index 00000000..f9f3e12b --- /dev/null +++ b/tests/test_dplyr_case_when.py @@ -0,0 +1,83 @@ +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-case-when.R +import pytest + +import numpy +from datar.all import * +from datar.datasets import mtcars + +def test_match_values_in_order(): + x = numpy.array([1,2,3]) + out = case_when( + x, + x <= 1, 1, + x <= 2, 2, + x <= 3, 3 + ) + assert (out == x).all() + +def test_unmatched_gets_missing(): + x = numpy.array([1,2,3]) + out = case_when( + x, + x <= 1, 1, + x <= 2, 2 + ) + out[numpy.isnan(out)] = 100 + assert out.tolist() == [1,2,100] + +def test_missing_can_be_replaced(): + x = numpy.array([1,2,3, NA]) + out = case_when( + x, + x <= 1, 1, + x <= 2, 2, + numpy.isnan(x), 0 + ) + out[numpy.isnan(out)] = 100 + assert out.tolist() == [1,2,100,0] + +def test_na_condition(): + # case_when requires first argument as data + x = numpy.array([1,2,3]) + with pytest.raises(IndexError): + # NA cannot be index of ndarray + case_when( + x, + [True, False, NA], [1,2,3], + True, 4 + ) + +def test_scalar_conditions(): + x = numpy.array([NA,NA,NA]) + out = case_when( + x, + True, [1,2,3], + False, [4,5,6] + ) + assert out.tolist() == [1,2,3] + +def test_use_inside_mutate(): + out = mtcars >> head(4) >> mutate( + out=case_when( + f.cyl==4, 1, + f['am']==1, 2, + True, 0 + ) + ) + assert out.out.tolist() == [2,2,1,0] + +def test_errors(): + x = numpy.array([NA]*10) + with pytest.raises(IndexError): + # condition has to be the same length as data + case_when( + x, + [True, False], [1,2,3], + [False, True], [1,2] + ) + with pytest.raises(TypeError): + case_when() + with pytest.raises(NotImplementedError): + case_when("a") + with pytest.raises(ValueError): + case_when([], 1) diff --git a/tests/test_dplyr_coalesce.py b/tests/test_dplyr_coalesce.py new file mode 100644 index 00000000..d394c3fc --- /dev/null +++ b/tests/test_dplyr_coalesce.py @@ -0,0 +1,48 @@ +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-coalesce.R +import pytest + +from datar.all import * + +def test_missing_replaced(): + x = [NA, 1] + out = coalesce(x, 1) + assert out.tolist() == [1,1] + +def test_common_type(): + out = coalesce(NA, 1) + assert out == 1 + + f = factor("x", levels=["x", "y"]) + out = coalesce(NA, f) + assert out == f + +def test_multiple_replaces(): + x1 = c(1, NA, NA) + x2 = c(NA, 2, NA) + x3 = c(NA, NA, 3) + out = coalesce(x1, x2, x3) + assert out.tolist() == [1,2,3] + +def test_errors(): + # can still coalesce with a shorter or longer array + out = coalesce([1,2,NA], [1,2]) + out[is_na(out)] = 100 + assert out.tolist() == [1.0,2.0,100.0] + + # works + out = coalesce([1,2], letters[:2]) + assert out.tolist() == [1,2] + +def test_with_dataframes(): + out = coalesce( + tibble(x = c(NA, 1)), + tibble(x = [1,2]) + ) + assert out.x.tolist() == [1,1] + + df1 = tibble(x = c(NA, 1, NA), y = c(2, NA, NA), z = c([1,2], NA)) + df2 = tibble(x = [1,2,3], y = c(3, 4, NA), z = c(NA, NA, NA)) + df3 
= tibble(x = NA, y = c(30, 40, 50), z = [101,102,103]) + out = coalesce(df1, df2, df3) + expect = tibble(x = c(1.0, 1.0, 3.0), y = c(2.0, 4.0, 50.0), z = c(1.0, 2.0, 103.0)) + assert out.equals(expect) diff --git a/tests/test_dplyr_context.py b/tests/test_dplyr_context.py new file mode 100644 index 00000000..5ffd321d --- /dev/null +++ b/tests/test_dplyr_context.py @@ -0,0 +1,78 @@ +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-context.R +import pytest + +from datar.all import * + +def test_cur_group(): + df = tibble(g = [1,2], x = [1,2]) + gf = df >> group_by(f.g) + + with pytest.raises(ValueError): + df >> summarise(key=[cur_group()]) + + out = gf >> summarise(key=[cur_group()]) >> pull(f.key) + expect = tibble(g=1) + assert out.values[0].equals(expect) + expect = tibble(g=2) + assert out.values[1].equals(expect) + +def test_cur_group_id(): + df = tibble(x = c("b", "a", "b")) + gf = df >> group_by(f.x) + + out = gf >> summarise(id=cur_group_id()) + # group_by not sorted + expect = tibble(x = c("a", "b"), id = [0,1]) + assert out.equals(expect) + + out = gf >> mutate(id=cur_group_id()) + # note the order has changed + expect = tibble(x=["a", "b","b"], id=[0,1,1]) + assert out.obj.equals(expect) + +def test_cur_data_all(): + df = tibble(x = c("b", "a", "b"), y = [1,2,3]) + gf = df >> group_by(f.x) + + out = df >> summarise(x=[cur_data()]) >> pull(f.x, to='list') + assert out[0].equals(df) + + out = gf >> summarise(x=[cur_data()]) >> pull(f.x) + assert out.values[0].values.flatten().tolist() == [2] + assert out.values[1].values.flatten().tolist() == [1,3] + + out = gf >> summarise(x=[cur_data_all()]) >> pull(f.x) + assert out.values[0].values.flatten().tolist() == ["a", 2] + assert out.values[1].values.flatten().tolist() == ["b", 1, "b", 3] + +def test_cur_group_rows(): + df = tibble(x = c("b", "a", "b"), y = [1,2,3]) + gf = df >> group_by(f.x) + + out = gf >> summarise(x=[cur_group_rows()]) >> pull() + assert out.values.tolist() == [[1], [0,2]] + +def test_cur_data_all_sequentially(): + df = tibble(a=1) + out = df >> mutate(x = ncol(cur_data()), y = ncol(cur_data())) + expect = tibble(a=1, x=1, y=2) + assert out.equals(expect) + + gf = tibble(a = 1, b = 2) >> group_by(f.a) + out = gf >> mutate(x = ncol(cur_data_all()), y = ncol(cur_data_all())) + expect = tibble(a = 1, b = 2, x = 2, y = 3) + assert out.obj.equals(expect) + +def test_errors(): + with pytest.raises(TypeError): + n() + with pytest.raises(TypeError): + cur_data() + with pytest.raises(TypeError): + cur_data_all() + with pytest.raises(TypeError): + cur_group() + with pytest.raises(TypeError): + cur_group_id() + with pytest.raises(TypeError): + cur_group_rows() diff --git a/tests/test_dplyr_count.py b/tests/test_dplyr_count.py new file mode 100644 index 00000000..5f3b778e --- /dev/null +++ b/tests/test_dplyr_count.py @@ -0,0 +1,59 @@ +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-count-tally.r +import pytest + +from datar.all import * + +def test_informs_if_n_column_already_present_unless_overridden(caplog): + df1 = tibble(n = c(1, 1, 2, 2, 2)) + out = df1 >> count(f.n) + assert out.columns.tolist() == ['n', 'nn'] + assert 'already present' in caplog.text + + caplog.clear() + out = df1 >> count(f.n, name='n') + assert out.columns.tolist() == ['n'] + assert caplog.text == '' + + out = df1 >> count(f.n, name='nn') + assert out.columns.tolist() == ['n', 'nn'] + assert caplog.text == '' + + df2 = tibble(n = c(1, 1, 2, 2, 2), nn = range(1,6)) + out = df2 >> count(f.n) + assert 
out.columns.tolist() == ['n', 'nn'] + assert 'already present' in caplog.text + + out = df2 >> count(f.n, f.nn) + assert out.columns.tolist() == ['n', 'nn', 'nnn'] + assert 'already present' in caplog.text + +def test_name_must_be_string(): + df = tibble(x = c(1, 2)) + with pytest.raises(ValueError): + df >> count(f.x, name=1) + with pytest.raises(ValueError): + df >> count(f.x, name=letters) + +def test_drop(): + df = tibble(f = factor("b", levels = c("a", "b", "c"))) + out = df >> count(f.f) + assert out.n.tolist() == [1] + + out = df >> count(f.f, _drop = False) + # note the order + assert out.n.tolist() == [0,1,0] + + out = count(group_by(df, f.f, _drop = FALSE)) + # print(out.obj) + assert out.obj.n.tolist() == [0,1,0] + +def test_preserve_grouping(): + df = tibble(g = c(1, 2, 2, 2)) + exp = tibble(g = c(1, 2), n = c(1, 3)) + + out = df >> count(f.g) + assert out.equals(exp) + + df1 = df >> group_by(f.g) >> count() + df2 = exp >> group_by(f.g) + assert df1.obj.equals(df2.obj) diff --git a/tests/test_dplyr_distinct.py b/tests/test_dplyr_distinct.py new file mode 100644 index 00000000..c682deb8 --- /dev/null +++ b/tests/test_dplyr_distinct.py @@ -0,0 +1,139 @@ +# tests grabbed from: +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-distinct.R +import pytest +from datar.all import * + +from datar.datasets import iris +from datar.core.exceptions import ColumnNotExistingError + +def test_single_column(): + df = tibble( + x = c(1, 1, 1, 1), + y = c(1, 1, 2, 2), + z = c(1, 2, 1, 2) + ) + x = distinct(df, f.x, _keep_all=False) + assert all(x.x == unique(df.x)) + + y = distinct(df, f.y, _keep_all=False) + assert all(y.y == unique(df.y)) + +def test_0_col_df(): + df = tibble(x=range(10)) >> select(~f.x) + cols = df >> distinct() >> ncol() + assert cols == 0 + +def test_novar_use_all(): + df = tibble(x=[1,1], y=[2,2]) + expect = tibble(x=1, y=2) + out = df >> distinct() + assert out.equals(expect) + +def test_keeps_only_specified_cols(): + df = tibble(x = c(1, 1, 1), y = c(1, 1, 1)) + expect = tibble(x=1) + out = df >> distinct(f.x) + assert out.equals(expect) + +def test_unless_keep_all_true(): + df = tibble(x=[1,1,1], y=[3,2,1]) + expect1 = tibble(x=1) + out1 = df >> distinct(f.x) + assert out1.equals(expect1) + + expect2 = tibble(x=1, y=3) + out2 = df >> distinct(f.x, _keep_all=True) + assert out2.equals(expect2) + +def test_not_duplicating_cols(): + df = tibble(a=[1,2,3], b=[4,5,6]) + out = df >> distinct(f.a, f.a) + assert out.columns.tolist() == ['a'] + + out = df >> group_by(f.a) >> distinct(f.a) + assert out.obj.columns.tolist() == ['a'] + +def test_grouping_cols_always_included(): + df = tibble(g = c(1, 2), x = c(1, 2)) + out = df >> group_by(f.g) >> distinct(f.x) + + assert out.obj.columns.tolist() == ['g', 'x'] + +def test_switch_groupby_distinct_equal(): + df = tibble(g = c(1, 2), x = c(1, 2)) + + df1 = df >> distinct() >> group_by(f.g) + df2 = df >> group_by(f.g) >> distinct() + + assert df1.obj.equals(df2.obj) + +def test_mutate_internally(): + df = tibble(g = c(1, 2), x = c(1, 2)) + + df1 = df >> distinct(aa=f.g*2) + df2 = df >> mutate(aa=f.g*2) >> distinct(f.aa) + + assert df1.equals(df2) + +def test_on_iter_type(): + df = tibble( + a = c("1", "1", "2", "2", "3", "3"), + b = [("A", )] + ) + df2 = tibble( + x=range(1,6), + y=[(1,2,3), (2,3,4), (3,4,5), (4,5,6), (5,6,7)] + ) + + out = df >> distinct() + expect = df >> slice([0,2,4]) + assert out.equals(expect) + + out2 = df2 >> distinct() + assert out2.equals(df2) + +def test_preserves_order(): + d = 
tibble(x=[1,2], y=[3,4]) + out = d >> distinct(f.y, f.x) + assert out.columns.tolist() == ['x', 'y'] + +def test_on_na(): + df = tibble(col_a=[1, NA, NA]) >> mutate(col_a=f.col_a+0.0) + rows = df >> distinct() >> nrow() + assert rows == 2 + +def test_auto_splicing(): + species = tibble(Species=iris.Species) + + df1 = iris >> distinct(f.Species) + df2 = iris >> distinct(species) + assert df1.equals(df2) + + df3 = iris >> distinct(across(f.Species)) + assert df1.equals(df3) + + df4 = iris >> mutate(across(starts_with("Sepal"), round)) >> distinct( + f.Sepal_Length, f.Sepal_Width) + df5 = iris >> distinct(across(starts_with("Sepal"), round)) + assert df4.equals(df5) + +def test_preserves_grouping(): + gf = tibble(x = c(1, 1, 2, 2), y = f.x) >> group_by(f.x) + out = gf >> distinct() + gvars = group_vars(out) + assert gvars == ['x'] + + out = gf >> distinct(x=f.x+2) + gvars = group_vars(out) + assert gvars == ['x'] + +def test_errors(): + + df = tibble(g = c(1, 2), x = c(1, 2)) + + with pytest.raises(ColumnNotExistingError): + df >> distinct(f.aa, f.x) + with pytest.raises(ColumnNotExistingError): + df >> distinct(f.aa, f.bb) + with pytest.raises(ColumnNotExistingError): + df >> distinct(y=f.a+1) diff --git a/tests/test_dplyr_empty_groups.py b/tests/test_dplyr_empty_groups.py new file mode 100644 index 00000000..40f65a37 --- /dev/null +++ b/tests/test_dplyr_empty_groups.py @@ -0,0 +1,116 @@ +# tests grabbed from: +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-empty-groups.R +import pytest +from datar.all import * + +@pytest.fixture +def df(): + return tibble( + e = 1, + f = factor(c(1, 1, 2, 2), levels = [1,2,3]), + g = c(1, 1, 2, 2), + x = c(1, 2, 1, 4) + # group_by(..., _drop=False) only works for a + # single categorical columns + ) >> group_by(f.f, _drop = FALSE) + +def test_filter_slice_keep_zero_len_groups(df): + out = filter(df, f.f == 1) + gsize = group_size(out) + assert gsize == [2,0,0] + + out = slice(df, 1) + gsize = group_size(out) + assert gsize == [1,1,0] + +def test_filter_slice_retain_zero_group_labels(df): + # count loses _drop=False + out = ungroup(count(filter(df, f.f==1))) + expect = tibble( + f=[1,2,3], + n=[2,0,0] + ) + assert out.equals(expect) + + out = ungroup(count(slice(df, 1))) + expect = tibble( + f=[1,2,3], + n=[1,1,0] + ) + assert out.equals(expect) + +def test_mutate_keeps_zero_len_groups(df): + gsize = group_size(mutate(df, z=2)) + assert gsize == [2,2,0] + +def test_summarise_returns_a_row_for_zero_len_groups(df): + summarised = df >> summarise(z=n()) + rows = summarised >> nrow() + # assert rows == 3 + # not supported when dataframe is empty + assert rows == 2 + +def test_arrange_keeps_zero_len_groups(df): + gsize = group_size(arrange(df)) + assert gsize == [2,2,0] + + gsize = group_size(arrange(df, f.x)) + assert gsize == [2,2,0] + +def test_bind_rows(df): + gg = bind_rows(df, df) + gsize = group_size(gg) + assert gsize == [4,4,0] + +def test_join_respect_zero_len_groups(): + df1 = tibble( + f=factor([1,1,2,2], levels=[1,2,3]), + x=[1,2,1,4] + ) >> group_by(f.f) + df2 = tibble( + f=factor([2,2,3,3], levels=[1,2,3]), + x=[1,2,3,4] + ) >> group_by(f.f) + + gsize = group_size(left_join(df1, df2, by=f.f)) + assert gsize == [2,4] + gsize = group_size(right_join(df1, df2, by=f.f)) + assert gsize == [4,2] + gsize = group_size(full_join(df1, df2, by=f.f)) + assert gsize == [2,4,2] + gsize = group_size(anti_join(df1, df2, by=f.f)) + assert gsize == [2] + gsize = group_size(inner_join(df1, df2, by=f.f)) + assert gsize == [4] + + df1 = tibble( + 
f=factor([1,1,2,2], levels=[1,2,3]), + x=[1,2,1,4] + ) >> group_by(f.f, _drop=False) + df2 = tibble( + f=factor([2,2,3,3], levels=[1,2,3]), + x=[1,2,3,4] + ) >> group_by(f.f, _drop=False) + + gsize = group_size(left_join(df1, df2, by=f.f)) + assert gsize == [2,4,0] + gsize = group_size(right_join(df1, df2, by=f.f)) + assert gsize == [0,4,2] + gsize = group_size(full_join(df1, df2, by=f.f)) + assert gsize == [2,4,2] + gsize = group_size(anti_join(df1, df2, by=f.f)) + assert gsize == [2,0,0] + gsize = group_size(inner_join(df1, df2, by=f.f)) + assert gsize == [0,4,0] + +def test_n_groups_respect_zero_len_groups(): + df = tibble(x=factor([1,2,3], levels=[1,2,3,4])) >> group_by(f.x, _drop=False) + assert n_groups(df) == 4 + +def test_summarise_respect_zero_len_groups(): + df = tibble(x=factor(rep([1,2,3], each=10), levels=[1,2,3,4])) + + out = df >> group_by(f.x, _drop=False) >> summarise(n=n()) + # assert out.obj.n.tolist() == [10,10,10,0] + # not supported when dataframe is empty + assert out.n.tolist() == [10,10,10] diff --git a/tests/test_dplyr_filter.py b/tests/test_dplyr_filter.py new file mode 100644 index 00000000..1efd07f0 --- /dev/null +++ b/tests/test_dplyr_filter.py @@ -0,0 +1,279 @@ +# tests grabbed from: +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-filter.r +import numpy +from pandas.core.groupby import groupby +from pipda.function import register_func +import pytest +from datar.all import * +from datar.datasets import mtcars, iris + +def test_handles_passing_args(): + df = tibble(x=range(1,5)) + def ff(*args): + x1 = 4 + f1 = lambda y: y + return df >> filter(*args, f1(x1) > f.x) + + def g(): + x2 = 2 + return ff(f.x > x2) + + res = g() + assert res.x.tolist() == [3] + + df >>= group_by(f.x) + res = g() + assert res.obj.x.tolist() == [3] + +def test_handles_simple_symbols(): + df = tibble(x=range(1,5), test=rep(c(TRUE,FALSE), each=2)) + res = filter(df, f.test) + + gdf = group_by(df, f.x) + res = filter(gdf, f.test) + + def h(data): + test2 = c(True, True, False, False) + return filter(data, test2) + + out = h(df) + assert out.equals(df.iloc[:2,:]) + + def ff(data, *args): + one = 1 + return filter(data, f.test, f.x > one, *args) + + def g(data, *args): + four = 4 + return ff(data, f.x < four, *args) + + df1 = mtcars >> group_by(f.cyl) >> filter(min(f.mpg) > 0) + # assert df1.equals(mtcars) + df2 = mtcars >> arrange(f.cyl, f.mpg) >> group_by(f.cyl) >> filter(min(f.mpg)>0) + # group_by not sorted until apply.
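+    # so the expected frame is arranged the same way before grouping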
+ df3 = mtcars >> arrange(f.cyl, f.mpg) >> group_by(f.cyl) + assert df2.obj.equals(df3.obj) + +def test_discards_na(): + temp = tibble( + i = range(1,6), + x = c(NA, 1,1,0,0) + ) + res = filter(temp, f.x == 1) + rows = nrow(res) + assert rows == 2 + +def test_returns_input_with_no_args(): + df = filter(mtcars) + assert df.equals(mtcars.reset_index(drop=True)) + +def test_complex_vec(): + d = tibble(x=range(1,11), y=[i+2j for i in range(1,11)]) + out = filter(d, f.x < 4) + assert out.y.tolist() == [i+2j for i in range(1,4)] + + out = d >> filter(Re(f.y) < 4) + assert out.y.tolist() == [i+2j for i in range(1,4)] + +def test_contains(): + df = tibble(a=c("a", "b", "ab"), g=c(1,1,2)) + + res = df >> filter(is_element(f.a, letters)) + rows = nrow(res) + assert rows == 2 + + res = df >> group_by(f.g) >> filter(is_element(f.a, letters)) + rows = nrow(res) + assert rows == 2 + +def test_row_number(): + z = tibble(a=[1,2,3]) + b = "a" + res = z >> filter(row_number() == 3) + rows = nrow(res) + assert rows == 0 + +def test_row_number_0col(): + out = tibble() >> mutate(a=row_number()) + assert nrow(out) == 0 + assert out.columns.tolist() == ['a'] + +def test_mixed_orig_df(): + df = tibble(x=range(1,11), g=rep(range(1,6),2)) + res = df >> group_by(f.g) >> filter(f.x > min(df.x)) + assert nrow(res) == 9 + +def test_empty_df(): + res = tibble() >> filter(False) + assert nrow(res) == 0 + assert len(res.columns) == 0 + +def test_true_true(): + df = tibble(x=range(1,6)) + res = filter(df, True, True) + assert res.equals(df) + +def test_rowwise(): + @register_func(None) + def grepl(a, b): + return numpy.array([x in y for x,y in zip(a,b)]) + df = tibble(First = c("string1", "string2"), + Second = c("Sentence with string1", "something")) + res = df >> rowwise() >> filter(grepl(f.First, f.Second)) + assert nrow(res) == 1 + + df1 = df >> slice(0) + df2 = ungroup(res) + assert df1.equals(df2) + +def test_grouped_filter_handles_indices(): + res = iris >> group_by(f.Species) >> filter(f.Sepal_Length>5) + res2 = res >> mutate(Petal = f.Petal_Width * f.Petal_Length) + + assert nrow(res) == nrow(res2) + grows1 = group_rows(res) + grows2 = group_rows(res2) + assert grows1 == grows2 + assert all(group_keys(res) == group_keys(res2)) + +def test_filter_false_handles_indices(): + # todo: figure out how to do _preserve=True + # out = mtcars >> group_by(f.cyl) >> filter( + # False, _preserve=True) + # out = group_rows(out) + # assert out == [[], [], []] + + out = mtcars >> group_by(f.cyl) >> filter( + False, _preserve=False) + out = group_rows(out) + assert out == {} + +# def test_hybrid_lag_and_default_value_for_string_cols(): + +def test_handles_tuple_columns(): + res = tibble(a=[1,2], x=[tuple(range(1,11)), tuple(range(1,6))]) >> filter( + f.a == 1) >> pull(f.x, to='list') + assert res == [tuple(range(1,11))] + + res = tibble(a=[1,2], x=[tuple(range(1,11)), tuple(range(1,6))]) >> group_by( + f.a) >> filter( + f.a == 1) >> pull(f.x, to='list') + assert res == [tuple(range(1,11))] + +def test_row_number_no_warning(caplog): + mtcars >> filter(row_number() > 1, row_number() < 5) + assert caplog.text == '' + +def test_preserve_order_across_groups(): + df = tibble(g=c(1,2,1,2,1), time=[5,4,3,2,1], x=f.time) + res1 = df >> group_by( + f.g + ) >> filter(f.x <= 4) >> ungroup() >> arrange(f.g, f.time) + + res2 = df >> arrange(f.g) >> group_by( + f.g + ) >> filter(f.x <=4) >> ungroup() >> arrange(f.g, f.time) + + res3 = df >> filter(f.x <= 4) >> group_by(f.g) >> ungroup() >> arrange(f.g, f.time) + res1.reset_index(drop=True, 
inplace=True) + res2.reset_index(drop=True, inplace=True) + res3.reset_index(drop=True, inplace=True) + assert res1.equals(res2) + assert res1.equals(res3) + # res1$time, res2$time, res3$time unsorted? + +def test_two_conds_not_freeze(): + df1 = iris >> filter(f.Sepal_Length > 7, f.Petal_Length < 6) + df2 = iris >> filter((f.Sepal_Length > 7) & (f.Petal_Length < 6)) + assert df1.equals(df2) + +def test_handles_df_cols(): + df = tibble( + x = [1,2], + z = tibble(A=[1,2], B=[3,4]) + ) + expect = df >> slice(0) + + out = filter(df, f.x == 1) + assert out.equals(expect) + out = filter(df, f['z$A'] == 1) + assert out.equals(expect) + + gdf = group_by(df, f.x) + + out = filter(gdf, f['z$A'] == 1) + assert out.obj.equals(expect) + out = filter(gdf, f['z$A'] == 1) + assert out.obj.equals(expect) + +# def test_handles_named_logical(): +# tbl = tibble(a={'a': True}) +# out = tbl >> filter(f.a) +# assert out.equals(tbl) + +def test_errors(): + # wrong type + with pytest.raises(ValueError): + iris >> group_by(f.Species) >> filter(range(1,10)) + with pytest.raises(ValueError): + iris >> filter(range(1,10)) + + # wrong size + with pytest.raises(ValueError): + iris >> group_by(f.Species) >> filter([True, False]) + with pytest.raises(ValueError): + iris >> rowwise(f.Species) >> filter([True, False]) + with pytest.raises(ValueError): + iris >> filter([True, False]) + + # wrong size in column + with pytest.raises(ValueError): + iris >> group_by(f.Species) >> filter(tibble([True, False])) + with pytest.raises(ValueError): + iris >> rowwise() >> filter(tibble([True, False])) + with pytest.raises(ValueError): + iris >> filter(tibble([True, False])) + with pytest.raises(ValueError): + tibble(x=1) >> filter([True, False]) + + # named inputs + with pytest.raises(TypeError): + mtcars >> filter(x=1) + with pytest.raises(TypeError): + mtcars >> filter(f.y>2, z=3) + with pytest.raises(TypeError): + mtcars >> filter(True, x=1) + + # across() in filter() does not warn yet + # tibble(x=1, y=2) >> filter(across(everything(), lambda x: x>0)) + +def test_preserves_grouping(): + gf = tibble(g=[1,1,1,2,2], x=[1,2,3,4,5]) >> group_by(f.g) + + out = gf >> filter(is_element(f.x, [3,4])) + assert group_vars(out) == ['g'] + assert group_rows(out) == {1: [0], 2: [1]} + + out = gf >> filter(f.x < 3) + assert group_vars(out) == ['g'] + assert group_rows(out)[1].tolist() == [0, 1] + +def test_works_with_if_any_if_all(): + df = tibble(x1=range(1,11), x2=c(range(1,6), 10, 9, 8, 7, 6)) + df1 = df >> filter(if_all(starts_with("x"), lambda x: x>6)) + df2 = df >> filter((f.x1 > 6) & (f.x2 > 6)) + assert df1.equals(df2) + + df1 = df >> filter(if_any(starts_with("x"), lambda x: x>6)) + df2 = df >> filter((f.x1 > 6) | (f.x2 > 6)) + assert df1.equals(df2) diff --git a/tests/test_dplyr_group_by.py b/tests/test_dplyr_group_by.py new file mode 100644 index 00000000..64c6d9e7 --- /dev/null +++ b/tests/test_dplyr_group_by.py @@ -0,0 +1,336 @@ +#https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-group-by.r + +from pandas.core import groupby +import pytest + +from datar.all import * +from datar.datasets import mtcars, iris +from datar.core.exceptions import ColumnNotExistingError + +@pytest.fixture +def df(): + return tibble(x = rep([1,2,3], each = 10), y = rep(range(1,7), each = 5)) + +def test_add(df): + tbl = df >> group_by(f.x, f.y, _add=True) + gvars = group_vars(tbl) + assert gvars == ['x', 'y'] + + tbl = df >> group_by(f.x, _add=True) >> group_by(f.y, _add=True) + gvars = group_vars(tbl) + assert gvars == ['x', 'y'] + +def 
test_join_preserve_grouping(df): + g = df >> group_by(f.x) + + tbl = g >> inner_join(g, by=['x', 'y']) + gvars = tbl >> group_vars() + assert gvars == ['x'] + + tbl = g >> left_join(g, by=['x', 'y']) + gvars = tbl >> group_vars() + assert gvars == ['x'] + + tbl = g >> semi_join(g, by=['x', 'y']) + gvars = tbl >> group_vars() + assert gvars == ['x'] + + tbl = g >> anti_join(g, by=['x', 'y']) + gvars = tbl >> group_vars() + assert gvars == ['x'] + +def test_tibble_lose_grouping(df): + g = df >> group_by(f.x) + tbl = tibble(g) + # with pytest.raises(NotImplementedError): + assert group_vars(tbl) == [] + +# group by a string is also referring to the column + +def test_mutate_does_not_loose_variables(): + df = tibble(a = rep([1,2,3,4], 2), b = rep([1,2,3,4], each = 2), x = runif(8)) + by_ab = df >> group_by(f.a, f.b) + by_a = by_ab >> summarise(x=sum(f.x), _groups="drop_last") + by_a_quantile = by_a >> group_by(quantile=ntile(f.x, 4)) + + assert by_a_quantile.obj.columns.tolist() == ["a", "b", "x", "quantile"] + +def test_orders_by_groups(): + df = tibble(a = sample(range(1,11), 3000, replace = TRUE)) >> group_by(f.a) + out = df >> count() + assert out.obj.a.tolist() == list(range(1,11)) + + df = tibble(a = sample(letters[:10], 3000, replace = TRUE)) >> group_by(f.a) + out = df >> count() + assert out.obj.a.tolist() == letters[:10] + + df = tibble(a = sample(sqrt(range(1,11)), 3000, replace = TRUE)) >> group_by(f.a) + out = df >> count() + expect = list(sqrt(range(1,11))) + assert out.obj.a.tolist() == expect + +def test_by_tuple_values(): + df = tibble( + x=[1,2,3], + y=[(1,2), (1,2,3), (1,2)] + ) >> group_by(f.y) + out = df >> count() + assert out.obj.y.tolist() == [(1,2), (1,2,3)] + assert out.obj.n.tolist() == [2, 1] + +def test_select_add_group_vars(): + res = mtcars >> group_by(f.vs) >> select(f.mpg) + assert res.obj.columns.tolist() == ['vs', 'mpg'] + +def test_one_group_for_NA(): + x = c(NA, NA, NA, range(10,0,-1), range(10,0,-1)) + w = c(20, 30, 40, range(1,11), range(1,11)) + n_dist = n_distinct(x) + assert n_dist == 11 + + res = tibble(x = x, w = w) >> group_by(f.x) >> summarise(n = n()) + rows = res >> nrow() + assert rows == 11 + +def test_zero_row_dfs(): + df = tibble(a=1,b=1,g=1).loc[[], :] + dfg = df >> group_by(f.g, _drop=False) + d = dfg >> dim() + assert d == (0, 3) + + x = dfg >> summarise(n=n()) + d = x >> dim() + assert d == (0, 2) + # with pytest.raises(NotImplementedError): + assert group_vars(x) == [] + + x = dfg >> mutate(c = f.b+1) + d = x >> dim() + assert d == (0, 4) + gvars = x >> group_vars() + assert gvars == ['g'] + + x = dfg >> filter(f.a==100) + d = x >> dim() + assert d == (0, 3) + gvars = x >> group_vars() + assert gvars == ['g'] + + x = dfg >> arrange(f.a, f.g) + d = x >> dim() + assert d == (0, 3) + gvars = x >> group_vars() + assert gvars == ['g'] + + x = dfg >> select(f.a) + d = x >> dim() + assert d == (0, 2) + gvars = x >> group_vars() + assert gvars == ['g'] + +def test_does_not_affect_input_data(): + df = tibble(x=1) + dfg = df >> group_by(f.x) + assert df.x.tolist() == [1] + +def test_0_groups(): + df = tibble(x=1).loc[[], :] >> group_by(f.x) + res = df >> mutate(y=mean(f.x), z=+mean(f.x), n=n()) + assert res.obj.columns.tolist() == ['x', 'y', 'z', 'n'] + rows = res >> nrow() + assert rows == 0 + +def test_0_groups_filter(): + df = tibble(x=1).loc[[], :] >> group_by(f.x) + res = df >> filter(f.x > 3) + d1 = df >> dim() + d2 = res >> dim() + assert d1 == d2 + assert df.obj.columns.tolist() == res.obj.columns.tolist() + +def test_0_groups_select(): + 
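+    # select on a 0-row grouped frame should keep its dimensions and columns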
df = tibble(x=1).loc[[], :] >> group_by(f.x) + res = df >> select(f.x) + d1 = df >> dim() + d2 = res >> dim() + assert d1 == d2 + assert df.obj.columns.tolist() == res.obj.columns.tolist() + +def test_0_groups_arrange(): + df = tibble(x=1).loc[[], :] >> group_by(f.x) + res = df >> arrange(f.x) + d1 = df >> dim() + d2 = res >> dim() + assert d1 == d2 + assert df.obj.columns.tolist() == res.obj.columns.tolist() + +def test_0_vars(df): + with pytest.raises(ValueError): + df >> group_by() + +def test_drop(): + res = iris >> filter(f.Species == "setosa") >> group_by( + f.Species, _drop = TRUE + ) + out = res >> count() >> nrow() + assert out == 1 + +def test_remember_drop_true(): + res = iris >> group_by( + f.Species, + _drop=True + ) + assert group_by_drop_default(res) + + res2 = res >> filter(f.Sepal_Length > 5) + assert group_by_drop_default(res2) + + res3 = res >> filter(f.Sepal_Length > 5, _preserve = FALSE) + assert group_by_drop_default(res3) + + res4 = res3 >> group_by(f.Species) + assert group_by_drop_default(res4) + + # group_data to be implemented + +def test_remember_drop_false(): + res = iris >> filter( + f.Species == "setosa" + ) >> group_by(f.Species, _drop = FALSE) + assert not group_by_drop_default(res) + + res2 = res >> group_by(f.Species) + assert not group_by_drop_default(res2) + +# todo +# def test_drop_false_preserve_ordered_factors(): +# ... + +def test_summarise_maintains_drop(): + df = tibble( + f1 = factor("a", levels = c("a", "b", "c")), + f2 = factor("d", levels = c("d", "e", "f", "g")), + x = 42 + ) + res = df >> group_by(f.f1, f.f2, _drop = TRUE) + ng = n_groups(res) + assert ng == 1 + assert group_by_drop_default(res) + + # DataFrame.groupby(..., observed=False) doesn't support multiple categoricals + # res1 = df >> group_by(f.f1, f.f2, _drop=False) + # ng = n_groups(res1) + # assert ng == 12 + + res1 = df >> group_by(f.f1, _drop = TRUE) + ng = n_groups(res1) + assert ng == 1 + + res1 = df >> group_by(f.f1, _drop = FALSE) + ng = n_groups(res1) + assert ng == 3 + + res1 = df >> group_by(f.f2, _drop = FALSE) + ng = n_groups(res1) + assert ng == 4 + + res2 = res >> summarise(x=sum(f.x), _groups="drop_last") + ng = n_groups(res2) + assert ng == 1 + assert group_by_drop_default(res2) + +def test_joins_maintains__drop(): + df1 = group_by(tibble( + f1 = factor(c("a", "b"), levels = c("a", "b", "c")), + x = [42,43] + ), f.f1, _drop = TRUE) + + df2 = group_by(tibble( + f1 = factor(c("a"), levels = c("a", "b", "c")), + y = 1 + ), f.f1, _drop = TRUE) + + res = left_join(df1, df2, by = "f1") + assert n_groups(res) == 2 + + df2 = group_by(tibble( + f1 = factor(c("a", "c"), levels = c("a", "b", "c")), + y = [1,2] + ), f.f1, _drop = TRUE) + res = full_join(df1, df2, by = "f1") + assert n_groups(res) == 3 + +def test_add_passes_drop(): + d = tibble( + f1 = factor("b", levels = c("a", "b", "c")), + f2 = factor("g", levels = c("e", "f", "g")), + x = 48 + ) + + res = group_by(group_by(d, f.f1, _drop = TRUE), f.f2, _add = TRUE) + ng = n_groups(res) + assert ng == 1 + assert group_by_drop_default(res) + +def test_na_last(): + # this is a pandas bug when try to retrieve groupby groups with NAs + # https://github.com/pandas-dev/pandas/issues/35202 + res = tibble(x = c("apple", NA, "banana"), y = range(1,4)) >> group_by(f.x) + # ret = res >> group_rows() + + lvls = res.grouper.levels[0].fillna('NA') + assert lvls.tolist() == ['apple', 'banana', 'NA'] + +def test_auto_splicing(): + df1 = iris >> group_by(f.Species) + df2 = iris >> group_by(tibble(Species=iris.Species)) + assert 
df1.obj.equals(df2.obj) + + df1 = iris >> group_by(f.Species) + df2 = iris >> group_by(across(f.Species)) + assert df1.obj.equals(df2.obj) + + df1 = iris >> mutate(across(starts_with("Sepal"), round)) >> group_by( + f.Sepal_Length, f.Sepal_Width) + df2 = iris >> group_by(across(starts_with("Sepal"), round)) + assert df1.obj.equals(df2.obj) + + # across(character()), across(NULL) not supported + + df1 = iris >> mutate(across(starts_with("Sepal"), round)) >> group_by( + f.Sepal_Length, f.Sepal_Width, f.Species) + df2 = iris >> group_by(across(starts_with("Sepal"), round), f.Species) + assert df1.obj.equals(df2.obj) + + df1 = iris >> mutate(across(starts_with("Sepal"), round)) >> group_by( + f.Species, f.Sepal_Length, f.Sepal_Width) + df2 = iris >> group_by(f.Species, across(starts_with("Sepal"), round)) + assert df1.obj.equals(df2.obj) + +def test_mutate_semantics(): + df1 = tibble(a = 1, b = 2) >> group_by(c = f.a * f.b, d = f.c + 1) + df2 = tibble(a = 1, b = 2) >> mutate( + c = f.a * f.b, d = f.c + 1 + ) >> group_by(f.c, f.d) + assert df1.obj.equals(df2.obj) + +def test_implicit_mutate_operates_on_ungrouped_data(): + vars = tibble(x = c(1,2), y = c(3,4), z = c(5,6)) >> group_by(f.y) + vars >>= group_by(across(any_of(c('y','z')))) + gv = group_vars(vars) + assert gv == ['y', 'z'] + +def test_errors(): + df = tibble(x=1, y=2) + + with pytest.raises(ColumnNotExistingError): + df >> group_by(f.unknown) + + with pytest.raises(ValueError): + df >> ungroup(f.x) + + with pytest.raises(ValueError): + df >> group_by(f.x, f.y) >> ungroup(f.z) + + with pytest.raises(ColumnNotExistingError): + df >> group_by(z=f.a+1) diff --git a/tests/test_dplyr_rowwise.py b/tests/test_dplyr_rowwise.py new file mode 100644 index 00000000..d36a7d1c --- /dev/null +++ b/tests/test_dplyr_rowwise.py @@ -0,0 +1,61 @@ +# tests grabbed from: +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-rowwise.r +import pytest +from pandas.core.groupby.generic import DataFrameGroupBy +from datar.all import * + +def test_preserved_by_major_verbs(): + rf = rowwise(tibble(x=range(1,6), y=[5,4,3,2,1]), "x") + + out = arrange(rf, f.y) + assert out.flags.rowwise == ['x'] + assert group_vars(out) == ['x'] + + out = filter(rf, f.x < 3) + assert out.flags.rowwise == ['x'] + assert group_vars(out) == ['x'] + + out = mutate(rf, x=f.x+1) + assert out.flags.rowwise == ['x'] + assert group_vars(out) == ['x'] + + out = rename(rf, X=f.x) + assert out.flags.rowwise == ['X'] + assert group_vars(out) == ['X'] + + out = select(rf, f.x) + assert out.flags.rowwise == ['x'] + assert group_vars(out) == ['x'] + + out = slice(rf, c(1,1)) + assert out.flags.rowwise == ['x'] + assert group_vars(out) == ['x'] + + out = summarise(rf, z=mean([f.x, f.y])) + assert isinstance(out, DataFrameGroupBy) + assert group_vars(out) == ['x'] + +def test_rowwise_preserved_by_assign_only(): + rf = rowwise(tibble(x=range(1,6), y=[5,4,3,2,1]), "x") + rf['z'] = [5,4,3,2,1] + + assert rf.flags.rowwise == ['x'] + assert group_vars(rf) == ['x'] + +def test_shows_in_display(caplog): + rf = rowwise(tibble(x = range(1,6)), "x") + rf >> display() + assert "# [DataFrame] Rowwise: ['x']" in caplog.text + +def test_rowwise_captures_group_vars(): + df = group_by(tibble(g = [1,2], x = [1,2]), f.g) + rw = rowwise(df) + assert group_vars(rw) == ['g'] + + with pytest.raises(ValueError): + rowwise(df, f.x) + +def test_can_rowwise(): + rf1 = rowwise(tibble(x = range(1,6), y = range(1,6)), "x") + rf2 = rowwise(rf1, f.y) + assert group_vars(rf2) == ['y'] diff --git 
a/tests/test_dplyr_summarise.py b/tests/test_dplyr_summarise.py new file mode 100644 index 00000000..97bfe467 --- /dev/null +++ b/tests/test_dplyr_summarise.py @@ -0,0 +1,253 @@ +# tests grabbed from: +# https://github.com/tidyverse/dplyr/blob/master/tests/testthat/test-summarise.r +from pandas.core.frame import DataFrame +from pipda.function import register_func +from datar.core.contexts import Context +import pytest +from datar.all import * +from datar.datasets import mtcars +from datar.core.exceptions import ColumnNotExistingError + +def test_freshly_create_vars(): + df = tibble(x=range(1,11)) + out = summarise(df, y=mean(f.x), z=f.y+1) + assert out.y.to_list() == [5.5] + assert out.z.to_list() == [6.5] + +def test_input_recycled(): + df1 = tibble() >> summarise(x=1, y=[1,2,3], z=1) + df2 = tibble(x=1, y=[1,2,3], z=1) + assert df1.equals(df2) + + gf = group_by(tibble(a = [1,2]), f.a) + df1 = gf >> summarise(x=1, y=[1,2,3], z=1) + df2 = tibble( + a = rep([1,2], each = 3), + x = 1, + y = c([1,2,3], [1,2,3]), + z = 1 + ) >> group_by(f.a) + assert df1.obj.equals(df2.obj) + + df1 = gf >> summarise(x = seq_len(f.a), y = 1) + df2 = tibble(a = c(1, 2, 2), x = c(0, 0, 1), y = 1) >> group_by(f.a) + assert df1.obj.equals(df2.obj) + +def test_works_with_empty_data_frames(): + df = tibble(x=[]) + df1 = summarise(df) + df2 = tibble(_rows=1) + assert df1.equals(df2) + + df = tibble(_rows=10) + df1 = summarise(df) + assert df1.equals(df2) + + df1 = df >> summarise(n=n()) + df2 = tibble(n=10) + assert df1.equals(df2) + +def test_works_with_grouped_empty_data_frames(): + df = tibble(x=[]) + df1 = df >> group_by(f.x) >> summarise(y = 1) + assert dim(df1) == (0, 2) + assert df1.columns.tolist() == ['x', 'y'] + + df1 = df >> rowwise(f.x) >> summarise(y = 1) + assert group_vars(df1) == ['x'] + assert dim(df1.obj) == (0, 2) + assert df1.obj.columns.tolist() == ['x', 'y'] + +def test_no_expressions(): + df = tibble(x = [1,2], y = [1,2]) + gf = group_by(df, f.x) + + out = summarise(df) + assert dim(out) == (1, 0) + + out = summarise(gf) + assert group_vars(out) == [] + exp = tibble(x=[1,2]) + assert out.equals(exp) + + out = summarise(df, {}) + assert dim(out) == (1, 0) + + out = summarise(gf, {}) + assert group_vars(out) == [] + exp = tibble(x=[1,2]) + assert out.equals(exp) + +def test_0col_df_in_results_ignored(): + df1 = tibble(x=[1,2]) + df2 = df1 >> group_by(f.x) >> summarise(tibble()) + assert df2.equals(df1) + + df2 = df1 >> group_by(f.x) >> summarise(tibble(), y=65) + df3 = df1 >> mutate(y=65) + assert df2.equals(df3) + + df2 = tibble(x=[1,2], y=[3,4]) + df3 = df2 >> group_by(f.x) >> summarise(tibble()) + assert df3.equals(df1) + + df3 = df2 >> group_by(f.x) >> summarise(tibble(), z=98) + df4 = df1 >> mutate(z=98) + assert df3.equals(df4) + +def test_peels_off_a_single_layer_of_grouping(): + df = tibble(x=rep([1,2,3,4], each=4), y=rep([1,2], each=8), z=runif(16)) + gf = df >> group_by(f.x, f.y) + + assert group_vars(summarise(gf)) == ['x'] + assert group_vars(summarise(summarise(gf))) == [] + +def test_correctly_reconstructs_groups(): + d = tibble(x=[1,2,3,4], g1=rep([1,2], 2), g2=[1,2,3,4]) >> group_by( + f.g1, f.g2 + ) >> summarise(x = f.x + 1) + # order is different from dplyr + assert group_rows(d)[1].tolist() == [0, 2] + assert group_rows(d)[2].tolist() == [1, 3] + +def test_modify_grouping_vars(): + df = tibble(a = c(1, 2, 1, 2), b = c(1, 1, 2, 2)) + gf = group_by(df, f.a, f.b) + out = summarise(gf, a=f.a+1) + assert out.obj.a.tolist() == [2,3,2,3] + +def test_allows_names(): + res = tibble(x = 
[1,2,3], y = letters[:3]) >> group_by( + f.y + ) >> summarise( + a = length(f.x), + b = quantile(f.x, 0.5) + ) + assert res.b.tolist() == [1., 2., 3.] + +def test_list_output_columns(): + df = tibble(x = range(1,11), g = rep([1,2], each = 5)) + res = df >> group_by(f.g) >> summarise(y = [f.x]) >> pull(f.y, to='list') + assert res[0].tolist() == [1,2,3,4,5] + +def test_unnamed_tibbles_are_unpacked(): + df = tibble(x = [1,2]) + + @register_func(None, context=Context.EVAL) + def tibble_func(**kwargs): + return tibble(**kwargs) + + out = summarise(df, tibble_func(y = f.x * 2, z = 3)) + assert out.y.tolist() == [2,4] + assert out.z.tolist() == [3,3] + +def test_named_tibbles_are_packed(): + @register_func(None, context=Context.EVAL) + def tibble_func(**kwargs): + return tibble(**kwargs) + + df = tibble(x = [1,2]) + out = summarise(df, df = tibble_func(y = f.x * 2, z = 3)) >> pull(f.df) + assert out.y.tolist() == [2,4] + assert out.z.tolist() == [3,3] + +def test_groups_arg(caplog): + df = tibble(x=1, y=2) + out = df >> group_by(f.x, f.y) >> summarise() + assert out.obj.equals(df) + assert "has grouped output by ['x']" in caplog.text + caplog.clear() + + df >> rowwise(f.x, f.y) >> summarise() >> display() + assert "[DataFrameGroupBy] Groups: ['x', 'y'] (1)" in caplog.text + caplog.clear() + + df = tibble(x = 1, y = 2) + df1 = df >> summarise(z = 3, _groups= "rowwise") + df2 = rowwise(tibble(z = 3)) + assert df1.flags.rowwise + assert df2.flags.rowwise + assert df1.equals(df2) + + gf = df >> group_by(f.x, f.y) + gvars = gf >> summarise() >> group_vars() + assert gvars == ['x'] + gvars = gf >> summarise(_groups = "drop_last") >> group_vars() + assert gvars == ['x'] + gvars = gf >> summarise(_groups = "drop") >> group_vars() + assert gvars == [] + gvars = gf >> summarise(_groups = "keep") >> group_vars() + assert gvars == ['x', 'y'] + + rf = df >> rowwise(f.x, f.y) + gvars = rf >> summarise(_groups = "drop") >> group_vars() + assert gvars == [] + gvars = rf >> summarise(_groups = "keep") >> group_vars() + assert gvars == ['x', 'y'] + +def test_casts_data_frame_results_to_common_type(): + df = tibble(x=[1,2], g=[1,2]) >> group_by(f.g) + + @register_func(None, context=Context.EVAL) + def df_of_g(g): + if g.tolist() == [1]: + return tibble(y=1) + return tibble(y=1, z=2) + + res = df >> summarise(df_of_g(f.g), _groups='drop') + assert res.z.fillna(0).tolist() == [0, 2] + +def test_silently_skips_when_all_results_are_null(): + df = tibble(x = [1,2], g = [1,2]) >> group_by(f.g) + + df1 = summarise(df, x=NULL) + df2 = summarise(df) + assert df1.equals(df2) + +def test_errors(caplog): + df = tibble(x = 1, y = 2) + out = df >> group_by(f.x, f.y) >> summarise() + assert "`summarise()` has grouped output by ['x']" in caplog.text + assert out.obj.equals(df) + caplog.clear() + + out = tibble(x=1, y=2) >> group_by(f.x, f.y) >> summarise(z=[2,2]) + assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text + exp = tibble(x=[1,1], y=[2,2], z=[2,2]) + assert out.obj.equals(exp) + caplog.clear() + + out = df >> rowwise(f.x, f.y) >> summarise() + assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text + assert out.obj.equals(df) + caplog.clear() + + out = df >> rowwise() >> summarise() + assert "`summarise()` has ungrouped output" in caplog.text + d = dim(out) + assert d == (1, 0) + caplog.clear() + + # unsupported type (but python objects are supported by pandas) + # not testing for types further + tibble(x = 1, y = c(1, 2, 2), z = runif(3)) >> summarise(a=object()) + + # incompatible size
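+    # summarised columns must recycle to a common length; 3 values against 2 cannot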
+    gf = df >> group_by(f.x, f.y)
+    gvars = gf >> summarise() >> group_vars()
+    assert gvars == ['x']
+    gvars = gf >> summarise(_groups = "drop_last") >> group_vars()
+    assert gvars == ['x']
+    gvars = gf >> summarise(_groups = "drop") >> group_vars()
+    assert gvars == []
+    gvars = gf >> summarise(_groups = "keep") >> group_vars()
+    assert gvars == ['x', 'y']
+
+    rf = df >> rowwise(f.x, f.y)
+    gvars = rf >> summarise(_groups = "drop") >> group_vars()
+    assert gvars == []
+    gvars = rf >> summarise(_groups = "keep") >> group_vars()
+    assert gvars == ['x', 'y']
+
+def test_casts_data_frame_results_to_common_type():
+    df = tibble(x=[1,2], g=[1,2]) >> group_by(f.g)
+
+    @register_func(None, context=Context.EVAL)
+    def df_of_g(g):
+        if g.tolist() == [1]:
+            return tibble(y=1)
+        return tibble(y=1, z=2)
+
+    res = df >> summarise(df_of_g(f.g), _groups='drop')
+    assert res.z.fillna(0).tolist() == [0, 2]
+
+def test_silently_skips_when_all_results_are_null():
+    df = tibble(x = [1,2], g = [1,2]) >> group_by(f.g)
+
+    df1 = summarise(df, x=NULL)
+    df2 = summarise(df)
+    assert df1.equals(df2)
+
+def test_errors(caplog):
+    df = tibble(x = 1, y = 2)
+    out = df >> group_by(f.x, f.y) >> summarise()
+    assert "`summarise()` has grouped output by ['x']" in caplog.text
+    assert out.obj.equals(df)
+    caplog.clear()
+
+    out = tibble(x=1, y=2) >> group_by(f.x, f.y) >> summarise(z=[2,2])
+    assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text
+    exp = tibble(x=[1,1], y=[2,2], z=[2,2])
+    assert out.obj.equals(exp)
+    caplog.clear()
+
+    out = df >> rowwise(f.x, f.y) >> summarise()
+    assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text
+    assert out.obj.equals(df)
+    caplog.clear()
+
+    out = df >> rowwise() >> summarise()
+    assert "`summarise()` has ungrouped output" in caplog.text
+    d = dim(out)
+    assert d == (1, 0)
+    caplog.clear()
+
+    # unsupported type (but python objects are supported by pandas)
+    # not testing for types further
+    tibble(x = 1, y = c(1, 2, 2), z = runif(3)) >> summarise(a=object())
+
+    # incompatible size
+    with pytest.raises(ValueError):
+        tibble(z = 1) >> summarise(x = [1,2,3], y = [1,2])
+    with pytest.raises(ValueError):
+        tibble(z = [1,2]) >> group_by(f.z) >> summarise(x = [1,2,3], y = [1,2])
+    with pytest.raises(ValueError):
+        tibble(z=c(1, 3)) >> group_by(f.z) >> summarise(x=seq_len(f.z), y=[1,2])
+
+    # Missing variable
+    with pytest.raises(ColumnNotExistingError):
+        summarise(mtcars, a = mean(f.not_there))
+
+    with pytest.raises(ColumnNotExistingError):
+        summarise(group_by(mtcars, f.cyl), a = mean(f.not_there))
+
+    # Duplicate column names
+    x = 1
+    with pytest.raises(ValueError):
+        tibble(x, x, _name_repair="minimal") >> summarise(f.x)
diff --git a/tests/test_tibble.py b/tests/test_tibble.py
index 48bb1ad2..47c04e1f 100644
--- a/tests/test_tibble.py
+++ b/tests/test_tibble.py
@@ -1,66 +1,242 @@
+# https://github.com/tidyverse/tibble/blob/master/tests/testthat/test-tibble.R
 import pytest
-from pandas import DataFrame
-from pipda import Symbolic
-from datar.tibble import *
+from datar import f
+from datar.tibble import tibble, tribble
+from datar.base import nrow, rep, dim, sum, diag, NA, letters, LETTERS, NULL
+from datar.dplyr import pull
+
+def test_correct_rows():
+    out = tibble(value=range(1,11)) >> nrow()
+    assert out == 10
+    out = tibble(value=range(1,11), name="recycle_me") >> nrow()
+    assert out == 10
+    out = tibble(name="recycle_me", value=range(1,11)) >> nrow()
+    assert out == 10
+    out = tibble(name="recycle_me", value=range(1,11), value2=range(11,21)) >> nrow()
+    assert out == 10
+    out = tibble(value=range(1,11), name="recycle_me", value2=range(11,21)) >> nrow()
+    assert out == 10
+
+def test_null_none_ignored():
+    out = tibble(a=None)
+    expect = tibble()
+    assert out.equals(expect)
+
+    out = tibble(a_=None, a=1)
+    expect = tibble(a=1)
+    assert out.equals(expect)
+
+    out = tibble(a=None, b=1, c=[2,3])
+    expect = tibble(b=1, c=[2,3])
+    assert out.equals(expect)
+
+    out = tibble(None, b=1, c=[2,3])
+    expect = tibble(b=1, c=[2,3])
+    assert out.equals(expect)
+
+def test_recycle_scalar_or_len1_vec():
+    out = tibble(value=range(1,11)) >> nrow()
+    assert out == 10
+    out = tibble(value=range(1,11), y=1) >> nrow()
+    assert out == 10
+    out = tibble(value=range(1,11), y=[1]) >> nrow()
+    assert out == 10
+    with pytest.raises(ValueError):
+        tibble(value=range(1,11), y=[1,2,3])
+
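+# note: a one-row data frame passed as a column recycles to the length of
+# the other columns, whichever order the arguments are given in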
+def test_recycle_nrow1_df():
+    out = tibble(x=range(1,11), y=tibble(z=1))
+    expect = tibble(x=range(1,11), y=tibble(z=rep(1,10)))
+    assert out.equals(expect)
+
+    out = tibble(y=tibble(z=1), x=range(1,11))
+    expect = tibble(y=tibble(z=rep(1,10)), x=range(1,11))
+    assert out.equals(expect)
+
+    out = tibble(x=1, y=tibble(z=range(1,11)))
+    expect = tibble(x=rep(1,10), y=tibble(z=range(1,11)))
+    assert out.equals(expect)
+
+    out = tibble(y=tibble(z=range(1,11)), x=1)
+    expect = tibble(y=tibble(z=range(1,11)), x=rep(1,10))
+    assert out.equals(expect)
+
+def test_missing_names():
+    x = range(1,11)
+    df = tibble(x, y=x)
+    assert df.columns.tolist() == ['x', 'y']
+
+def test_empty():
+    zero = tibble()
+    d = zero >> dim()
+    assert d == (0, 0)
+    assert zero.columns.tolist() == []
+
+def test_hierarchical_names():
+    foo = tibble(x=tibble(y=1,z=2))
+    assert foo.columns.tolist() == ['x$y', 'x$z']
+    pulled = foo >> pull(f.x)
+    assert pulled.columns.tolist() == ['y', 'z']
+
+    foo = tibble(x=dict(y=1,z=2))
+    assert foo.columns.tolist() == ['x$y', 'x$z']
+    pulled = foo >> pull(f.x)
+    assert pulled.columns.tolist() == ['y', 'z']
+
+
+def test_meta_flags_preserved():
+    foo = tibble(x=1)
+    foo.flags.rowwise = True
+    bar = tibble(foo)
+    assert bar.flags.rowwise == True
+
+def test_f_pronoun():
+    foo = tibble(a=1, b=f.a)
+    bar = tibble(a=1, b=1)
+    assert foo.equals(bar)
+
+def test_mutate_semantics():
+    foo = tibble(a=[1,2], b=1, c=f.b / sum(f.b))
+    bar = tibble(a=[1,2], b=[1,1], c=[.5,.5])
+    assert foo.equals(bar)
+
+    foo = tibble(b=1, a=[1,2], c=f.b / sum(f.b))
+    bar = tibble(b=[1,1], a=[1,2], c=[.5,.5])
+    assert foo.equals(bar)
+
+    foo = tibble(b=1.0, c=f.b / sum(f.b), a=[1,2])
+    bar = tibble(b=[1.0,1.0], c=[1.0,1.0], a=[1,2])
+    assert foo.equals(bar)
+
+# TODO: units preserved when recycled
+
+def test_auto_splicing_anonymous_tibbles():
+    df = tibble(a=1, b=2)
+    out = tibble(df)
+    assert out.equals(df)
+
+    out = tibble(df, c=f.b)
+    expect = tibble(a=1,b=2,c=2)
+    assert out.equals(expect)
+
+def test_coerce_dict_of_df():
+    df = tibble(x=range(1,11))
+    out = tibble(dict(x=df)) >> nrow()
+    assert out == 10
+
+    out = tibble(dict(x=diag(5))) >> nrow()
+    assert out == 5
+
+def test_subsetting_correct_nrow():
+    df = tibble(x=range(1,11))
+    out = tibble(x=df).loc[:4,:]
+    expect = tibble(x=df.loc[:4,:])
+    assert out.equals(expect)
+
+def test_one_row_retains_column():
+    out = tibble(y=diag(5)).loc[0, :]
+    expect = tibble(y=diag(5).loc[0, :].values)
+    assert (out.values.flatten() == expect.values.flatten()).all()
+
+# tribble
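+# tribble builds the frame row by row: the leading f.<column> symbols give
+# the header, and the remaining positional values fill the rows left to right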
"b"], colB=[1,2]) + assert out.equals(expect) + +# todo: handle column as class + +def test_non_atomic_value(): + out = tribble(f.a, f.b, NA, "A", letters, LETTERS[1:]) + expect = tibble(a=[NA, letters], b=["A", LETTERS[1:]]) + assert out.equals(expect) + + out = tribble(f.a, f.b, NA, NULL, 1, 2) + expect = tibble(a=[NA, 1], b=[NULL, 2]) + assert out.equals(expect) + +def test_errors(): + with pytest.raises(ValueError): + tribble(1, 2, 3) + with pytest.raises(ValueError): + tribble("a", "b", 1, 2) + + out = tribble(f.a, f.b, f.c, 1,2,3,4,5) + # missing values filled with NA, unlike R + expect = tibble(a=[1,4], b=[2,5], c=[3,NA]) + assert out.fillna(0).equals(expect.fillna(0)) + +def test_dict_value(): + out = tribble(f.x, f.y, 1, dict(a=1), 2, dict(b=2)) + assert out.x.values.tolist() == [1,2] + assert out.y.values.tolist() == [dict(a=1), dict(b=2)] + +def test_empty_df(): + out = tribble(f.x, f.y) + expect = tibble(x=[], y=[]) + assert out.columns.tolist() == ['x', 'y'] + assert out.shape == (0, 2) + assert expect.columns.tolist() == ['x', 'y'] + assert expect.shape == (0, 2) + +def test_0x0(): + df = tibble() + expect = tibble() + assert df.equals(expect) + +def test_names_not_stripped(): + # different from R + df = tribble(f.x, dict(a=1)) + out = df >> pull(f.x, to='list') + assert out == [dict(a=1)] + +def test_dup_cols(): + df = tribble(f.x, f.x, 1, 2) + assert df.columns.tolist() == ['x', 'x'] + + x = 1 + df = tibble(x, x, _name_repair='minimal') + assert df.columns.tolist() == ['x', 'x'] diff --git a/tests/test_utils.py b/tests/test_utils.py index 460df8d8..184c0b4a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -22,5 +22,3 @@ def test_head_tail(): assert len(z) == 6 with pytest.raises(TypeError): tail(3) - -