Skip to content

Commit

Permalink
🔖 0.8.6 (#134)
Browse files Browse the repository at this point in the history
  • Loading branch information
pwwang authored Aug 25, 2022
1 parent 4f9c2a3 commit cd2595b
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 44 deletions.
2 changes: 1 addition & 1 deletion datar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)

__all__ = ("f", "get_versions")
__version__ = "0.8.5"
__version__ = "0.8.6"

apply_init_callbacks()

Expand Down
45 changes: 35 additions & 10 deletions datar/base/arithmetic.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Arithmetic or math functions"""

from functools import singledispatch
import inspect
from typing import TYPE_CHECKING, Union

Expand Down Expand Up @@ -883,18 +884,42 @@ def std(
sd = std


@func_factory("transform", {"x", "w"})
def weighted_mean(
x: Series, w: Series = 1, na_rm=True, __args_raw=None
) -> Series:
"""Calculate weighted mean"""
if __args_raw["w"] is not None and np.nansum(w) == 0:
@singledispatch
def _weighted_mean(
    df: DataFrame,
    has_w: bool = True,
    na_rm: bool = True,
) -> np.ndarray:
    """Compute the (optionally weighted) mean of the "x" column of `df`.

    Args:
        df: A frame with an "x" column and, when `has_w` is True, a "w"
            column holding the weights.
        has_w: Whether real weights were supplied by the caller. When
            False, a plain (unweighted) mean of "x" is returned.
        na_rm: Whether to drop NA values of "x" (and their paired
            weights) before averaging.

    Returns:
        The scalar (weighted) mean, or NaN when the weights sum to zero.
    """
    if not has_w:
        # No weights supplied: fall back to a plain mean.
        return np.nanmean(df["x"]) if na_rm else np.mean(df["x"])

    # All-zero (or all-NA) weights make the weighted mean undefined.
    if np.nansum(df["w"]) == 0:
        return np.nan

    if na_rm:
        # Drop NA observations together with their corresponding weights
        # so np.average sees aligned, NA-free arrays.
        na_mask = pd.isnull(df["x"])
        x = df["x"][~na_mask.values]
        w = df["w"][~na_mask.values]
        return np.average(x, weights=w)

    return np.average(df["x"], weights=df["w"])


@_weighted_mean.register(TibbleGrouped)
def _(
    df: TibbleGrouped,
    has_w: bool = True,
    na_rm: bool = True,
) -> Series:
    """Grouped variant: compute the (weighted) mean within each group.

    Delegates each group's sub-frame to the ungrouped implementation and
    returns one value per group.
    """
    grouped = df._datar["grouped"]
    return grouped.apply(
        lambda subframe: _weighted_mean(subframe, has_w, na_rm)
    )


@func_factory(None, {"x", "w"})
def weighted_mean(
    x: Series, w: Series = 1, na_rm=True, __args_raw=None, __args_frame=None,
) -> Series:
    """Compute the mean of `x`, optionally weighted by `w`.

    Dispatches on the combined argument frame so that both plain and
    grouped data are handled by `_weighted_mean`.
    """
    weights_supplied = __args_raw["w"] is not None
    return _weighted_mean(__args_frame, weights_supplied, na_rm)
2 changes: 1 addition & 1 deletion datar/base/verbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def union(x, y):

@register_verb(context=Context.EVAL)
def unique(x):
"""Union of two iterables"""
"""Get unique elements from an iterable and keep their order"""
# order not kept
# return np.unique(x)
if is_scalar(x):
Expand Down
65 changes: 42 additions & 23 deletions datar/dplyr/distinct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
See source https://github.com/tidyverse/dplyr/blob/master/R/distinct.R
"""
from pipda import register_verb
from pipda.symbolic import Reference

from ..core.backends.pandas import DataFrame
from ..core.backends.pandas.core.groupby import GroupBy
Expand All @@ -11,7 +12,7 @@
from ..core.factory import func_factory
from ..core.utils import regcall
from ..core.tibble import Tibble, TibbleGrouped, reconstruct_tibble
from ..base import union, setdiff, intersect
from ..base import union, setdiff, intersect, unique
from .mutate import mutate


Expand All @@ -33,31 +34,49 @@ def distinct(_data, *args, _keep_all=False, **kwargs):
A dataframe without duplicated rows in _data
"""
if not args and not kwargs:
uniq = _data.drop_duplicates()
out = _data.drop_duplicates()
else:
# keep_none_prefers_new_order
uniq = (
regcall(
mutate,
_data,
*args,
**kwargs,
_keep="none",
if (
not kwargs
# optimize:
# iris >> distinct(f.Species, f.Sepal_Length)
# We don't need to do mutation
and all(
isinstance(expr, Reference)
and expr._pipda_level == 1
and expr._pipda_ref in _data.columns
for expr in args
)
).drop_duplicates()
):
subset = [expr._pipda_ref for expr in args]
ucols = getattr(_data, "group_vars", [])
ucols.extend(subset)
ucols = regcall(unique, ucols)
uniq = _data.drop_duplicates(subset=subset)[ucols]
else:
# keep_none_prefers_new_order
uniq = (
regcall(
mutate,
_data,
*args,
**kwargs,
_keep="none",
)
).drop_duplicates()

if not _keep_all:
# keep original order
out = uniq[
regcall(
union,
regcall(intersect, _data.columns, uniq.columns),
regcall(setdiff, uniq.columns, _data.columns),
)
]
else:
out = _data.loc[uniq.index, :].copy()
out[uniq.columns.tolist()] = uniq
if not _keep_all:
# keep original order
out = uniq[
regcall(
union,
regcall(intersect, _data.columns, uniq.columns),
regcall(setdiff, uniq.columns, _data.columns),
)
]
else:
out = _data.loc[uniq.index, :].copy()
out[uniq.columns.tolist()] = uniq

return reconstruct_tibble(_data, Tibble(out, copy=False))

Expand Down
6 changes: 6 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.8.6

- 🐛 Fix weighted_mean not working for grouped data (#133)
- ✅ Add tests for weighted_mean on grouped data
- ⚡️ Optimize distinct on existing columns (#128)

## 0.8.5

- 🐛 Fix columns missing after Join by same columns using mapping (#122)
Expand Down
11 changes: 4 additions & 7 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
# use_directory_urls doesn't work for newer versions
mkdocs==1.1.2
# AttributeError: module 'jinja2' has no attribute 'contextfilter'
# jinja2==3.1.0
jinja2==3.0.3
mkdocs-material==7.2.3
pymdown-extensions==8.2
mkdocs
mkdocs-material
pymdown-extensions
mkapi-fix
mkdocs-jupyter==0.17.3
mkdocs-jupyter
ipykernel
ipython_genutils
# to compile readme.ipynb
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "datar"
version = "0.8.5"
version = "0.8.6"
description = "Port of dplyr and other related R packages in python, using pipda."
authors = ["pwwang <[email protected]>"]
readme = "README.md"
Expand Down
5 changes: 5 additions & 0 deletions tests/base/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ def test_weighted_mean():
with pytest.raises(ValueError):
weighted_mean([1,2], [1,2,3])

df = tibble(g=[1, 1, 2, 2], x=[1, 2, 3, 4], w=[1, 3, 3, 3]).group_by('g')
assert weighted_mean(df.g.obj, w=None) == 1.5
assert_iterable_equal(weighted_mean(df.g), [1, 2])
assert_iterable_equal(weighted_mean(df.x, w=df.w), [1.75, 3.5])


def test_quantile():
df = tibble(x=[1, 2, 3], g=[1, 2, 2])
Expand Down
3 changes: 2 additions & 1 deletion tests/dplyr/test_distinct.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from datar.tibble import tibble
from datar.datasets import iris
from datar.testing import assert_frame_equal


def test_single_column():
Expand Down Expand Up @@ -51,7 +52,7 @@ def test_keeps_only_specified_cols():
df = tibble(x=c(1, 1, 1), y=c(1, 1, 1))
expect = tibble(x=1)
out = df >> distinct(f.x)
assert out.equals(expect)
assert_frame_equal(out, expect)


def test_unless_keep_all_true():
Expand Down

0 comments on commit cd2595b

Please sign in to comment.