From 974b1c269ad4e42fef3588c46b24e1e022449e57 Mon Sep 17 00:00:00 2001 From: mraabhijit Date: Sat, 11 Oct 2025 16:21:33 +0530 Subject: [PATCH 01/15] fix ArrowDtype.itemsize --- pandas/core/dtypes/dtypes.py | 30 +++++++++- pandas/tests/dtypes/test_dtypes.py | 90 ++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1e6761b2e1db0..648997a96d3c2 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2308,8 +2308,34 @@ def kind(self) -> str: @cache_readonly def itemsize(self) -> int: - """Return the number of bytes in this dtype""" - return self.numpy_dtype.itemsize + """ + Return the number of bytes in this dtype. + + For Arrow-backed dtypes: + - Returns the fixed-width bit size divided by 8 for standard fixed-width types. + - For boolean types, returns the NumPy itemsize. + - Falls back to the NumPy dtype itemsize for variable-width & unsupported types. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> dtype = pd.ArrowDtype(pa.int32()) + >>> dtype.itemsize + 4 + + >>> dtype = pd.ArrowDtype(pa.bool_()) + >>> dtype.itemsize # falls back to numpy dtype + 1 + """ + # Use pyarrow itemsize for fixed-width data types + # e.g. int32 -> 32 bits // 8 = 4 bytes + try: + if pa.types.is_boolean(self.pyarrow_dtype): + return self.numpy_dtype.itemsize + return self.pyarrow_dtype.bit_width // 8 + except (ValueError, AttributeError): + return self.numpy_dtype.itemsize def construct_array_type(self) -> type_t[ArrowExtensionArray]: """ diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 95be5f3768be6..1a3e3f1d3ac2f 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1259,3 +1259,93 @@ def test_categorical_nan_no_dtype_conversion(): expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]}) df.loc[0, "a"] = np.array([1]) tm.assert_frame_equal(df, expected) + + +@pytest.fixture +def pa(): + return pytest.importorskip("pyarrow") + + +@pytest.mark.parametrize( + "type_name, expected_size", + [ + # Integer types + ("int8", 1), + ("int16", 2), + ("int32", 4), + ("int64", 8), + ("uint8", 1), + ("uint16", 2), + ("uint32", 4), + ("uint64", 8), + # Floating point types + ("float16", 2), + ("float32", 4), + ("float64", 8), + # Boolean + ("bool_", 1), + # Date and timestamp types + ("date32", 4), + ("date64", 8), + ("timestamp", 8), + # Time types + ("time32", 4), + ("time64", 8), + # Decimal types + ("decimal128", 16), + ("decimal256", 32), + ], +) +def test_arrow_dtype_itemsize_fixed_width(pa, type_name, expected_size): + # GH 57948 + + parametric_type_map = { + "timestamp": pa.timestamp("ns"), + "time32": pa.time32("s"), + "time64": pa.time64("ns"), + "decimal128": pa.decimal128(38, 10), + "decimal256": pa.decimal256(76, 10), + } + + if type_name in parametric_type_map: + arrow_type = parametric_type_map.get(type_name) + else: + arrow_type = getattr(pa, type_name)() + dtype = pd.ArrowDtype(arrow_type) + + if type_name == "bool_": + expected_size = dtype.numpy_dtype.itemsize + + assert dtype.itemsize == expected_size, ( + f"{type_name} expected {expected_size}, got {dtype.itemsize} " + f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})" + ) + + +@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"]) +def test_arrow_dtype_itemsize_variable_width(pa, type_name): + # GH 57948 + + arrow_type = getattr(pa, type_name)() + dtype = 
pd.ArrowDtype(arrow_type) + + assert dtype.itemsize == dtype.numpy_dtype.itemsize + + +def test_arrow_dtype_error_fallback(pa, monkeypatch): + # GH 57948 + + dtype = pd.ArrowDtype(pa.int32()) + + class ErrorType: + id = None + + @property + def bit_width(self): + raise ValueError("Simulated Error") + + def to_pandas_dtype(self): + return Series([0]).dtype + + monkeypatch.setattr(dtype, "pyarrow_dtype", ErrorType()) + assert dtype.itemsize == dtype.numpy_dtype.itemsize From 52c1e42f250d5dffb26bc853f56eaa7fa1e9ad8d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 09:37:57 -0700 Subject: [PATCH 02/15] Bump github/codeql-action from 3 to 4 (#62677) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 9a6cac3e19c25..1f7d9566a5a73 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -28,8 +28,8 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: github/codeql-action/init@v3 + - uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} - - uses: github/codeql-action/autobuild@v3 - - uses: github/codeql-action/analyze@v3 + - uses: github/codeql-action/autobuild@v4 + - uses: github/codeql-action/analyze@v4 From b6ef858ede2202e4df1fb28af480fdd87a485e78 Mon Sep 17 00:00:00 2001 From: Yaswanth Kumar <155723049+VYaswanthKumar@users.noreply.github.com> Date: Mon, 13 Oct 2025 22:10:46 +0530 Subject: [PATCH 03/15] Minor cleanup: removed mypy inline type comment in _xlsxwriter.py constructor (#62675) --- pandas/io/excel/_xlsxwriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 5874f720e3bd0..4a7b8eee2bfce 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -213,7 +213,7 @@ def __init__( # pyright: ignore[reportInconsistentConstructor] ) try: - self._book = Workbook(self._handles.handle, **engine_kwargs) # type: ignore[arg-type] + self._book = Workbook(self._handles.handle, **engine_kwargs) except TypeError: self._handles.handle.close() raise From 9de474741e59107bf338e45b64d2e99b6dabb811 Mon Sep 17 00:00:00 2001 From: Navya Srivastava <143343265+Navya1707@users.noreply.github.com> Date: Mon, 13 Oct 2025 09:45:42 -0700 Subject: [PATCH 04/15] Replace @Appender with inline docstring for DataFrame.groupby (#62667) --- pandas/core/frame.py | 175 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 141 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 643974db5f2bf..9c41b82bbbc8e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9346,21 +9346,140 @@ def update( # ---------------------------------------------------------------------- # Data reshaping - @Appender( - dedent( - """ + @deprecate_nonkeyword_arguments( + Pandas4Warning, allowed_args=["self", "by", "level"], name="groupby" + ) + def groupby( + self, + by=None, + level: IndexLabel | None = None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + observed: bool = True, + dropna: bool = True, + ) -> DataFrameGroupBy: + """ + Group DataFrame using a mapper or by a Series of columns. 
+
+        A groupby operation involves some combination of splitting the
+        object, applying a function, and combining the results. This can be
+        used to group large amounts of data and compute operations on these
+        groups.
+
+        Parameters
+        ----------
+        by : mapping, function, label, pd.Grouper or list of such
+            Used to determine the groups for the groupby.
+            If ``by`` is a function, it's called on each value of the object's
+            index. If a dict or Series is passed, the Series or dict VALUES
+            will be used to determine the groups (the Series' values are first
+            aligned; see ``.align()`` method). If a list or ndarray of length
+            equal to the selected axis is passed (see the `groupby user guide
+            <https://pandas.pydata.org/docs/user_guide/groupby.html#splitting-an-object-into-groups>`_),
+            the values are used as-is to determine the groups. A label or list
+            of labels may be passed to group by the columns in ``self``.
+            Notice that a tuple is interpreted as a (single) key.
+        level : int, level name, or sequence of such, default None
+            If the axis is a MultiIndex (hierarchical), group by a particular
+            level or levels. Do not specify both ``by`` and ``level``.
+        as_index : bool, default True
+            Return object with group labels as the
+            index. Only relevant for DataFrame input. as_index=False is
+            effectively "SQL-style" grouped output. This argument has no effect
+            on filtrations (see the `filtrations in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration>`_),
+            such as ``head()``, ``tail()``, ``nth()`` and in transformations
+            (see the `transformations in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation>`_).
+        sort : bool, default True
+            Sort group keys. Get better performance by turning this off.
+            Note this does not influence the order of observations within each
+            group. Groupby preserves the order of rows within each group. If False,
+            the groups will appear in the same order as they did in the original
+            DataFrame.
+            This argument has no effect on filtrations (see the `filtrations
+            in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration>`_),
+            such as ``head()``, ``tail()``, ``nth()`` and in transformations
+            (see the `transformations in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation>`_).
+
+            .. versionchanged:: 2.0.0
+
+                Specifying ``sort=False`` with an ordered categorical grouper will no
+                longer sort the values.
+
+        group_keys : bool, default True
+            When calling apply and the ``by`` argument produces a like-indexed
+            (i.e. :ref:`a transform <groupby.transform>`) result, add group keys to
+            index to identify pieces. By default group keys are not included
+            when the result's index (and column) labels match the inputs, and
+            are included otherwise.
+
+            .. versionchanged:: 1.5.0
+
+                Warns that ``group_keys`` will no longer be ignored when the
+                result from ``apply`` is a like-indexed Series or DataFrame.
+                Specify ``group_keys`` explicitly to include the group keys or
+                not.
+
+            .. versionchanged:: 2.0.0
+
+                ``group_keys`` now defaults to ``True``.
+
+        observed : bool, default True
+            This only applies if any of the groupers are Categoricals.
+            If True: only show observed values for categorical groupers.
+            If False: show all values for categorical groupers.
+
+            .. versionchanged:: 3.0.0
+
+                The default value is now ``True``.
+
+        dropna : bool, default True
+            If True, and if group keys contain NA values, NA values together
+            with row/column will be dropped.
+            If False, NA values will also be treated as the key in groups.
+
+        Returns
+        -------
+        pandas.api.typing.DataFrameGroupBy
+            Returns a groupby object that contains information about the groups.
+
+        See Also
+        --------
+        resample : Convenience method for frequency conversion and resampling
+            of time series.
+
+        Notes
+        -----
+        See the `user guide
+        <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
+        detailed usage and examples, including splitting an object into groups,
+        iterating through groups, selecting a group, aggregation, and more.
+
+        The implementation of groupby is hash-based, meaning in particular that
+        objects that compare as equal will be considered to be in the same group.
+        An exception to this is that pandas has special handling of NA values:
+        any NA values will be collapsed to a single group, regardless of how
+        they compare. See the user guide linked above for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+        ...         "Max Speed": [380.0, 370.0, 24.0, 26.0],
+        ...     }
+        ... )
+        >>> df
+           Animal  Max Speed
+        0  Falcon      380.0
+        1  Falcon      370.0
+        2  Parrot       24.0
+        3  Parrot       26.0
+        >>> df.groupby(["Animal"]).mean()
+                Max Speed
+        Animal
+        Falcon      375.0
+        Parrot       25.0
+
+        We can groupby different levels of a hierarchical index using the
+        `level` parameter:
+
+        >>> arrays = [
+        ...     ["Falcon", "Falcon", "Parrot", "Parrot"],
+        ...     ["Captive", "Wild", "Captive", "Wild"],
+        ... ]
+        >>> index = pd.MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
+        >>> df = pd.DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
+        >>> df
+                        Max Speed
+        Animal Type
+        Falcon Captive      390.0
+               Wild         350.0
+        Parrot Captive       30.0
+               Wild          20.0
+        >>> df.groupby(level=0).mean()
+                Max Speed
+        Animal
+        Falcon      370.0
+        Parrot       25.0
+        >>> df.groupby(level="Type").mean()
+                 Max Speed
+        Type
+        Captive      210.0
+        Wild         185.0
+
+        We can also choose to include NA in group keys or not by setting
+        `dropna` parameter, the default setting is `True`.
+
+        >>> arr = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+        >>> df = pd.DataFrame(arr, columns=["a", "b", "c"])
+
+        >>> df.groupby(by=["b"]).sum()
+            a   c
+        b
+        1.0 2   3
+        2.0 2   5
+
+        >>> df.groupby(by=["b"], dropna=False).sum()
+            a   c
+        b
+        1.0 2   3
+        2.0 2   5
+        NaN 1   4
+
+        >>> arr = [["a", 12, 12], [None, 12.3, 33.0], ["b", 12.3, 123], ["a", 1, 1]]
+        >>> df = pd.DataFrame(arr, columns=["a", "b", "c"])
+
+        >>> df.groupby(by="a").sum()
+            b     c
+        a
+        a   13.0   13.0
+        b   12.3  123.0
+
+        When using ``.apply()``, use ``group_keys`` to include or exclude the
+        group keys. The ``group_keys`` argument defaults to ``True`` (include).
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+        ...         "Max Speed": [380.0, 370.0, 24.0, 26.0],
+        ...     }
+        ... 
) + >>> df.groupby("Animal", group_keys=True)[["Max Speed"]].apply(lambda x: x) Max Speed Animal Falcon 0 380.0 @@ -9443,29 +9566,13 @@ def update( Parrot 2 24.0 3 26.0 - >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + >>> df.groupby("Animal", group_keys=False)[["Max Speed"]].apply(lambda x: x) Max Speed 0 380.0 1 370.0 2 24.0 3 26.0 """ - ) - ) - @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) - @deprecate_nonkeyword_arguments( - Pandas4Warning, allowed_args=["self", "by", "level"], name="groupby" - ) - def groupby( - self, - by=None, - level: IndexLabel | None = None, - as_index: bool = True, - sort: bool = True, - group_keys: bool = True, - observed: bool = True, - dropna: bool = True, - ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy if level is None and by is None: From 5e500997bc542f01d9d7bdf475a65158461e52f0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:06:04 -0700 Subject: [PATCH 05/15] Bump pypa/cibuildwheel from 3.2.0 to 3.2.1 (#62676) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 7da88d634adfc..2e8d14159f3d5 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -162,7 +162,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v3.2.0 + uses: pypa/cibuildwheel@v3.2.1 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 23aabe7489e9aa1dd029b813daa85d50ce4c52e7 Mon Sep 17 00:00:00 2001 From: krishna datta <19500807+krishna-datta@users.noreply.github.com> Date: Mon, 13 Oct 2025 13:07:34 -0400 Subject: [PATCH 06/15] Zip Strict specification for `pandas/core/indexes` (#62533) --- pandas/core/indexes/interval.py | 4 +-- pandas/core/indexes/multi.py | 47 ++++++++++++++++++--------------- pyproject.toml | 2 -- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ad3278d3ec205..41925864abbbf 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1056,8 +1056,8 @@ def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex: first_nan_loc = np.arange(len(self))[self.isna()][0] mask[first_nan_loc] = True - other_tups = set(zip(other.left, other.right)) - for i, tup in enumerate(zip(self.left, self.right)): + other_tups = set(zip(other.left, other.right, strict=True)) + for i, tup in enumerate(zip(self.left, self.right, strict=True)): if tup in other_tups: mask[i] = True diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7054c2e23e1ed..0cd33491d68bb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -720,7 +720,7 @@ def from_frame( if not isinstance(df, ABCDataFrame): raise TypeError("Input must be a DataFrame") - column_names, columns = zip(*df.items()) + column_names, columns = zip(*df.items(), strict=True) names = column_names if names is None else names return cls.from_arrays(columns, sortorder=sortorder, names=names) @@ -878,7 +878,10 @@ def levels(self) -> FrozenList: # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create 
new IndexEngine # https://github.com/pandas-dev/pandas/issues/31648 - result = [x._rename(name=name) for x, name in zip(self._levels, self._names)] + result = [ + x._rename(name=name) + for x, name in zip(self._levels, self._names, strict=True) + ] for level in result: # disallow midx.levels[0].name = "foo" level._no_setting_name = True @@ -912,7 +915,7 @@ def _set_levels( else: level_numbers = [self._get_level_number(lev) for lev in level] new_levels_list = list(self._levels) - for lev_num, lev in zip(level_numbers, levels): + for lev_num, lev in zip(level_numbers, levels, strict=True): new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view() new_levels = FrozenList(new_levels_list) @@ -1148,13 +1151,13 @@ def _set_codes( if level is None: new_codes = FrozenList( _coerce_indexer_frozen(level_codes, lev, copy=copy).view() - for lev, level_codes in zip(self._levels, codes) + for lev, level_codes in zip(self._levels, codes, strict=True) ) level_numbers = range(len(new_codes)) else: level_numbers = [self._get_level_number(lev) for lev in level] new_codes_list = list(self._codes) - for lev_num, level_codes in zip(level_numbers, codes): + for lev_num, level_codes in zip(level_numbers, codes, strict=True): lev = self.levels[lev_num] new_codes_list[lev_num] = _coerce_indexer_frozen( level_codes, lev, copy=copy @@ -1478,7 +1481,7 @@ def _formatter_func(self, tup): Formats each item in tup according to its level's formatter function. """ formatter_funcs = (level._formatter_func for level in self.levels) - return tuple(func(val) for func, val in zip(formatter_funcs, tup)) + return tuple(func(val) for func, val in zip(formatter_funcs, tup, strict=True)) def _get_values_for_csv( self, *, na_rep: str = "nan", **kwargs @@ -1487,7 +1490,7 @@ def _get_values_for_csv( new_codes = [] # go through the levels and format them - for level, level_codes in zip(self.levels, self.codes): + for level, level_codes in zip(self.levels, self.codes, strict=True): level_strs = level._get_values_for_csv(na_rep=na_rep, **kwargs) # add nan values, if there are any mask = level_codes == -1 @@ -1527,7 +1530,7 @@ def _format_multi( return [] stringified_levels = [] - for lev, level_codes in zip(self.levels, self.codes): + for lev, level_codes in zip(self.levels, self.codes, strict=True): na = _get_na_rep(lev.dtype) if len(lev) > 0: @@ -1550,7 +1553,7 @@ def _format_multi( stringified_levels.append(formatted) result_levels = [] - for lev, lev_name in zip(stringified_levels, self.names): + for lev, lev_name in zip(stringified_levels, self.names, strict=True): level = [] if include_names: @@ -1627,7 +1630,7 @@ def _set_names(self, names, *, level=None) -> None: level = (self._get_level_number(lev) for lev in level) # set the name - for lev, name in zip(level, names): + for lev, name in zip(level, names, strict=True): if name is not None: # GH 20527 # All items in 'names' need to be hashable: @@ -2094,7 +2097,7 @@ def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIn new_levels = [] new_codes = [] - for lev, level_codes in zip(self.levels, self.codes): + for lev, level_codes in zip(self.levels, self.codes, strict=True): if not lev.is_monotonic_increasing: try: # indexer to reorder the levels @@ -2173,7 +2176,7 @@ def remove_unused_levels(self) -> MultiIndex: new_codes = [] changed = False - for lev, level_codes in zip(self.levels, self.codes): + for lev, level_codes in zip(self.levels, self.codes, strict=True): # Since few levels are typically unused, bincount() is more # efficient than unique() - 
however it only accepts positive values # (and drops order): @@ -2240,7 +2243,7 @@ def __getitem__(self, key): key = com.cast_scalar_indexer(key) retval = [] - for lev, level_codes in zip(self.levels, self.codes): + for lev, level_codes in zip(self.levels, self.codes, strict=True): if level_codes[key] == -1: retval.append(np.nan) else: @@ -3078,7 +3081,7 @@ def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left" n = len(tup) start, end = 0, len(self) - zipped = zip(tup, self.levels, self.codes) + zipped = zip(tup, self.levels, self.codes, strict=True) for k, (lab, lev, level_codes) in enumerate(zipped): section = level_codes[start:end] @@ -3362,7 +3365,7 @@ def maybe_mi_droplevels(indexer, levels): "Key for location must have same length as number of levels" ) result = None - for lev, k in zip(level, key): + for lev, k in zip(level, key, strict=True): loc, new_index = self._get_loc_level(k, level=lev) if isinstance(loc, slice): mask = np.zeros(len(self), dtype=bool) @@ -3948,7 +3951,7 @@ def _union(self, other, sort) -> MultiIndex: if isinstance(result, MultiIndex): return result return MultiIndex.from_arrays( - zip(*result), sortorder=None, names=result_names + zip(*result, strict=True), sortorder=None, names=result_names ) else: @@ -3995,7 +3998,7 @@ def _maybe_match_names(self, other): if len(self.names) != len(other.names): return [None] * len(self.names) names = [] - for a_name, b_name in zip(self.names, other.names): + for a_name, b_name in zip(self.names, other.names, strict=True): if a_name == b_name: names.append(a_name) else: @@ -4092,7 +4095,7 @@ def putmask(self, mask, value: MultiIndex) -> MultiIndex: new_codes = [] for i, (value_level, level, level_codes) in enumerate( - zip(subset.levels, self.levels, self.codes) + zip(subset.levels, self.levels, self.codes, strict=True) ): new_level = level.union(value_level, sort=False) value_codes = new_level.get_indexer_for(subset.get_level_values(i)) @@ -4123,7 +4126,7 @@ def insert(self, loc: int, item) -> MultiIndex: new_levels = [] new_codes = [] - for k, level, level_codes in zip(item, self.levels, self.codes): + for k, level, level_codes in zip(item, self.levels, self.codes, strict=True): if k not in level: # have to insert into level # must insert at end otherwise you have to recompute all the @@ -4219,7 +4222,7 @@ def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: def sparsify_labels(label_list, start: int = 0, sentinel: object = ""): - pivoted = list(zip(*label_list)) + pivoted = list(zip(*label_list, strict=True)) k = len(label_list) result = pivoted[: start + 1] @@ -4228,7 +4231,7 @@ def sparsify_labels(label_list, start: int = 0, sentinel: object = ""): for cur in pivoted[start + 1 :]: sparse_cur = [] - for i, (p, t) in enumerate(zip(prev, cur)): + for i, (p, t) in enumerate(zip(prev, cur, strict=True)): if i == k - 1: sparse_cur.append(t) result.append(sparse_cur) # type: ignore[arg-type] @@ -4243,7 +4246,7 @@ def sparsify_labels(label_list, start: int = 0, sentinel: object = ""): prev = cur - return list(zip(*result)) + return list(zip(*result, strict=True)) def _get_na_rep(dtype: DtypeObj) -> str: diff --git a/pyproject.toml b/pyproject.toml index 34e4b4e5031ed..094d0b44a6721 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -452,8 +452,6 @@ exclude = [ "pandas/core/groupby/groupby.py" = ["B905"] "pandas/core/groupby/grouper.py" = ["B905"] "pandas/core/groupby/ops.py" = ["B905"] -"pandas/core/indexes/interval.py" = ["B905"] -"pandas/core/indexes/multi.py" = ["B905"] 
"pandas/core/methods/to_dict.py" = ["B905"] "pandas/core/reshape/concat.py" = ["B905"] "pandas/core/reshape/encoding.py" = ["B905"] From a906ab5b3671a48e1e3e3d5bfdf04f54233a4010 Mon Sep 17 00:00:00 2001 From: ZA1815 Date: Mon, 13 Oct 2025 16:08:14 -0400 Subject: [PATCH 07/15] DOC: Inline docstrings in window/rolling.py (#62662) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/core/window/rolling.py | 1832 ++++++++++++++++++++------------- 1 file changed, 1133 insertions(+), 699 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5e16e02e8db6d..d3c417a008916 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -9,7 +9,6 @@ from datetime import timedelta from functools import partial import inspect -from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -30,11 +29,6 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, -) from pandas.core.dtypes.common import ( ensure_float64, @@ -82,19 +76,6 @@ flex_binary_moment, zsqrt, ) -from pandas.core.window.doc import ( - _shared_docs, - create_section_header, - kwargs_numeric_only, - kwargs_scipy, - numba_notes, - template_header, - template_pipe, - template_returns, - template_see_also, - window_agg_numba_parameters, - window_apply_parameters, -) from pandas.core.window.numba_ import ( generate_manual_numpy_nan_agg_with_axis, generate_numba_apply_func, @@ -1095,11 +1076,11 @@ class Window(BaseWindow): >>> df.rolling(2, win_type="gaussian").sum(std=3) B - 0 NaN - 1 0.986207 - 2 2.958621 - 3 NaN - 4 NaN + 0 NaN + 1 0.986207 + 2 2.958621 + 3 NaN + 4 NaN **on** @@ -1234,38 +1215,78 @@ def calc(x): :: self.step ] - @doc( - _shared_docs["aggregate"], - see_also=dedent( - """ + def aggregate(self, func=None, *args, **kwargs): + """ + Aggregate using one or more operations over the specified axis. + + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a Series/DataFrame or + when passed to Series/DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + See Also -------- DataFrame.aggregate : Similar DataFrame method. Series.aggregate : Similar Series method. - """ - ), - examples=dedent( - """ + + Notes + ----- + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. 
+ + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + A passed user-defined-function will be passed a Series for evaluation. + + If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. + Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C + A B C 0 1 4 7 1 2 5 8 2 3 6 9 >>> df.rolling(2, win_type="boxcar").agg("mean") - A B C + A B C 0 NaN NaN NaN 1 1.5 4.5 7.5 2 2.5 5.5 8.5 """ - ), - klass="Series/DataFrame", - axis="", - ) - def aggregate(self, func=None, *args, **kwargs): result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly @@ -1275,24 +1296,40 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - kwargs_scipy, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ + def sum(self, numeric_only: bool = False, **kwargs): + """ + Calculate the rolling weighted window sum. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + **kwargs + Keyword arguments to configure the ``SciPy`` weighted window type. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.sum : Aggregating sum for Series. + DataFrame.sum : Aggregating sum for DataFrame. + + Examples + -------- >>> ser = pd.Series([0, 1, 5, 2, 8]) To get an instance of :class:`~pandas.core.window.rolling.Window` we need to pass the parameter `win_type`. - >>> type(ser.rolling(2, win_type='gaussian')) + >>> type(ser.rolling(2, win_type="gaussian")) In order to use the `SciPy` Gaussian window we need to provide the parameters @@ -1300,7 +1337,7 @@ def aggregate(self, func=None, *args, **kwargs): We pass the second parameter `std` as a parameter of the following method (`sum` in this case): - >>> ser.rolling(2, win_type='gaussian').sum(std=3) + >>> ser.rolling(2, win_type="gaussian").sum(std=3) 0 NaN 1 0.986207 2 5.917243 @@ -1308,12 +1345,6 @@ def aggregate(self, func=None, *args, **kwargs): 4 9.862071 dtype: float64 """ - ), - window_method="rolling", - aggregation_description="weighted window sum", - agg_method="sum", - ) - def sum(self, numeric_only: bool = False, **kwargs): window_func = window_aggregations.roll_weighted_sum # error: Argument 1 to "_apply" of "Window" has incompatible type # "Callable[[ndarray, ndarray, int], ndarray]"; expected @@ -1325,31 +1356,47 @@ def sum(self, numeric_only: bool = False, **kwargs): **kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - kwargs_scipy, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ + def mean(self, numeric_only: bool = False, **kwargs): + """ + Calculate the rolling weighted window mean. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. 
versionadded:: 1.5.0 + + **kwargs + Keyword arguments to configure the ``SciPy`` weighted window type. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.mean : Aggregating mean for Series. + DataFrame.mean : Aggregating mean for DataFrame. + + Examples + -------- >>> ser = pd.Series([0, 1, 5, 2, 8]) To get an instance of :class:`~pandas.core.window.rolling.Window` we need to pass the parameter `win_type`. - >>> type(ser.rolling(2, win_type='gaussian')) + >>> type(ser.rolling(2, win_type="gaussian")) In order to use the `SciPy` Gaussian window we need to provide the parameters `M` and `std`. The parameter `M` corresponds to 2 in our example. We pass the second parameter `std` as a parameter of the following method: - >>> ser.rolling(2, win_type='gaussian').mean(std=3) + >>> ser.rolling(2, win_type="gaussian").mean(std=3) 0 NaN 1 0.5 2 3.0 @@ -1357,12 +1404,6 @@ def sum(self, numeric_only: bool = False, **kwargs): 4 5.0 dtype: float64 """ - ), - window_method="rolling", - aggregation_description="weighted window mean", - agg_method="mean", - ) - def mean(self, numeric_only: bool = False, **kwargs): window_func = window_aggregations.roll_weighted_mean # error: Argument 1 to "_apply" of "Window" has incompatible type # "Callable[[ndarray, ndarray, int], ndarray]"; expected @@ -1374,38 +1415,50 @@ def mean(self, numeric_only: bool = False, **kwargs): **kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs): + """ + Calculate the rolling weighted window variance. + + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - kwargs_scipy, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + **kwargs + Keyword arguments to configure the ``SciPy`` weighted window type. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.var : Aggregating var for Series. + DataFrame.var : Aggregating var for DataFrame. + + Examples + -------- >>> ser = pd.Series([0, 1, 5, 2, 8]) To get an instance of :class:`~pandas.core.window.rolling.Window` we need to pass the parameter `win_type`. - >>> type(ser.rolling(2, win_type='gaussian')) + >>> type(ser.rolling(2, win_type="gaussian")) In order to use the `SciPy` Gaussian window we need to provide the parameters `M` and `std`. The parameter `M` corresponds to 2 in our example. 
We pass the second parameter `std` as a parameter of the following method: - >>> ser.rolling(2, win_type='gaussian').var(std=3) + >>> ser.rolling(2, win_type="gaussian").var(std=3) 0 NaN 1 0.5 2 8.0 @@ -1413,48 +1466,54 @@ def mean(self, numeric_only: bool = False, **kwargs): 4 18.0 dtype: float64 """ - ), - window_method="rolling", - aggregation_description="weighted window variance", - agg_method="var", - ) - def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs): window_func = partial(window_aggregations.roll_weighted_var, ddof=ddof) kwargs.pop("name", None) return self._apply(window_func, name="var", numeric_only=numeric_only, **kwargs) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def std(self, ddof: int = 1, numeric_only: bool = False, **kwargs): + """ + Calculate the rolling weighted window standard deviation. + + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - kwargs_scipy, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + **kwargs + Keyword arguments to configure the ``SciPy`` weighted window type. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.std : Aggregating std for Series. + DataFrame.std : Aggregating std for DataFrame. + + Examples + -------- >>> ser = pd.Series([0, 1, 5, 2, 8]) To get an instance of :class:`~pandas.core.window.rolling.Window` we need to pass the parameter `win_type`. - >>> type(ser.rolling(2, win_type='gaussian')) + >>> type(ser.rolling(2, win_type="gaussian")) In order to use the `SciPy` Gaussian window we need to provide the parameters `M` and `std`. The parameter `M` corresponds to 2 in our example. We pass the second parameter `std` as a parameter of the following method: - >>> ser.rolling(2, win_type='gaussian').std(std=3) + >>> ser.rolling(2, win_type="gaussian").std(std=3) 0 NaN 1 0.707107 2 2.828427 @@ -1462,12 +1521,6 @@ def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs): 4 4.242641 dtype: float64 """ - ), - window_method="rolling", - aggregation_description="weighted window standard deviation", - agg_method="std", - ) - def std(self, ddof: int = 1, numeric_only: bool = False, **kwargs): return zsqrt( self.var(ddof=ddof, name="std", numeric_only=numeric_only, **kwargs) ) @@ -1990,59 +2043,113 @@ def _raise_monotonic_error(self, msg: str): on = "index" raise ValueError(f"{on} {msg}") - @doc( - _shared_docs["aggregate"], - see_also=dedent( - """ + def aggregate(self, func=None, *args, **kwargs): + """ + Aggregate using one or more operations over the specified axis. + + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a Series/Dataframe or + when passed to Series/Dataframe.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. 
``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + See Also -------- Series.rolling : Calling object with Series data. DataFrame.rolling : Calling object with DataFrame data. - """ - ), - examples=dedent( - """ + + Notes + ----- + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + A passed user-defined-function will be passed a Series for evaluation. + + If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. + Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C + A B C 0 1 4 7 1 2 5 8 2 3 6 9 >>> df.rolling(2).sum() - A B C + A B C 0 NaN NaN NaN 1 3.0 9.0 15.0 2 5.0 11.0 17.0 >>> df.rolling(2).agg({"A": "sum", "B": "min"}) - A B + A B 0 NaN NaN 1 3.0 4.0 2 5.0 5.0 """ - ), - klass="Series/Dataframe", - axis="", - ) - def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """ + def count(self, numeric_only: bool = False): + """ + Calculate the rolling count of non NaN observations. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.count : Aggregating count for Series. + DataFrame.count : Aggregating count for DataFrame. 
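+
+        Notes
+        -----
+        The count is taken over the non-``NaN`` observations inside each
+        window, so the same result can be computed (more slowly) with
+        ``apply``; a minimal equivalence check:
+
+        >>> s = pd.Series([2, 3, np.nan, 10])
+        >>> s.rolling(2, min_periods=0).count().equals(
+        ...     s.rolling(2, min_periods=0).apply(lambda w: w.count())
+        ... )
+        True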
+ + Examples + -------- >>> s = pd.Series([2, 3, np.nan, 10]) >>> s.rolling(2).count() 0 NaN @@ -2063,38 +2170,8 @@ def aggregate(self, func=None, *args, **kwargs): 3 3.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="count of non NaN observations", - agg_method="count", - ) - def count(self, numeric_only: bool = False): return super().count(numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - window_apply_parameters, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 6, 5, 4]) - >>> ser.rolling(2).apply(lambda s: s.sum() - s.min()) - 0 NaN - 1 6.0 - 2 6.0 - 3 5.0 - dtype: float64 - """ - ), - window_method="rolling", - aggregation_description="custom aggregation function", - agg_method="apply", - ) def apply( self, func: Callable[..., Any], @@ -2104,6 +2181,70 @@ def apply( args: tuple[Any, ...] | None = None, kwargs: dict[str, Any] | None = None, ): + """ + Calculate the rolling custom aggregation function. + + Parameters + ---------- + func : function + Must produce a single value from an ndarray input if ``raw=True`` + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + raw : bool, default False + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray + objects instead. + + If you are just applying a NumPy reduction function this will + achieve much better performance. + + engine : str, default None + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba``. + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + args : tuple, default None + Positional arguments to be passed into func. + + kwargs : dict, default None + Keyword arguments to be passed into func. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.apply : Aggregating apply for Series. + DataFrame.apply : Aggregating apply for DataFrame. + + Examples + -------- + >>> ser = pd.Series([1, 6, 5, 4]) + >>> ser.rolling(2).apply(lambda s: s.sum() - s.min()) + 0 NaN + 1 6.0 + 2 6.0 + 3 5.0 + dtype: float64 + """ return super().apply( func, raw=raw, @@ -2130,51 +2271,145 @@ def pipe( ) -> T: ... @final - @Substitution( - klass="Rolling", - examples=""" - >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, - ... 
index=pd.date_range('2012-08-02', periods=4)) - >>> df - A - 2012-08-02 1 - 2012-08-03 2 - 2012-08-04 3 - 2012-08-05 4 - - To get the difference between each rolling 2-day window's maximum and minimum - value in one pass, you can do - - >>> df.rolling('2D').pipe(lambda x: x.max() - x.min()) - A - 2012-08-02 0.0 - 2012-08-03 1.0 - 2012-08-04 1.0 - 2012-08-05 1.0""", - ) - @Appender(template_pipe) def pipe( self, func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], *args: Any, **kwargs: Any, ) -> T: + """ + Apply a ``func`` with arguments to this Rolling object and return its result. + + Use `.pipe` when you want to improve readability by chaining together + functions that expect + Series, DataFrames, GroupBy, Rolling, Expanding or Resampler + objects. + Instead of writing + + >>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 + >>> g = lambda x, arg1: x * 5 / arg1 + >>> f = lambda x: x**4 + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4) + ... ) + >>> h(g(f(df.rolling("2D")), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + + You can write + + >>> ( + ... df.rolling("2D").pipe(f).pipe(g, arg1=1).pipe(h, arg2=2, arg3=3) + ... ) # doctest: +SKIP + + which is much more readable. + + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this Rolling object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + Rolling object. + *args : iterable, optional + Positional arguments passed into `func`. + **kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + + Returns + ------- + Rolling + The original object with the function `func` applied. + + See Also + -------- + Series.pipe : Apply a function with arguments to a series. + DataFrame.pipe: Apply a function with arguments to a dataframe. + apply : Apply function to each group instead of to the + full Rolling object. + + Notes + ----- + See more `here + `__. + + Examples + -------- + + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4) + ... ) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each rolling + 2-day window's maximum and minimum + value in one pass, you can do + + >>> df.rolling("2D").pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 1.0 + 2012-08-05 1.0 + """ return super().pipe(func, *args, **kwargs) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """ + def sum( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the rolling sum. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. 
versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.sum : Aggregating sum for Series. + DataFrame.sum : Aggregating sum for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` + for extended documentation and performance considerations + for the Numba engine. + + Examples + -------- >>> s = pd.Series([1, 2, 3, 4, 5]) >>> s 0 1 @@ -2202,9 +2437,9 @@ def pipe( For DataFrame, each sum is computed column-wise. - >>> df = pd.DataFrame({{"A": s, "B": s ** 2}}) + >>> df = pd.DataFrame({"A": s, "B": s**2}) >>> df - A B + A B 0 1 1 1 2 4 2 3 9 @@ -2212,69 +2447,19 @@ def pipe( 4 5 25 >>> df.rolling(3).sum() - A B + A B 0 NaN NaN 1 NaN NaN 2 6.0 14.0 3 9.0 29.0 4 12.0 50.0 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="sum", - agg_method="sum", - ) - def sum( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().sum( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - dedent( - """ - *args : iterable, optional - Positional arguments passed into ``func``.\n - """ - ).replace("\n", "", 1), - window_agg_numba_parameters(), - dedent( - """ - **kwargs : mapping, optional - A dictionary of keyword arguments passed into ``func``.\n - """ - ).replace("\n", "", 1), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4]) - >>> ser.rolling(2).max() - 0 NaN - 1 2.0 - 2 3.0 - 3 4.0 - dtype: float64 - """ - ), - window_method="rolling", - aggregation_description="maximum", - agg_method="max", - ) def max( self, numeric_only: bool = False, @@ -2283,68 +2468,202 @@ def max( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): - return super().max( - numeric_only=numeric_only, - engine=engine, - engine_kwargs=engine_kwargs, - ) + """ + Calculate the rolling maximum. - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """ - Performing a rolling minimum with a window size of 3. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. - >>> s = pd.Series([4, 3, 5, 2, 6]) - >>> s.rolling(3).min() + .. versionadded:: 1.5.0 + + *args : iterable, optional + Positional arguments passed into ``func``. 
+ + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.3.0 + + **kwargs : mapping, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.max : Aggregating max for Series. + DataFrame.max : Aggregating max for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` + for extended documentation and performance considerations + for the Numba engine. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.rolling(2).max() 0 NaN - 1 NaN + 1 2.0 2 3.0 - 3 2.0 - 4 2.0 + 3 4.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="minimum", - agg_method="min", - ) + return super().max( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + def min( self, numeric_only: bool = False, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): + """ + Calculate the rolling minimum. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.min : Aggregating min for Series. + DataFrame.min : Aggregating min for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` + for extended documentation and performance considerations + for the Numba engine. + + Examples + -------- + Performing a rolling minimum with a window size of 3. 
+ + >>> s = pd.Series([4, 3, 5, 2, 6]) + >>> s.rolling(3).min() + 0 NaN + 1 NaN + 2 3.0 + 3 2.0 + 4 2.0 + dtype: float64 + """ return super().min( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """ + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the rolling mean. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.mean : Aggregating mean for Series. + DataFrame.mean : Aggregating mean for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` + for extended documentation and performance considerations + for the Numba engine. + + Examples + -------- The below examples will show rolling mean calculations with window sizes of two and three, respectively. @@ -2363,37 +2682,67 @@ def min( 3 3.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="mean", - agg_method="mean", - ) - def mean( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().mean( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """ + def median( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the rolling median. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. 
+ * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.median : Aggregating median for Series. + DataFrame.median : Aggregating median for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` + for extended documentation and performance considerations + for the Numba engine. + + Examples + -------- Compute the rolling median of a series with a window size of 3. >>> s = pd.Series([0, 1, 2, 3, 4]) @@ -2405,52 +2754,74 @@ def mean( 4 3.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="median", - agg_method="median", - ) - def median( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().median( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def std( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the rolling standard deviation. + + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - window_agg_numba_parameters("1.4"), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "numpy.std : Equivalent method for NumPy array.\n", - template_see_also, - create_section_header("Notes"), - dedent( - """ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.4.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + numpy.std : Equivalent method for NumPy array. + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. 
+ Series.std : Aggregating std for Series. + DataFrame.std : Aggregating std for DataFrame. + + Notes + ----- The default ``ddof`` of 1 used in :meth:`Series.std` is different than the default ``ddof`` of 0 in :func:`numpy.std`. - A minimum of one period is required for the rolling calculation.\n - """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + A minimum of one period is required for the rolling calculation. + + Examples + -------- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).std() 0 NaN @@ -2462,18 +2833,6 @@ def median( 6 0.000000 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="standard deviation", - agg_method="std", - ) - def std( - self, - ddof: int = 1, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().std( ddof=ddof, numeric_only=numeric_only, @@ -2481,35 +2840,68 @@ def std( engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def var( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the rolling variance. + + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - window_agg_numba_parameters("1.4"), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "numpy.var : Equivalent method for NumPy array.\n", - template_see_also, - create_section_header("Notes"), - dedent( - """ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or + globally setting ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. + + The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.4.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + numpy.var : Equivalent method for NumPy array. + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.var : Aggregating var for Series. + DataFrame.var : Aggregating var for DataFrame. + + Notes + ----- The default ``ddof`` of 1 used in :meth:`Series.var` is different than the default ``ddof`` of 0 in :func:`numpy.var`. - A minimum of one period is required for the rolling calculation.\n - """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + A minimum of one period is required for the rolling calculation. 
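+
+        As a quick numeric check of the ``ddof`` note above (a sketch on toy
+        data; ``ddof=0`` reproduces :func:`numpy.var` over a completed window):
+
+        >>> import numpy as np
+        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
+        >>> round(float(s.rolling(3).var(ddof=0).iloc[3]), 6)
+        0.666667
+        >>> round(float(np.var(s.iloc[1:4])), 6)
+        0.666667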
+ + Examples + -------- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).var() 0 NaN @@ -2521,18 +2913,6 @@ def std( 6 0.000000 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="variance", - agg_method="var", - ) - def var( - self, - ddof: int = 1, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().var( ddof=ddof, numeric_only=numeric_only, @@ -2540,24 +2920,37 @@ def var( engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "scipy.stats.skew : Third moment of a probability density.\n", - template_see_also, - create_section_header("Notes"), - dedent( - """ - A minimum of three periods is required for the rolling calculation.\n - """ - ), - create_section_header("Examples"), - dedent( - """\ + def skew(self, numeric_only: bool = False): + """ + Calculate the rolling unbiased skewness. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + scipy.stats.skew : Third moment of a probability density. + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.skew : Aggregating skew for Series. + DataFrame.skew : Aggregating skew for DataFrame. + + Notes + ----- + + A minimum of three periods is required for the rolling calculation. + + Examples + -------- >>> ser = pd.Series([1, 5, 2, 7, 15, 6]) >>> ser.rolling(3).skew().round(6) 0 NaN @@ -2568,34 +2961,41 @@ def var( 5 1.652317 dtype: float64 """ - ), - window_method="rolling", - aggregation_description="unbiased skewness", - agg_method="skew", - ) - def skew(self, numeric_only: bool = False): return super().skew(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def sem(self, ddof: int = 1, numeric_only: bool = False): + """ + Calculate the rolling standard error of mean. + + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - "A minimum of one period is required for the calculation.\n\n", - create_section_header("Examples"), - dedent( - """ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.sem : Aggregating sem for Series. + DataFrame.sem : Aggregating sem for DataFrame. + + Notes + ----- + A minimum of one period is required for the calculation. 
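+
+        ``sem`` is derived from ``std`` and ``count`` as
+        ``std / (count - ddof) ** 0.5`` (see the implementation below); a
+        minimal sketch of that identity on toy data, with the default
+        ``ddof=1``:
+
+        >>> import numpy as np
+        >>> s = pd.Series([0, 1, 2, 3])
+        >>> r = s.rolling(2, min_periods=1)
+        >>> bool(np.allclose(r.sem(), r.std() / (r.count() - 1) ** 0.5, equal_nan=True))
+        True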
+ + Examples + -------- >>> s = pd.Series([0, 1, 2, 3]) >>> s.rolling(2, min_periods=1).sem() 0 NaN @@ -2604,40 +3004,50 @@ def skew(self, numeric_only: bool = False): 3 0.707107 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="standard error of mean", - agg_method="sem", - ) - def sem(self, ddof: int = 1, numeric_only: bool = False): # Raise here so error message says sem instead of std self._validate_numeric_only("sem", numeric_only) return self.std(numeric_only=numeric_only) / ( self.count(numeric_only) - ddof ).pow(0.5) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "scipy.stats.kurtosis : Reference SciPy method.\n", - template_see_also, - create_section_header("Notes"), - "A minimum of four periods is required for the calculation.\n\n", - create_section_header("Examples"), - dedent( - """ + def kurt(self, numeric_only: bool = False): + """ + Calculate the rolling Fisher's definition of kurtosis without bias. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + scipy.stats.kurtosis : Reference SciPy method. + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.kurt : Aggregating kurt for Series. + DataFrame.kurt : Aggregating kurt for DataFrame. + + Notes + ----- + A minimum of four periods is required for the calculation. + + Examples + -------- The example below will show a rolling calculation with a window size of four matching the equivalent function call using `scipy.stats`. >>> arr = [1, 2, 3, 4, 999] >>> import scipy.stats - >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}") + >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") -1.200000 - >>> print(f"{{scipy.stats.kurtosis(arr[1:], bias=False):.6f}}") + >>> print(f"{scipy.stats.kurtosis(arr[1:], bias=False):.6f}") 3.999946 >>> s = pd.Series(arr) >>> s.rolling(4).kurt() @@ -2648,30 +3058,31 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): 4 3.999946 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="Fisher's definition of kurtosis without bias", - agg_method="kurt", - ) - def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - dedent( - """ - GroupBy.first : Similar method for GroupBy objects. - Rolling.last : Method to get the last element in each window.\n + def first(self, numeric_only: bool = False): """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + Calculate the rolling First (left-most) element of the window. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + GroupBy.first : Similar method for GroupBy objects. + Rolling.last : Method to get the last element in each window. 
+ + Examples + -------- The example below will show a rolling calculation with a window size of three. @@ -2684,30 +3095,31 @@ def kurt(self, numeric_only: bool = False): 4 2.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="First (left-most) element of the window", - agg_method="first", - ) - def first(self, numeric_only: bool = False): return super().first(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - dedent( - """ - GroupBy.last : Similar method for GroupBy objects. - Rolling.first : Method to get the first element in each window.\n + def last(self, numeric_only: bool = False): """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + Calculate the rolling Last (right-most) element of the window. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + GroupBy.last : Similar method for GroupBy objects. + Rolling.first : Method to get the first element in each window. + + Examples + -------- The example below will show a rolling calculation with a window size of three. @@ -2720,25 +3132,26 @@ def first(self, numeric_only: bool = False): 4 4.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="Last (right-most) element of the window", - agg_method="last", - ) - def last(self, numeric_only: bool = False): return super().last(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def quantile( + self, + q: float, + interpolation: QuantileInterpolation = "linear", + numeric_only: bool = False, + ): + """ + Calculate the rolling quantile. + + Parameters + ---------- q : float Quantile to compute. 0 <= quantile <= 1. .. deprecated:: 2.1.0 This was renamed from 'quantile' to 'q' in version 2.1.0. - interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} + + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -2748,55 +3161,62 @@ def last(self, numeric_only: bool = False): * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.quantile : Aggregating quantile for Series. + DataFrame.quantile : Aggregating quantile for DataFrame. 
+ + Examples + -------- >>> s = pd.Series([1, 2, 3, 4]) - >>> s.rolling(2).quantile(.4, interpolation='lower') + >>> s.rolling(2).quantile(0.4, interpolation="lower") 0 NaN 1 1.0 2 2.0 3 3.0 dtype: float64 - >>> s.rolling(2).quantile(.4, interpolation='midpoint') + >>> s.rolling(2).quantile(0.4, interpolation="midpoint") 0 NaN 1 1.5 2 2.5 3 3.5 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="quantile", - agg_method="quantile", - ) - def quantile( - self, - q: float, - interpolation: QuantileInterpolation = "linear", - numeric_only: bool = False, - ): return super().quantile( q=q, interpolation=interpolation, numeric_only=numeric_only, ) - @doc( - template_header, - ".. versionadded:: 1.4.0 \n\n", - create_section_header("Parameters"), - dedent( - """ - method : {{'average', 'min', 'max'}}, default 'average' + def rank( + self, + method: WindowingRankType = "average", + ascending: bool = True, + pct: bool = False, + numeric_only: bool = False, + ): + """ + Calculate the rolling rank. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + method : {'average', 'min', 'max'}, default 'average' How to rank the group of records that have the same value (i.e. ties): * average: average rank of the group @@ -2805,19 +3225,30 @@ def quantile( ascending : bool, default True Whether or not the elements should be ranked in ascending order. + pct : bool, default False Whether or not to display the returned rankings in percentile form. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.rank : Aggregating rank for Series. + DataFrame.rank : Aggregating rank for DataFrame. + + Examples + -------- >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.rolling(3).rank() 0 NaN @@ -2846,18 +3277,6 @@ def quantile( 5 1.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="rank", - agg_method="rank", - ) - def rank( - self, - method: WindowingRankType = "average", - ascending: bool = True, - pct: bool = False, - numeric_only: bool = False, - ): return super().rank( method=method, ascending=ascending, @@ -2865,18 +3284,36 @@ def rank( numeric_only=numeric_only, ) - @doc( - template_header, - ".. versionadded:: 3.0.0 \n\n", - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """ + def nunique( + self, + numeric_only: bool = False, + ): + """ + Calculate the rolling nunique. + + .. versionadded:: 3.0.0 + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. 
+ Series.nunique : Aggregating nunique for Series. + DataFrame.nunique : Aggregating nunique for DataFrame. + + Examples + -------- >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5]) >>> s.rolling(3).nunique() 0 NaN @@ -2889,27 +3326,26 @@ def rank( 7 3.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="nunique", - agg_method="nunique", - ) - def nunique( - self, - numeric_only: bool = False, - ): return super().nunique( numeric_only=numeric_only, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + """ + Calculate the rolling sample covariance. + + Parameters + ---------- other : Series or DataFrame, optional If not supplied then will default to self and produce pairwise output. + pairwise : bool, default None If False then only matching columns between self and other will be used and the output will be a DataFrame. @@ -2917,19 +3353,30 @@ def nunique( output will be a MultiIndexed DataFrame in the case of DataFrame inputs. In the case of missing elements, only complete pairwise observations will be used. + ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.cov : Aggregating cov for Series. + DataFrame.cov : Aggregating cov for DataFrame. + + Examples + -------- >>> ser1 = pd.Series([1, 2, 3, 4]) >>> ser2 = pd.Series([1, 4, 5, 8]) >>> ser1.rolling(2).cov(ser2) @@ -2939,18 +3386,6 @@ def nunique( 3 1.5 dtype: float64 """ - ), - window_method="rolling", - aggregation_description="sample covariance", - agg_method="cov", - ) - def cov( - self, - other: DataFrame | Series | None = None, - pairwise: bool | None = None, - ddof: int = 1, - numeric_only: bool = False, - ): return super().cov( other=other, pairwise=pairwise, @@ -2958,14 +3393,22 @@ def cov( numeric_only=numeric_only, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + """ + Calculate the rolling correlation. + + Parameters + ---------- other : Series or DataFrame, optional If not supplied then will default to self and produce pairwise output. + pairwise : bool, default None If False then only matching columns between self and other will be used and the output will be a DataFrame. @@ -2973,25 +3416,32 @@ def cov( output will be a MultiIndexed DataFrame in the case of DataFrame inputs. In the case of missing elements, only complete pairwise observations will be used. + ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. 
- """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - dedent( - """ + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- cov : Similar method to calculate covariance. numpy.corrcoef : NumPy Pearson's correlation calculation. - """ - ).replace("\n", "", 1), - template_see_also, - create_section_header("Notes"), - dedent( - """ + Series.rolling : Calling rolling with Series data. + DataFrame.rolling : Calling rolling with DataFrames. + Series.corr : Aggregating corr for Series. + DataFrame.corr : Aggregating corr for DataFrame. + + Notes + ----- This function uses Pearson's definition of correlation (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). @@ -3010,12 +3460,10 @@ def cov( columns on the second level. In the case of missing elements, only complete pairwise observations - will be used.\n - """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + will be used. + + Examples + -------- The below example shows a rolling calculation with a window size of four matching the equivalent function call using :meth:`numpy.corrcoef`. @@ -3023,10 +3471,10 @@ def cov( >>> v2 = [3, 4, 4, 4, 8] >>> np.corrcoef(v1[:-1], v2[:-1]) array([[1. , 0.33333333], - [0.33333333, 1. ]]) + [0.33333333, 1. ]]) >>> np.corrcoef(v1[1:], v2[1:]) array([[1. , 0.9169493], - [0.9169493, 1. ]]) + [0.9169493, 1. ]]) >>> s1 = pd.Series(v1) >>> s2 = pd.Series(v2) >>> s1.rolling(4).corr(s2) @@ -3040,18 +3488,16 @@ def cov( The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - >>> matrix = np.array([[51., 35.], - ... [49., 30.], - ... [47., 32.], - ... [46., 31.], - ... [50., 36.]]) + >>> matrix = np.array( + ... [[51.0, 35.0], [49.0, 30.0], [47.0, 32.0], [46.0, 31.0], [50.0, 36.0]] + ... ) >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1]) array([[1. , 0.6263001], - [0.6263001, 1. ]]) + [0.6263001, 1. ]]) >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1]) array([[1. , 0.55536811], - [0.55536811, 1. ]]) - >>> df = pd.DataFrame(matrix, columns=['X', 'Y']) + [0.55536811, 1. 
]]) + >>> df = pd.DataFrame(matrix, columns=["X", "Y"]) >>> df X Y 0 51.0 35.0 @@ -3061,29 +3507,17 @@ def cov( 4 50.0 36.0 >>> df.rolling(4).corr(pairwise=True) X Y - 0 X NaN NaN - Y NaN NaN - 1 X NaN NaN - Y NaN NaN - 2 X NaN NaN - Y NaN NaN - 3 X 1.000000 0.626300 - Y 0.626300 1.000000 - 4 X 1.000000 0.555368 - Y 0.555368 1.000000 - """ - ).replace("\n", "", 1), - window_method="rolling", - aggregation_description="correlation", - agg_method="corr", - ) - def corr( - self, - other: DataFrame | Series | None = None, - pairwise: bool | None = None, - ddof: int = 1, - numeric_only: bool = False, - ): + 0 X NaN NaN + Y NaN NaN + 1 X NaN NaN + Y NaN NaN + 2 X NaN NaN + Y NaN NaN + 3 X 1.000000 0.626300 + Y 0.626300 1.000000 + 4 X 1.000000 0.555368 + Y 0.555368 1.000000 + """ return super().corr( other=other, pairwise=pairwise, From f5653f8a209fd2622d7553a7da8adfc6c02ca66b Mon Sep 17 00:00:00 2001 From: Akashisang <151737560+Akashisang@users.noreply.github.com> Date: Tue, 14 Oct 2025 04:09:34 +0800 Subject: [PATCH 08/15] DOC: Replace @doc @appender and @substitution decorator with inline docstrings in core/window/expanding.py (#62664) --- pandas/core/window/expanding.py | 1487 +++++++++++++++++++------------ 1 file changed, 916 insertions(+), 571 deletions(-) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 4c4d43f0c7545..afab2295a8f69 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,6 +1,5 @@ from __future__ import annotations -from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -11,29 +10,11 @@ overload, ) -from pandas.util._decorators import ( - Appender, - Substitution, - doc, -) - from pandas.core.indexers.objects import ( BaseIndexer, ExpandingIndexer, GroupbyIndexer, ) -from pandas.core.window.doc import ( - _shared_docs, - create_section_header, - kwargs_numeric_only, - numba_notes, - template_header, - template_pipe, - template_returns, - template_see_also, - window_agg_numba_parameters, - window_apply_parameters, -) from pandas.core.window.rolling import ( BaseWindowGroupby, RollingAndExpandingMixin, @@ -147,18 +128,63 @@ def _get_window_indexer(self) -> BaseIndexer: """ return ExpandingIndexer() - @doc( - _shared_docs["aggregate"], - see_also=dedent( - """ + def aggregate(self, func=None, *args, **kwargs): + """ + Aggregate using one or more operations over the specified axis. + + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a Series/Dataframe or when passed to + Series/Dataframe.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + See Also -------- DataFrame.aggregate : Similar DataFrame method. Series.aggregate : Similar Series method. - """ - ), - examples=dedent( - """ + + Notes + ----- + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. 
This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + A passed user-defined-function will be passed a Series for evaluation. + + If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. + Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) @@ -174,27 +200,36 @@ def _get_window_indexer(self) -> BaseIndexer: 1 1.666667 4.666667 7.666667 2 2.428571 5.428571 8.428571 """ - ), - klass="Series/Dataframe", - axis="", - ) - def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + def count(self, numeric_only: bool = False): + """ + Calculate the expanding count of non NaN observations. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.count : Aggregating count for Series. + DataFrame.count : Aggregating count for DataFrame. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) >>> ser.expanding().count() a 1.0 b 2.0 @@ -202,38 +237,8 @@ def aggregate(self, func=None, *args, **kwargs): d 4.0 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="count of non NaN observations", - agg_method="count", - ) - def count(self, numeric_only: bool = False): return super().count(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - window_apply_parameters, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) - >>> ser.expanding().apply(lambda s: s.max() - 2 * s.min()) - a -1.0 - b 0.0 - c 1.0 - d 2.0 - dtype: float64 - """ - ), - window_method="expanding", - aggregation_description="custom aggregation function", - agg_method="apply", - ) def apply( self, func: Callable[..., Any], @@ -243,6 +248,67 @@ def apply( args: tuple[Any, ...] | None = None, kwargs: dict[str, Any] | None = None, ): + """ + Calculate the expanding custom aggregation function. + + Parameters + ---------- + func : function + Must produce a single value from an ndarray input if ``raw=True`` + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + raw : bool, default False + * ``False`` : passes each row or column as a Series to the + function. 
+ * ``True`` : the passed function will receive ndarray objects instead. + + If you are just applying a NumPy reduction function this will + achieve much better performance. + + engine : str, default None + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + args : tuple, default None + Positional arguments to be passed into func. + + kwargs : dict, default None + Keyword arguments to be passed into func. + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.apply : Aggregating apply for Series. + DataFrame.apply : Aggregating apply for DataFrame. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) + >>> ser.expanding().apply(lambda s: s.max() - 2 * s.min()) + a -1.0 + b 0.0 + c 1.0 + d 2.0 + dtype: float64 + """ return super().apply( func, raw=raw, @@ -269,52 +335,144 @@ def pipe( ) -> T: ... @final - @Substitution( - klass="Expanding", - examples=""" - >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, - ... index=pd.date_range('2012-08-02', periods=4)) - >>> df - A - 2012-08-02 1 - 2012-08-03 2 - 2012-08-04 3 - 2012-08-05 4 - - To get the difference between each expanding window's maximum and minimum - value in one pass, you can do - - >>> df.expanding().pipe(lambda x: x.max() - x.min()) - A - 2012-08-02 0.0 - 2012-08-03 1.0 - 2012-08-04 2.0 - 2012-08-05 3.0""", - ) - @Appender(template_pipe) def pipe( self, func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], *args: Any, **kwargs: Any, ) -> T: + """ + Apply a ``func`` with arguments to this Expanding object and return its result. + + Use `.pipe` when you want to improve readability by chaining together + functions that expect Series, DataFrames, GroupBy, Rolling, Expanding or + Resampler + objects. + Instead of writing + + >>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 + >>> g = lambda x, arg1: x * 5 / arg1 + >>> f = lambda x: x**4 + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4) + ... ) + >>> h(g(f(df.rolling("2D")), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + + You can write + + >>> ( + ... df.rolling("2D").pipe(f).pipe(g, arg1=1).pipe(h, arg2=2, arg3=3) + ... ) # doctest: +SKIP + + which is much more readable. + + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this Expanding object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + Expanding object. + *args : iterable, optional + Positional arguments passed into `func`. 
+ **kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + + Returns + ------- + Expanding + The original object with the function `func` applied. + + See Also + -------- + Series.pipe : Apply a function with arguments to a series. + DataFrame.pipe: Apply a function with arguments to a dataframe. + apply : Apply function to each group instead of to the + full Expanding object. + + Notes + ----- + See more `here + `_ + + Examples + -------- + + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4) + ... ) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each expanding window's maximum and minimum + value in one pass, you can do + + >>> df.expanding().pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 2.0 + 2012-08-05 3.0 + """ return super().pipe(func, *args, **kwargs) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + def sum( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the expanding sum. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.sum : Aggregating sum for Series. + DataFrame.sum : Aggregating sum for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for extended + documentation and performance considerations for the Numba engine. 
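+
+        A usage sketch of the Numba path described above (requires the
+        optional ``numba`` dependency; the result matches the default Cython
+        engine):
+
+        >>> ser = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
+        >>> ser.expanding().sum(
+        ...     engine="numba",
+        ...     engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
+        ... )  # doctest: +SKIP
+        a     1.0
+        b     3.0
+        c     6.0
+        d    10.0
+        dtype: float64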
+ + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) >>> ser.expanding().sum() a 1.0 b 3.0 @@ -322,38 +480,65 @@ def pipe( d 10.0 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="sum", - agg_method="sum", - ) - def sum( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().sum( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([3, 2, 1, 4], index=['a', 'b', 'c', 'd']) + def max( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the expanding maximum. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.max : Aggregating max for Series. + DataFrame.max : Aggregating max for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for extended + documentation and performance considerations for the Numba engine. 
+ + Examples + -------- + >>> ser = pd.Series([3, 2, 1, 4], index=["a", "b", "c", "d"]) >>> ser.expanding().max() a 3.0 b 3.0 @@ -361,38 +546,65 @@ def sum( d 4.0 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="maximum", - agg_method="max", - ) - def max( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().max( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([2, 3, 4, 1], index=['a', 'b', 'c', 'd']) + def min( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the expanding minimum. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.min : Aggregating min for Series. + DataFrame.min : Aggregating min for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for extended + documentation and performance considerations for the Numba engine. 
+ + Examples + -------- + >>> ser = pd.Series([2, 3, 4, 1], index=["a", "b", "c", "d"]) >>> ser.expanding().min() a 2.0 b 2.0 @@ -400,38 +612,65 @@ def max( d 1.0 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="minimum", - agg_method="min", - ) - def min( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().min( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the expanding mean. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.mean : Aggregating mean for Series. + DataFrame.mean : Aggregating mean for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for extended + documentation and performance considerations for the Numba engine. 
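+
+        The expanding mean is the running sum divided by the running count; a
+        minimal sketch of that identity (toy data with no missing values):
+
+        >>> import numpy as np
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> bool(np.allclose(s.expanding().mean(), s.cumsum() / np.arange(1, len(s) + 1)))
+        True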
+ + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) >>> ser.expanding().mean() a 1.0 b 1.5 @@ -439,38 +678,65 @@ def min( d 2.5 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="mean", - agg_method="mean", - ) - def mean( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().mean( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - window_agg_numba_parameters(), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - numba_notes, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + def median( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the expanding median. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.median : Aggregating median for Series. + DataFrame.median : Aggregating median for DataFrame. + + Notes + ----- + See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for extended + documentation and performance considerations for the Numba engine. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) >>> ser.expanding().median() a 1.0 b 1.5 @@ -478,69 +744,12 @@ def mean( d 2.5 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="median", - agg_method="median", - ) - def median( - self, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().median( numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ - ddof : int, default 1 - Delta Degrees of Freedom. 
The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements.\n - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - window_agg_numba_parameters("1.4"), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "numpy.std : Equivalent method for NumPy array.\n", - template_see_also, - create_section_header("Notes"), - dedent( - """ - The default ``ddof`` of 1 used in :meth:`Series.std` is different - than the default ``ddof`` of 0 in :func:`numpy.std`. - - A minimum of one period is required for the rolling calculation.\n - """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - - >>> s.expanding(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 0.957427 - 4 0.894427 - 5 0.836660 - 6 0.786796 - dtype: float64 - """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="standard deviation", - agg_method="std", - ) def std( self, ddof: int = 1, @@ -548,42 +757,138 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): - return super().std( - ddof=ddof, - numeric_only=numeric_only, - engine=engine, - engine_kwargs=engine_kwargs, - ) + """ + Calculate the expanding standard deviation. - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements.\n + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.4.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + numpy.std : Equivalent method for NumPy array. + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.std : Aggregating std for Series. + DataFrame.std : Aggregating std for DataFrame. + + Notes + ----- + The default ``ddof`` of 1 used in :meth:`Series.std` is different + than the default ``ddof`` of 0 in :func:`numpy.std`. + + A minimum of one period is required for the rolling calculation. 
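+
+        ``std`` is the square root of ``var``; a quick consistency sketch on
+        toy data, with the default ``ddof=1``:
+
+        >>> import numpy as np
+        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
+        >>> bool(np.allclose(s.expanding(3).std(), s.expanding(3).var() ** 0.5, equal_nan=True))
+        True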
+ + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + + >>> s.expanding(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 0.957427 + 4 0.894427 + 5 0.836660 + 6 0.786796 + dtype: float64 """ - ).replace("\n", "", 1), - kwargs_numeric_only, - window_agg_numba_parameters("1.4"), - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "numpy.var : Equivalent method for NumPy array.\n", - template_see_also, - create_section_header("Notes"), - dedent( - """ + return super().std( + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + def var( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Calculate the expanding variance. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` + + .. versionadded:: 1.4.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + numpy.var : Equivalent method for NumPy array. + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.var : Aggregating var for Series. + DataFrame.var : Aggregating var for DataFrame. + + Notes + ----- The default ``ddof`` of 1 used in :meth:`Series.var` is different than the default ``ddof`` of 0 in :func:`numpy.var`. - A minimum of one period is required for the rolling calculation.\n - """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + A minimum of one period is required for the rolling calculation. + + Examples + -------- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.expanding(3).var() @@ -596,18 +901,6 @@ def std( 6 0.619048 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="variance", - agg_method="var", - ) - def var( - self, - ddof: int = 1, - numeric_only: bool = False, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): return super().var( ddof=ddof, numeric_only=numeric_only, @@ -615,26 +908,39 @@ def var( engine_kwargs=engine_kwargs, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def sem(self, ddof: int = 1, numeric_only: bool = False): + """ + Calculate the expanding standard error of mean. + + Parameters + ---------- ddof : int, default 1 Delta Degrees of Freedom. 
The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements.\n - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Notes"), - "A minimum of one period is required for the calculation.\n\n", - create_section_header("Examples"), - dedent( - """ + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.sem : Aggregating sem for Series. + DataFrame.sem : Aggregating sem for DataFrame. + + Notes + ----- + A minimum of one period is required for the calculation. + + Examples + -------- >>> s = pd.Series([0, 1, 2, 3]) >>> s.expanding().sem() @@ -644,29 +950,39 @@ def var( 3 0.745356 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="standard error of mean", - agg_method="sem", - ) - def sem(self, ddof: int = 1, numeric_only: bool = False): return super().sem(ddof=ddof, numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "scipy.stats.skew : Third moment of a probability density.\n", - template_see_also, - create_section_header("Notes"), - "A minimum of three periods is required for the rolling calculation.\n\n", - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([-1, 0, 2, -1, 2], index=['a', 'b', 'c', 'd', 'e']) + def skew(self, numeric_only: bool = False): + """ + Calculate the expanding unbiased skewness. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + scipy.stats.skew : Third moment of a probability density. + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.skew : Aggregating skew for Series. + DataFrame.skew : Aggregating skew for DataFrame. + + Notes + ----- + A minimum of three periods is required for the rolling calculation. 
+ + Examples + -------- + >>> ser = pd.Series([-1, 0, 2, -1, 2], index=["a", "b", "c", "d", "e"]) >>> ser.expanding().skew() a NaN b NaN @@ -675,36 +991,46 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): e 0.315356 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="unbiased skewness", - agg_method="skew", - ) - def skew(self, numeric_only: bool = False): return super().skew(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - "scipy.stats.kurtosis : Reference SciPy method.\n", - template_see_also, - create_section_header("Notes"), - "A minimum of four periods is required for the calculation.\n\n", - create_section_header("Examples"), - dedent( - """ + def kurt(self, numeric_only: bool = False): + """ + Calculate the expanding Fisher's definition of kurtosis without bias. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + scipy.stats.kurtosis : Reference SciPy method. + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.kurt : Aggregating kurt for Series. + DataFrame.kurt : Aggregating kurt for DataFrame. + + Notes + ----- + A minimum of four periods is required for the calculation. + + Examples + -------- The example below will show a rolling calculation with a window size of four matching the equivalent function call using `scipy.stats`. >>> arr = [1, 2, 3, 4, 999] >>> import scipy.stats - >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}") + >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") -1.200000 - >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}") + >>> print(f"{scipy.stats.kurtosis(arr, bias=False):.6f}") 4.999874 >>> s = pd.Series(arr) >>> s.expanding(4).kurt() @@ -715,30 +1041,31 @@ def skew(self, numeric_only: bool = False): 4 4.999874 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="Fisher's definition of kurtosis without bias", - agg_method="kurt", - ) - def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - dedent( - """ - GroupBy.first : Similar method for GroupBy objects. - Expanding.last : Method to get the last element in each window.\n + def first(self, numeric_only: bool = False): """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + Calculate the expanding First (left-most) element of the window. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + GroupBy.first : Similar method for GroupBy objects. + Expanding.last : Method to get the last element in each window. + + Examples + -------- The example below will show an expanding calculation with a window size of three. 
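For reference, the std, var, and sem docstrings inlined in this patch all describe the same ``ddof`` convention. A minimal sketch of how the three expanding aggregations relate (an illustration only, assuming pandas and numpy are importable; the series is the one used in the docstring examples above):

    import numpy as np
    import pandas as pd

    s = pd.Series([5, 5, 6, 7, 5, 5, 5])

    std = s.expanding(3).std()  # ddof=1 by default, unlike numpy.std
    var = s.expanding(3).var()

    # Expanding variance is the square of the expanding standard deviation
    assert np.allclose(std.dropna() ** 2, var.dropna())

    # sem matches std(ddof=1) / sqrt(count - 1), consistent with the
    # expanding().sem() doctest shown in the sem docstring above
    n = s.expanding(3).count()
    assert np.allclose(
        s.expanding(3).sem().dropna(), (std / np.sqrt(n - 1)).dropna()
    )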
@@ -751,30 +1078,31 @@ def kurt(self, numeric_only: bool = False): 4 0.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="First (left-most) element of the window", - agg_method="first", - ) - def first(self, numeric_only: bool = False): return super().first(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - dedent( - """ - GroupBy.last : Similar method for GroupBy objects. - Expanding.first : Method to get the first element in each window.\n + def last(self, numeric_only: bool = False): """ - ).replace("\n", "", 1), - create_section_header("Examples"), - dedent( - """ + Calculate the expanding Last (right-most) element of the window. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + GroupBy.last : Similar method for GroupBy objects. + Expanding.first : Method to get the first element in each window. + + Examples + -------- The example below will show an expanding calculation with a window size of three. @@ -787,25 +1115,25 @@ def first(self, numeric_only: bool = False): 4 4.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="Last (right-most) element of the window", - agg_method="last", - ) - def last(self, numeric_only: bool = False): return super().last(numeric_only=numeric_only) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def quantile( + self, + q: float, + interpolation: QuantileInterpolation = "linear", + numeric_only: bool = False, + ): + """ + Calculate the expanding quantile. + + Parameters + ---------- q : float Quantile to compute. 0 <= quantile <= 1. .. deprecated:: 2.1.0 This was renamed from 'quantile' to 'q' in version 2.1.0. - interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -815,18 +1143,28 @@ def last(self, numeric_only: bool = False): * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ - >>> ser = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f']) - >>> ser.expanding(min_periods=4).quantile(.25) + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.quantile : Aggregating quantile for Series. + DataFrame.quantile : Aggregating quantile for DataFrame. 
+ + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4, 5, 6], index=["a", "b", "c", "d", "e", "f"]) + >>> ser.expanding(min_periods=4).quantile(0.25) a NaN b NaN c NaN @@ -835,30 +1173,27 @@ def last(self, numeric_only: bool = False): f 2.25 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="quantile", - agg_method="quantile", - ) - def quantile( - self, - q: float, - interpolation: QuantileInterpolation = "linear", - numeric_only: bool = False, - ): return super().quantile( q=q, interpolation=interpolation, numeric_only=numeric_only, ) - @doc( - template_header, - ".. versionadded:: 1.4.0 \n\n", - create_section_header("Parameters"), - dedent( - """ - method : {{'average', 'min', 'max'}}, default 'average' + def rank( + self, + method: WindowingRankType = "average", + ascending: bool = True, + pct: bool = False, + numeric_only: bool = False, + ): + """ + Calculate the expanding rank. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + method : {'average', 'min', 'max'}, default 'average' How to rank the group of records that have the same value (i.e. ties): * average: average rank of the group @@ -870,16 +1205,25 @@ def quantile( pct : bool, default False Whether or not to display the returned rankings in percentile form. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """ + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.rank : Aggregating rank for Series. + DataFrame.rank : Aggregating rank for DataFrame. + + Examples + -------- >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.expanding().rank() 0 1.0 @@ -908,18 +1252,6 @@ def quantile( 5 3.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="rank", - agg_method="rank", - ) - def rank( - self, - method: WindowingRankType = "average", - ascending: bool = True, - pct: bool = False, - numeric_only: bool = False, - ): return super().rank( method=method, ascending=ascending, @@ -927,18 +1259,36 @@ def rank( numeric_only=numeric_only, ) - @doc( - template_header, - ".. versionadded:: 3.0.0 \n\n", - create_section_header("Parameters"), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """ + def nunique( + self, + numeric_only: bool = False, + ): + """ + Calculate the expanding nunique. + + .. versionadded:: 3.0.0 + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.nunique : Aggregating nunique for Series. + DataFrame.nunique : Aggregating nunique for DataFrame. 
+ + Examples + -------- >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.expanding().nunique() 0 1.0 @@ -949,24 +1299,22 @@ def rank( 5 5.0 dtype: float64 """ - ).replace("\n", "", 1), - window_method="expanding", - aggregation_description="nunique", - agg_method="nunique", - ) - def nunique( - self, - numeric_only: bool = False, - ): return super().nunique( numeric_only=numeric_only, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + """ + Calculate the expanding sample covariance. + + Parameters + ---------- other : Series or DataFrame, optional If not supplied then will default to self and produce pairwise output. @@ -980,18 +1328,27 @@ def nunique( ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - template_see_also, - create_section_header("Examples"), - dedent( - """\ - >>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) - >>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd']) + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. + + See Also + -------- + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.cov : Aggregating cov for Series. + DataFrame.cov : Aggregating cov for DataFrame. + + Examples + -------- + >>> ser1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) + >>> ser2 = pd.Series([10, 11, 13, 16], index=["a", "b", "c", "d"]) >>> ser1.expanding().cov(ser2) a NaN b 0.500000 @@ -999,18 +1356,6 @@ def nunique( d 3.333333 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="sample covariance", - agg_method="cov", - ) - def cov( - self, - other: DataFrame | Series | None = None, - pairwise: bool | None = None, - ddof: int = 1, - numeric_only: bool = False, - ): return super().cov( other=other, pairwise=pairwise, @@ -1018,11 +1363,18 @@ def cov( numeric_only=numeric_only, ) - @doc( - template_header, - create_section_header("Parameters"), - dedent( - """ + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + """ + Calculate the expanding correlation. + + Parameters + ---------- other : Series or DataFrame, optional If not supplied then will default to self and produce pairwise output. @@ -1035,23 +1387,30 @@ def cov( observations will be used. ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements.\n - """ - ).replace("\n", "", 1), - kwargs_numeric_only, - create_section_header("Returns"), - template_returns, - create_section_header("See Also"), - dedent( - """ + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype. 
+ + See Also + -------- cov : Similar method to calculate covariance. numpy.corrcoef : NumPy Pearson's correlation calculation. - """ - ).replace("\n", "", 1), - template_see_also, - create_section_header("Notes"), - dedent( - """ + Series.expanding : Calling expanding with Series data. + DataFrame.expanding : Calling expanding with DataFrames. + Series.corr : Aggregating corr for Series. + DataFrame.corr : Aggregating corr for DataFrame. + + Notes + ----- + This function uses Pearson's definition of correlation (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). @@ -1070,14 +1429,12 @@ def cov( columns on the second level. In the case of missing elements, only complete pairwise observations - will be used.\n - """ - ), - create_section_header("Examples"), - dedent( - """\ - >>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) - >>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd']) + will be used. + + Examples + -------- + >>> ser1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) + >>> ser2 = pd.Series([10, 11, 13, 16], index=["a", "b", "c", "d"]) >>> ser1.expanding().corr(ser2) a NaN b 1.000000 @@ -1085,18 +1442,6 @@ def cov( d 0.975900 dtype: float64 """ - ), - window_method="expanding", - aggregation_description="correlation", - agg_method="corr", - ) - def corr( - self, - other: DataFrame | Series | None = None, - pairwise: bool | None = None, - ddof: int = 1, - numeric_only: bool = False, - ): return super().corr( other=other, pairwise=pairwise, From e8013f362a2fb61029ac336853dbf6867dbfba2c Mon Sep 17 00:00:00 2001 From: Yaswanth Kumar <155723049+VYaswanthKumar@users.noreply.github.com> Date: Tue, 14 Oct 2025 01:41:44 +0530 Subject: [PATCH 09/15] TST: Replace ensure_clean_store with temp_file in test_time_series.py (#62652) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/pytables/test_time_series.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py index 726dd0d420347..b44f612d43f9e 100644 --- a/pandas/tests/io/pytables/test_time_series.py +++ b/pandas/tests/io/pytables/test_time_series.py @@ -6,29 +6,29 @@ from pandas import ( DataFrame, DatetimeIndex, + HDFStore, Series, _testing as tm, date_range, period_range, ) -from pandas.tests.io.pytables.common import ensure_clean_store pytestmark = pytest.mark.single_cpu @pytest.mark.parametrize("unit", ["us", "ns"]) -def test_store_datetime_fractional_secs(setup_path, unit): +def test_store_datetime_fractional_secs(temp_file, unit): dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) dti = DatetimeIndex([dt], dtype=f"M8[{unit}]") series = Series([0], index=dti) - with ensure_clean_store(setup_path) as store: + with HDFStore(temp_file) as store: store["a"] = series assert store["a"].index[0] == dt @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -def test_tseries_indices_series(setup_path): - with ensure_clean_store(setup_path) as store: +def test_tseries_indices_series(temp_file): + with HDFStore(temp_file) as store: idx = date_range("2020-01-01", periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) store["a"] = ser @@ -49,8 +49,8 @@ def test_tseries_indices_series(setup_path): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -def test_tseries_indices_frame(setup_path): - with ensure_clean_store(setup_path) as store: +def 
test_tseries_indices_frame(temp_file): + with HDFStore(temp_file) as store: idx = date_range("2020-01-01", periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx From 8c80f89bbd35e8289d5106586128ce0d10fffb32 Mon Sep 17 00:00:00 2001 From: BreezeLune <1066773178@qq.com> Date: Tue, 14 Oct 2025 07:12:41 +0800 Subject: [PATCH 10/15] STY: Add strict=True to zip() calls in pandas/_testing (#62681) --- pandas/_testing/_warnings.py | 4 +++- pandas/_testing/asserters.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 6ed8a7b9154d4..d2d1f5c0c273e 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -110,7 +110,9 @@ class for all warnings. To raise multiple types of exceptions, if isinstance(match, tuple) else (match,) * len(expected_warning) ) - for warning_type, warning_match in zip(expected_warning, match): + for warning_type, warning_match in zip( + expected_warning, match, strict=True + ): _assert_caught_expected_warnings( caught_warnings=w, expected_warning=warning_type, diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index c8f3bb6bd77d2..fc2cae5fadaa7 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -675,7 +675,7 @@ def _raise(left, right, err_msg) -> NoReturn: ) diff = 0 - for left_arr, right_arr in zip(left, right): + for left_arr, right_arr in zip(left, right, strict=True): # count up differences if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan): diff += 1 @@ -1447,7 +1447,7 @@ def assert_copy(iter1, iter2, **eql_kwargs) -> None: the same object. (Does not check that items in sequences are also not the same object) """ - for elem1, elem2 in zip(iter1, iter2): + for elem1, elem2 in zip(iter1, iter2, strict=True): assert_almost_equal(elem1, elem2, **eql_kwargs) msg = ( f"Expected object {type(elem1)!r} and object {type(elem2)!r} to be " From df69921fab7ff1313af8524068a779b02ac8a636 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Oct 2025 18:58:03 -0700 Subject: [PATCH 11/15] ERR: improve exception message for Series.replace (#62686) --- pandas/core/generic.py | 4 ++-- pandas/tests/series/methods/test_replace.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0154087b18399..6d703c398f055 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7627,8 +7627,8 @@ def replace( # Operate column-wise if self.ndim == 1: raise ValueError( - "Series.replace cannot use dict-like to_replace " - "and non-None value" + "Series.replace cannot specify both a dict-like " + "'to_replace' and a 'value'" ) mapping = { col: (to_rep, value) for col, to_rep in to_replace.items() diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f7383dd967cbe..5134ba445352b 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -486,7 +486,9 @@ def test_replace_only_one_dictlike_arg(self, fixed_now_ts): ser = pd.Series([1, 2, "A", fixed_now_ts, True]) to_replace = {0: 1, 2: "A"} value = "foo" - msg = "Series.replace cannot use dict-like to_replace and non-None value" + msg = ( + "Series.replace cannot specify both a dict-like 'to_replace' and a 'value'" + ) with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) From 2bf01c1927b1e8f2ab5751a56318789277e6ae57 Mon Sep 17 00:00:00 
2001 From: Aokizy2 <3441854632@qq.com> Date: Tue, 14 Oct 2025 23:54:44 +0800 Subject: [PATCH 12/15] CLN : Inline docstrings in pandas/core/window/resample.py (#62689) Co-authored-by: aokizy <14817191+aokizy2@user.noreply.gitee.com> --- pandas/core/resample.py | 444 +++++++++++++++++++++++++++++++--------- 1 file changed, 348 insertions(+), 96 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e8803b6f30fce..44304bcc7f388 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,7 +1,6 @@ from __future__ import annotations import copy -from textwrap import dedent from typing import ( TYPE_CHECKING, Concatenate, @@ -28,11 +27,6 @@ ) from pandas._typing import NDFrameT from pandas.errors import AbstractMethodError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, -) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.dtypes import ( @@ -53,12 +47,10 @@ ) from pandas.core.generic import ( NDFrame, - _shared_docs, ) from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _pipe_template, get_groupby, ) from pandas.core.groupby.grouper import Grouper @@ -114,6 +106,7 @@ DataFrame, Series, ) + from pandas.core.generic import NDFrame _shared_docs_kwargs: dict[str, str] = {} @@ -263,100 +256,183 @@ def pipe( ) -> T: ... @final - @Substitution( - klass="Resampler", - examples=""" - >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, - ... index=pd.date_range('2012-08-02', periods=4)) - >>> df - A - 2012-08-02 1 - 2012-08-03 2 - 2012-08-04 3 - 2012-08-05 4 - - To get the difference between each 2-day period's maximum and minimum - value in one pass, you can do - - >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A - 2012-08-02 1 - 2012-08-04 1""", - ) - @Appender(_pipe_template) def pipe( self, func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], *args: Any, **kwargs: Any, ) -> T: - return super().pipe(func, *args, **kwargs) - - _agg_see_also_doc = dedent( """ - See Also - -------- - DataFrame.groupby.aggregate : Aggregate using callable, string, dict, - or list of string/callables. - DataFrame.resample.transform : Transforms the Series on each group - based on the given function. - DataFrame.aggregate: Aggregate using one or more - operations over the specified axis. - """ - ) + Apply a ``func`` with arguments to this Resampler object and return its result. - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4, 5], - ... index=pd.date_range('20130101', periods=5, freq='s')) - >>> s - 2013-01-01 00:00:00 1 - 2013-01-01 00:00:01 2 - 2013-01-01 00:00:02 3 - 2013-01-01 00:00:03 4 - 2013-01-01 00:00:04 5 - Freq: s, dtype: int64 - - >>> r = s.resample('2s') - - >>> r.agg("sum") - 2013-01-01 00:00:00 3 - 2013-01-01 00:00:02 7 - 2013-01-01 00:00:04 5 - Freq: 2s, dtype: int64 - - >>> r.agg(['sum', 'mean', 'max']) - sum mean max - 2013-01-01 00:00:00 3 1.5 2 - 2013-01-01 00:00:02 7 3.5 4 - 2013-01-01 00:00:04 5 5.0 5 - - >>> r.agg({'result': lambda x: x.mean() / x.std(), - ... 'total': "sum"}) - result total - 2013-01-01 00:00:00 2.121320 3 - 2013-01-01 00:00:02 4.949747 7 - 2013-01-01 00:00:04 NaN 5 - - >>> r.agg(average="mean", total="sum") - average total - 2013-01-01 00:00:00 1.5 3 - 2013-01-01 00:00:02 3.5 7 - 2013-01-01 00:00:04 5.0 5 - """ - ) + Use `.pipe` when you want to improve readability by chaining together + functions that expect Series, DataFrames, GroupBy or Resampler objects. 
+ Instead of writing + + >>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 + >>> g = lambda x, arg1: x * 5 / arg1 + >>> f = lambda x: x**4 + >>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) + >>> h(g(f(df.groupby("group")), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + + You can write + + >>> ( + ... df.groupby("group").pipe(f).pipe(g, arg1=1).pipe(h, arg2=2, arg3=3) + ... ) # doctest: +SKIP + + which is much more readable. + + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this Resampler object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + Resampler object. + *args : iterable, optional + Positional arguments passed into `func`. + **kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + + Returns + ------- + any + The result of applying ``func`` to the Resampler object. + + See Also + -------- + Series.pipe : Apply a function with arguments to a series. + DataFrame.pipe: Apply a function with arguments to a dataframe. + apply : Apply function to each group instead of to the + full Resampler object. + + Notes + ----- + See more `here + `_ + + Examples + -------- + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4) + ... ) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample("2D").pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1 + """ + return super().pipe(func, *args, **kwargs) @final - @doc( - _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - klass="DataFrame", - axis="", - ) def aggregate(self, func=None, *args, **kwargs): + """ + Aggregate using one or more operations over the specified axis. + + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + See Also + -------- + DataFrame.groupby.aggregate : Aggregate using callable, string, dict, + or list of string/callables. + DataFrame.resample.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate: Aggregate using one or more + operations over the specified axis. + + Notes + ----- + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. 
+ + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + A passed user-defined-function will be passed a Series for evaluation. + + If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. + + Examples + -------- + >>> s = pd.Series( + ... [1, 2, 3, 4, 5], index=pd.date_range("20130101", periods=5, freq="s") + ... ) + >>> s + 2013-01-01 00:00:00 1 + 2013-01-01 00:00:01 2 + 2013-01-01 00:00:02 3 + 2013-01-01 00:00:03 4 + 2013-01-01 00:00:04 5 + Freq: s, dtype: int64 + + >>> r = s.resample("2s") + + >>> r.agg("sum") + 2013-01-01 00:00:00 3 + 2013-01-01 00:00:02 7 + 2013-01-01 00:00:04 5 + Freq: 2s, dtype: int64 + + >>> r.agg(["sum", "mean", "max"]) + sum mean max + 2013-01-01 00:00:00 3 1.5 2 + 2013-01-01 00:00:02 7 3.5 4 + 2013-01-01 00:00:04 5 5.0 5 + + >>> r.agg({"result": lambda x: x.mean() / x.std(), "total": "sum"}) + result total + 2013-01-01 00:00:00 2.121320 3 + 2013-01-01 00:00:02 4.949747 7 + 2013-01-01 00:00:04 NaN 5 + + >>> r.agg(average="mean", total="sum") + average total + 2013-01-01 00:00:00 1.5 3 + 2013-01-01 00:00:02 3.5 7 + 2013-01-01 00:00:04 5.0 5 + """ result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: how = func @@ -1236,25 +1312,107 @@ def max( return self._downsample("max", numeric_only=numeric_only, min_count=min_count) @final - @doc(GroupBy.first) def first( self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True, ): + """ + Compute the first non-null entry of each column. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + Returns + ------- + Series or DataFrame + First values within each group. + + See Also + -------- + core.resample.Resampler.last : Compute the last non-null value in each group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + + Examples + -------- + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) + >>> s + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> s.resample("MS").first() + 2023-01-01 1 + 2023-02-01 3 + Freq: MS, dtype: int64 + """ return self._downsample( "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna ) @final - @doc(GroupBy.last) def last( self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True, ): + """ + Compute the last non-null entry of each column. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + Returns + ------- + Series or DataFrame + Last of values within each group. + + See Also + -------- + core.resample.Resampler.first : Compute the first non-null value in each group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. 
+ + Examples + -------- + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) + >>> s + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> s.resample("MS").last() + 2023-01-01 2 + 2023-02-01 4 + Freq: MS, dtype: int64 + """ return self._downsample( "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna ) @@ -1544,8 +1702,41 @@ def sem( return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.ohlc) def ohlc(self): + """ + Compute open, high, low and close values of a group, excluding missing values. + + Returns + ------- + DataFrame + Open, high, low and close values within each group. + + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + DataFrame.resample : Resample time-series data. + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").ohlc() + open high low close + 2023-01-01 1 3 1 2 + 2023-02-01 4 5 3 5 + """ ax = self.ax obj = self._obj_with_exclusions if len(ax) == 0: @@ -1600,8 +1791,37 @@ def nunique(self): return self._downsample("nunique") @final - @doc(GroupBy.size) def size(self): + """ + Compute group sizes. + + Returns + ------- + Series + Number of rows in each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row + or column of a DataFrame. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3], + ... index=pd.DatetimeIndex(["2023-01-01", "2023-01-15", "2023-02-01"]), + ... ) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + dtype: int64 + >>> ser.resample("MS").size() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ result = self._downsample("size") # If the result is a non-empty DataFrame we stack to get a Series @@ -1620,8 +1840,40 @@ def size(self): return result @final - @doc(GroupBy.count) def count(self): + """ + Compute count of group, excluding missing values. + + Returns + ------- + Series or DataFrame + Count of values within each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row + or column of a DataFrame. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample("MS").count() + 2023-01-01 2 + 2023-02-01 2 + Freq: MS, dtype: int64 + """ result = self._downsample("count") if not len(self.ax): if self._selected_obj.ndim == 1: From fe3d799c6a2da2efa053601358da2b0de7ca946e Mon Sep 17 00:00:00 2001 From: Sumeet Bhatnagar <69593471+nemo-1999@users.noreply.github.com> Date: Tue, 14 Oct 2025 11:56:08 -0400 Subject: [PATCH 13/15] Remove ensure_clean_store reference from test_retain_attributes.py, test_read.py (#62688) --- pandas/tests/io/pytables/test_read.py | 9 ++++----- pandas/tests/io/pytables/test_retain_attributes.py | 6 +++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 7a3a7339e7809..70696a502a111 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -19,7 +19,6 @@ ) from pandas.tests.io.pytables.common import ( _maybe_remove, - ensure_clean_store, ) from pandas.io.pytables import TableIterator @@ -70,14 +69,14 @@ def test_read_missing_key_opened_store(tmp_path, setup_path): read_hdf(store, "k1") -def test_read_column(setup_path): +def test_read_column(temp_file): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) - with ensure_clean_store(setup_path) as store: + with HDFStore(temp_file) as store: _maybe_remove(store, "df") # GH 17912 @@ -153,7 +152,7 @@ def test_read_column(setup_path): def test_pytables_native_read(datapath): - with ensure_clean_store( + with HDFStore( datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" ) as store: d2 = store["detector/readout"] @@ -162,7 +161,7 @@ def test_pytables_native_read(datapath): @pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows") def test_pytables_native2_read(datapath): - with ensure_clean_store( + with HDFStore( datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" ) as store: str(store) diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py index ab8dcacc4b8d4..d685cf8eb68f7 100644 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, DatetimeIndex, + HDFStore, Series, _testing as tm, date_range, @@ -11,18 +12,17 @@ ) from pandas.tests.io.pytables.common import ( _maybe_remove, - ensure_clean_store, ) pytestmark = pytest.mark.single_cpu -def test_retain_index_attributes(setup_path, unit): +def test_retain_index_attributes(temp_file, unit): # GH 3499, losing frequency info on index recreation dti = date_range("2000-1-1", periods=3, freq="h", unit=unit) df = DataFrame({"A": Series(range(3), index=dti)}) - with ensure_clean_store(setup_path) as store: + with HDFStore(temp_file) as store: _maybe_remove(store, "data") store.put("data", df, format="table") From ef8ac3e89693e6a4cfe3b6f9a0d0da9b595b2201 Mon Sep 17 00:00:00 2001 From: mraabhijit Date: Fri, 17 Oct 2025 20:14:25 +0530 Subject: [PATCH 14/15] update ArrowDtype.itemsize --- pandas/core/dtypes/dtypes.py | 7 ++- pandas/tests/dtypes/test_dtypes.py | 90 ---------------------------- pandas/tests/extension/test_arrow.py | 85 ++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 93 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py 
index 648997a96d3c2..6d99f9df73282 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2328,13 +2328,14 @@ def itemsize(self) -> int: >>> dtype.itemsize # falls back to numpy dtype 1 """ + if pa.types.is_boolean(self.pyarrow_dtype): + return self.numpy_dtype.itemsize + # Use pyarrow itemsize for fixed-width data types # e.g. int32 -> 32 bits // 8 = 4 bytes try: - if pa.types.is_boolean(self.pyarrow_dtype): - return self.numpy_dtype.itemsize return self.pyarrow_dtype.bit_width // 8 - except (ValueError, AttributeError): + except (ValueError, AttributeError, NotImplementedError): return self.numpy_dtype.itemsize def construct_array_type(self) -> type_t[ArrowExtensionArray]: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 1a3e3f1d3ac2f..95be5f3768be6 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1259,93 +1259,3 @@ def test_categorical_nan_no_dtype_conversion(): expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]}) df.loc[0, "a"] = np.array([1]) tm.assert_frame_equal(df, expected) - - -@pytest.fixture -def pa(): - return pytest.importorskip("pyarrow") - - -@pytest.mark.parametrize( - "type_name, expected_size", - [ - # Integer types - ("int8", 1), - ("int16", 2), - ("int32", 4), - ("int64", 8), - ("uint8", 1), - ("uint16", 2), - ("uint32", 4), - ("uint64", 8), - # Floating point types - ("float16", 2), - ("float32", 4), - ("float64", 8), - # Boolean - ("bool_", 1), - # Date and timestamp types - ("date32", 4), - ("date64", 8), - ("timestamp", 8), - # Time types - ("time32", 4), - ("time64", 8), - # Decimal types - ("decimal128", 16), - ("decimal256", 32), - ], -) -def test_arrow_dtype_itemsize_fixed_width(pa, type_name, expected_size): - # GH 57948 - - parametric_type_map = { - "timestamp": pa.timestamp("ns"), - "time32": pa.time32("s"), - "time64": pa.time64("ns"), - "decimal128": pa.decimal128(38, 10), - "decimal256": pa.decimal256(76, 10), - } - - if type_name in parametric_type_map: - arrow_type = parametric_type_map.get(type_name) - else: - arrow_type = getattr(pa, type_name)() - dtype = pd.ArrowDtype(arrow_type) - - if type_name == "bool_": - expected_size = dtype.numpy_dtype.itemsize - - assert dtype.itemsize == expected_size, ( - f"{type_name} expected {expected_size}, got {dtype.itemsize} " - f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})" - ) - - -@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"]) -def test_arrow_dtype_itemsize_variable_width(pa, type_name): - # GH 57948 - - arrow_type = getattr(pa, type_name)() - dtype = pd.ArrowDtype(arrow_type) - - assert dtype.itemsize == dtype.numpy_dtype.itemsize - - -def test_arrow_dtype_error_fallback(pa, monkeypatch): - # GH 57948 - - dtype = pd.ArrowDtype(pa.int32()) - - class ErrorType: - id = None - - @property - def bit_width(self): - raise ValueError("Simulated Error") - - def to_pandas_dtype(self): - return Series([0]).dtype - - monkeypatch.setattr(dtype, "pyarrow_dtype", ErrorType()) - assert dtype.itemsize == dtype.numpy_dtype.itemsize diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c810f098f15cf..eed6db3b8f864 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3700,3 +3700,88 @@ def test_pow_with_all_na_float(): result = s.pow(2) expected = pd.Series([pd.NA, pd.NA], dtype="float64[pyarrow]") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + 
"type_name, expected_size", + [ + # Integer types + ("int8", 1), + ("int16", 2), + ("int32", 4), + ("int64", 8), + ("uint8", 1), + ("uint16", 2), + ("uint32", 4), + ("uint64", 8), + # Floating point types + ("float16", 2), + ("float32", 4), + ("float64", 8), + # Boolean + ("bool_", 1), + # Date and timestamp types + ("date32", 4), + ("date64", 8), + ("timestamp", 8), + # Time types + ("time32", 4), + ("time64", 8), + # Decimal types + ("decimal128", 16), + ("decimal256", 32), + ], +) +def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size): + # GH 57948 + + parametric_type_map = { + "timestamp": pa.timestamp("ns"), + "time32": pa.time32("s"), + "time64": pa.time64("ns"), + "decimal128": pa.decimal128(38, 10), + "decimal256": pa.decimal256(76, 10), + } + + if type_name in parametric_type_map: + arrow_type = parametric_type_map.get(type_name) + else: + arrow_type = getattr(pa, type_name)() + dtype = ArrowDtype(arrow_type) + + if type_name == "bool_": + expected_size = dtype.numpy_dtype.itemsize + + assert dtype.itemsize == expected_size, ( + f"{type_name} expected {expected_size}, got {dtype.itemsize} " + f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})" + ) + + +@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"]) +def test_arrow_dtype_itemsize_variable_width(type_name): + # GH 57948 + + arrow_type = getattr(pa, type_name)() + dtype = ArrowDtype(arrow_type) + + assert dtype.itemsize == dtype.numpy_dtype.itemsize + + +def test_arrow_dtype_error_fallback(monkeypatch): + # GH 57948 + + dtype = ArrowDtype(pa.int32()) + + class ErrorType: + id = None + + @property + def bit_width(self): + raise ValueError("Simulated Error") + + def to_pandas_dtype(self): + return pd.Series([0]).dtype + + monkeypatch.setattr(dtype, "pyarrow_dtype", ErrorType()) + assert dtype.itemsize == dtype.numpy_dtype.itemsize From de95b0af361f99f626baaf8e9c441e02d9d4ab3b Mon Sep 17 00:00:00 2001 From: mraabhijit Date: Fri, 17 Oct 2025 22:20:49 +0530 Subject: [PATCH 15/15] remove unnecessary test test_arrow_dtype_error_fallback --- pandas/tests/extension/test_arrow.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 420a69fe012b7..2949b7ccc7cf3 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3768,25 +3768,6 @@ def test_arrow_dtype_itemsize_variable_width(type_name): assert dtype.itemsize == dtype.numpy_dtype.itemsize -def test_arrow_dtype_error_fallback(monkeypatch): - # GH 57948 - - dtype = ArrowDtype(pa.int32()) - - class ErrorType: - id = None - - @property - def bit_width(self): - raise ValueError("Simulated Error") - - def to_pandas_dtype(self): - return pd.Series([0]).dtype - - monkeypatch.setattr(dtype, "pyarrow_dtype", ErrorType()) - assert dtype.itemsize == dtype.numpy_dtype.itemsize - - def test_cast_pontwise_result_decimal_nan(): # GH#62522 we don't want to get back null[pyarrow] here ser = pd.Series([], dtype="float64[pyarrow]")