Changes from all commits
32 commits
07275e7
remove expensive len() call
shuoweil Jul 24, 2025
f5a5ec1
add testcase
shuoweil Jul 24, 2025
f67710c
fix a typo
shuoweil Jul 24, 2025
39c9acf
change how row_count is updated
shuoweil Jul 25, 2025
e31d123
testcase still fails, need to merge in 1888
shuoweil Jul 29, 2025
32ca0b0
update the method of using PandasBatches.total_rows
shuoweil Jul 30, 2025
b2a4eaa
change tests in read_gbq_colab
shuoweil Aug 1, 2025
94314e5
polish comment
shuoweil Aug 1, 2025
477e33c
fix a test
shuoweil Aug 6, 2025
54b8e67
change code and update more testcases
shuoweil Aug 12, 2025
6ceb764
remove unneeded except
shuoweil Aug 14, 2025
16db3a5
add assert for total_rows
shuoweil Aug 14, 2025
fbd4155
get actual row_counts
shuoweil Aug 19, 2025
39fb4c6
avoid two query calls
shuoweil Aug 19, 2025
a1a8250
remove double query when displaying widget
shuoweil Aug 21, 2025
80ced0c
get row count directly
shuoweil Sep 13, 2025
9d235bb
restore notebook
shuoweil Sep 16, 2025
c9d7c06
restore notebook change
shuoweil Sep 16, 2025
97ebd2b
remove duplicated code
shuoweil Sep 18, 2025
0e0f886
minor updates
shuoweil Oct 2, 2025
fde8ca6
still have zero total rows issue
shuoweil Oct 3, 2025
f69016c
large datasets now get the correct row counts
shuoweil Oct 3, 2025
6740a7d
benchmark change
shuoweil Oct 3, 2025
14d3ddb
revert a benchmark
shuoweil Oct 7, 2025
2d063eb
revert executor change
shuoweil Oct 8, 2025
696b173
raise a NotImplementedError when the row count is None
shuoweil Oct 9, 2025
4b2c093
change return type
shuoweil Oct 11, 2025
501005e
Revert accidental change of dataframe.ipynb
shuoweil Oct 15, 2025
a03e8f0
remove unnecessary execution in benchmark
shuoweil Oct 15, 2025
f6ceed1
remove row_count check
shuoweil Oct 15, 2025
0209ba0
remove extra execute_result
shuoweil Oct 15, 2025
cded8e9
remove unnecessary tests
shuoweil Oct 15, 2025
7 changes: 5 additions & 2 deletions bigframes/core/blocks.py
@@ -99,7 +99,7 @@


@dataclasses.dataclass
class PandasBatches(Iterator[pd.DataFrame]):
class PandasBatches:
"""Interface for mutable objects with state represented by a block value object."""

def __init__(
@@ -124,6 +124,9 @@ def total_bytes_processed(self) -> Optional[int]:
def __next__(self) -> pd.DataFrame:
return next(self._dataframes)

def __iter__(self) -> Iterator[pd.DataFrame]:
return self


@dataclasses.dataclass()
class MaterializationOptions:
@@ -693,7 +696,7 @@ def to_pandas_batches(
page_size: Optional[int] = None,
max_results: Optional[int] = None,
allow_large_results: Optional[bool] = None,
) -> Iterator[pd.DataFrame]:
) -> PandasBatches:
"""Download results one message at a time.
page_size and max_results determine the size and number of batches,
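A note on this hunk: `PandasBatches` now satisfies the iterator protocol directly (`__iter__` returns `self`, `__next__` delegates to the wrapped generator) while also carrying result metadata such as `total_rows`. A minimal, self-contained sketch of that pattern — class and variable names here are illustrative, not the actual bigframes implementation:

```python
from typing import Iterator, Optional

import pandas as pd


class BatchesSketch:
    """Illustrative stand-in for PandasBatches: an iterator that carries metadata."""

    def __init__(self, frames: Iterator[pd.DataFrame], total_rows: Optional[int]):
        self._frames = frames
        self._total_rows = total_rows

    @property
    def total_rows(self) -> Optional[int]:
        # Row count reported by the query engine, if known; None otherwise.
        return self._total_rows

    def __next__(self) -> pd.DataFrame:
        return next(self._frames)

    def __iter__(self) -> Iterator[pd.DataFrame]:
        # Returning self lets the same object be used in `for` loops and `iter()`.
        return self


batches = BatchesSketch(
    iter([pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3]})]),
    total_rows=3,
)
assert batches.total_rows == 3
for frame in batches:
    print(len(frame.index))
```

Dropping the `Iterator[pd.DataFrame]` base class while keeping `__iter__`/`__next__` preserves duck-typed iteration, and the narrower annotation advertises the richer `PandasBatches` interface to callers.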
4 changes: 2 additions & 2 deletions bigframes/dataframe.py
@@ -1884,7 +1884,7 @@ def to_pandas_batches(
max_results: Optional[int] = None,
*,
allow_large_results: Optional[bool] = None,
) -> Iterable[pandas.DataFrame]:
) -> blocks.PandasBatches:
"""Stream DataFrame results to an iterable of pandas DataFrame.

page_size and max_results determine the size and number of batches,
@@ -1929,7 +1929,7 @@ def to_pandas_batches(
over the default size limit of 10 GB.

Returns:
Iterable[pandas.DataFrame]:
bigframes.core.blocks.PandasBatches:
An iterable of smaller dataframes which combine to
form the original dataframe. Results stream from bigquery,
see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable
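With the return annotation narrowed from `Iterable[pandas.DataFrame]` to `blocks.PandasBatches`, callers can read `total_rows` without issuing an extra query. A hedged usage sketch — the public-dataset table is a placeholder, and the `total_rows` attribute assumes this PR's `PandasBatches`:

```python
import bigframes.pandas as bpd

# Placeholder table; any readable BigQuery table works.
df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

batches = df.to_pandas_batches(page_size=100)
# total_rows may be None if the engine did not report a count.
print("total rows:", batches.total_rows)

first_page = next(iter(batches))
print("first page rows:", len(first_page.index))
```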
75 changes: 43 additions & 32 deletions bigframes/display/anywidget.py
@@ -25,10 +25,10 @@
import bigframes
import bigframes.display.html

# anywidget and traitlets are optional dependencies. We don't want the import of this
# module to fail if they aren't installed, though. Instead, we try to limit the surface that
# these packages could affect. This makes unit testing easier and ensures we don't
# accidentally make these required packages.
# anywidget and traitlets are optional dependencies. We don't want the import of
# this module to fail if they aren't installed, though. Instead, we try to
# limit the surface that these packages could affect. This makes unit testing
# easier and ensures we don't accidentally make these required packages.
try:
import anywidget
import traitlets
@@ -45,10 +45,18 @@


class TableWidget(WIDGET_BASE):
"""
An interactive, paginated table widget for BigFrames DataFrames.
"""An interactive, paginated table widget for BigFrames DataFrames.

This widget provides a user-friendly way to display and navigate through
large BigQuery DataFrames within a Jupyter environment.
"""

page = traitlets.Int(0).tag(sync=True)
page_size = traitlets.Int(0).tag(sync=True)
row_count = traitlets.Int(0).tag(sync=True)
table_html = traitlets.Unicode().tag(sync=True)
_initial_load_complete = traitlets.Bool(False).tag(sync=True)

def __init__(self, dataframe: bigframes.dataframe.DataFrame):
"""Initialize the TableWidget.

@@ -57,35 +65,38 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
"""
if not ANYWIDGET_INSTALLED:
raise ImportError(
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
"Please `pip install anywidget traitlets` or "
"`pip install 'bigframes[anywidget]'` to use TableWidget."
)

super().__init__()
self._dataframe = dataframe

# Initialize attributes that might be needed by observers FIRST
super().__init__()

# This flag prevents observers from firing during initialization.
# When traitlets like `page` and `page_size` are set in `__init__`, we
# don't want their corresponding `_..._changed` methods to execute
# until the widget is fully constructed.
self._initializing = True

# Initialize attributes that might be needed by observers first
self._table_id = str(uuid.uuid4())
self._all_data_loaded = False
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
self._cached_batches: List[pd.DataFrame] = []

# respect display options for initial page size
initial_page_size = bigframes.options.display.max_rows

# Initialize data fetching attributes.
self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)

# set traitlets properties that trigger observers
self.page_size = initial_page_size
# Respect display options for initial page size
self.page_size = bigframes.options.display.max_rows

# len(dataframe) is expensive, since it will trigger a
# SELECT COUNT(*) query. It is a must have however.
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
# before we get here so that the count might already be cached.
self.row_count = len(dataframe)
# The query issued by `to_pandas_batches()` already contains
# metadata about how many results there were. Use that to avoid
# doing an extra COUNT(*) query that `len(...)` would do.
self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
self.row_count = self._batches.total_rows or 0

# get the initial page
self._set_table_html()
self._initial_load_complete = True
self._initializing = False

@functools.cached_property
def _esm(self):
@@ -97,11 +108,6 @@ def _css(self):
"""Load CSS code from external file."""
return resources.read_text(bigframes.display, "table_widget.css")

page = traitlets.Int(0).tag(sync=True)
page_size = traitlets.Int(25).tag(sync=True)
row_count = traitlets.Int(0).tag(sync=True)
table_html = traitlets.Unicode().tag(sync=True)

@traitlets.validate("page")
def _validate_page(self, proposal: Dict[str, Any]) -> int:
"""Validate and clamp the page number to a valid range.
@@ -178,14 +184,15 @@ def _cached_data(self) -> pd.DataFrame:
return pd.DataFrame(columns=self._dataframe.columns)
return pd.concat(self._cached_batches, ignore_index=True)

def _reset_batches_for_new_page_size(self):
def _reset_batches_for_new_page_size(self) -> None:
"""Reset the batch iterator when page size changes."""
self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)

self._cached_batches = []
self._batch_iter = None
self._all_data_loaded = False

def _set_table_html(self):
def _set_table_html(self) -> None:
"""Sets the current html data based on the current page and page size."""
start = self.page * self.page_size
end = start + self.page_size
@@ -208,13 +215,17 @@ def _set_table_html(self):
)

@traitlets.observe("page")
def _page_changed(self, _change: Dict[str, Any]):
def _page_changed(self, _change: Dict[str, Any]) -> None:
"""Handler for when the page number is changed from the frontend."""
if self._initializing:
return
self._set_table_html()

@traitlets.observe("page_size")
def _page_size_changed(self, _change: Dict[str, Any]):
def _page_size_changed(self, _change: Dict[str, Any]) -> None:
"""Handler for when the page size is changed from the frontend."""
if self._initializing:
return
# Reset the page to 0 when page size changes to avoid invalid page states
self.page = 0

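The `_initializing` flag added here is a common traitlets pattern: observers fire on every trait assignment, including the ones made inside `__init__`, so a guard short-circuits them until construction finishes. A minimal sketch of the pattern, independent of bigframes (assumes only `traitlets` is installed):

```python
import traitlets


class Paginator(traitlets.HasTraits):
    page = traitlets.Int(0)
    page_size = traitlets.Int(10)

    def __init__(self, page_size: int):
        super().__init__()
        # Guard set before any observed trait is assigned.
        self._initializing = True
        self.page_size = page_size  # would otherwise trigger the observer
        self._initializing = False

    @traitlets.observe("page_size")
    def _page_size_changed(self, change):
        if self._initializing:
            return
        # Reset to the first page only for real, post-construction changes.
        self.page = 0


p = Paginator(page_size=25)
p.page = 3
p.page_size = 50  # observer fires now and resets the page
assert p.page == 0
```

Moving the trait declarations (`page`, `page_size`, `row_count`, `table_html`) to the top of the class body is organizational: declarations only define defaults and do not fire observers — only assignments like the ones in `__init__` do.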
6 changes: 6 additions & 0 deletions bigframes/display/table_widget.js
@@ -137,6 +137,12 @@ function render({ model, el }) {
}
});
model.on(Event.CHANGE_TABLE_HTML, handleTableHTMLChange);
model.on(`change:${ModelProperty.ROW_COUNT}`, updateButtonStates);
model.on(`change:_initial_load_complete`, (val) => {
if (val) {
updateButtonStates();
}
});

// Assemble the DOM
paginationContainer.appendChild(prevPage);
74 changes: 33 additions & 41 deletions notebooks/dataframes/anywidget_mode.ipynb
@@ -127,12 +127,24 @@
"id": "ce250157",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 6d85c081-49c7-408a-ab96-e0e9e5102419 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:6d85c081-49c7-408a-ab96-e0e9e5102419&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9e3e413eb0774a62818c58d217af8488",
"model_id": "31ba8e41e4ca4579b85409237cb7a566",
"version_major": 2,
"version_minor": 1
"version_minor": 0
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -171,6 +183,18 @@
"id": "6920d49b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 48cb4908-a59a-420f-8fcb-200d0d9187ef is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:48cb4908-a59a-420f-8fcb-200d0d9187ef&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
@@ -181,17 +205,16 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "df5e93f0d03f45cda67aa6da7f9ef1ae",
"model_id": "5d22f3f19e4140b0ba51869e97c3f690",
"version_major": 2,
"version_minor": 1
"version_minor": 0
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
"output_type": "display_data"
}
],
"source": [
@@ -253,53 +276,22 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "a9d5d13a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Small dataset pages: 1\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a4ec5248708442fabc59c446c78a1304",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Test with very small dataset\n",
"small_df = df.sort_values([\"name\", \"year\", \"state\"]).head(5)\n",
"small_widget = TableWidget(small_df)\n",
"print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
"small_widget"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4e5836b-c872-4a9c-b9ec-14f6f338176d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"display_name": "3.10.18",
"language": "python",
"name": "python3"
},
@@ -313,7 +305,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
"version": "3.10.18"
}
},
"nbformat": 4,
15 changes: 8 additions & 7 deletions tests/benchmark/read_gbq_colab/filter_output.py
@@ -13,9 +13,8 @@
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.pandas as bpd
import tests.benchmark.utils as utils

PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE

@@ -30,17 +29,19 @@ def filter_output(
# e.g. "{local_inline}" or "{local_large}"
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# Simulate the user filtering by a column and visualizing those results
df_filtered = df[df["col_bool_0"]]
rows, _ = df_filtered.shape
batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)

rows = batches_filtered.total_rows or 0
assert rows >= 0

# It's possible we don't have any pages at all, since we filtered out all
# matching rows.
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
first_page = next(iter(batches_filtered))
assert len(first_page.index) <= rows


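The substance of this benchmark change: `df.shape` runs a separate `COUNT(*)` job, while `total_rows` reuses metadata from the job `to_pandas_batches()` already ran. A before/after sketch under the same assumptions as the benchmark — a public dataset stands in for the benchmark's parameterized table, and `total_rows` assumes this PR's `PandasBatches`:

```python
import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")  # placeholder
df_filtered = df[df["number"] > 100]

# Before: two jobs — one for the row count, one for the first page.
# rows, _ = df_filtered.shape
# first_page = next(iter(df_filtered.to_pandas_batches(page_size=100)))

# After: one job — the batch query reports the count as metadata.
batches = df_filtered.to_pandas_batches(page_size=100)
rows = batches.total_rows or 0
first_page = next(iter(batches))
assert len(first_page.index) <= rows
```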
10 changes: 5 additions & 5 deletions tests/benchmark/read_gbq_colab/first_page.py
@@ -13,9 +13,8 @@
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.pandas
import tests.benchmark.utils as utils

PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE

@@ -27,9 +26,10 @@ def first_page(*, project_id, dataset_id, table_id):
f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
)

# Get number of rows (to calculate number of pages) and the first page.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert batches.total_rows is not None and batches.total_rows >= 0
first_page = next(iter(batches))
assert first_page is not None


if __name__ == "__main__":