fix: missing first page in to_dataframe (#15575) · googleapis/google-cloud-python@397fb14 · GitHub
Skip to content

Commit 397fb14

Browse files
fix: missing first page in to_dataframe (#15575)
There was a bug in ReadRowsIterable.to_dataframe(), where it would attempt to use `.to_arrow()` to build a dataframe, before falling back to a generic method if `.to_arrow` was unsupported. This was a problem, because both methods read from a single `self.pages` iterator, so the first attempt would pull a page of data out of the feed, which would then be lost if `to_arrow()` is unsupported This PR fixes the bug by pulling a single item off of the stream and calling `to_arrow` on it individually, instead of reading off the self.pages generator directly Fixes #14900 --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent f0841b1 commit 397fb14

3 files changed

Lines changed: 173 additions & 24 deletions

File tree

packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/reader.py

Lines changed: 63 additions & 24 deletions

packages/google-cloud-bigquery-storage/tests/unit/test_reader_v1.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,3 +622,81 @@ def test_to_dataframe_by_page(class_under_test, mock_gapic_client):
622622
drop=True
623623
),
624624
)
625+
626+
627+
def test_to_dataframe_avro_no_lost_records(class_under_test, mock_gapic_client):
628+
"""Verify that to_dataframe() does not lose records for Avro streams.
629+
See: https://github.com/googleapis/google-cloud-python/issues/14900
630+
"""
631+
bq_columns = [
632+
{"name": "name", "type": "string"},
633+
{"name": "number", "type": "int64"},
634+
]
635+
avro_schema = _bq_to_avro_schema(bq_columns)
636+
637+
# 2 pages of data
638+
bq_blocks = [
639+
[{"name": "a", "number": 1}, {"name": "b", "number": 2}],
640+
[{"name": "c", "number": 3}, {"name": "d", "number": 4}],
641+
]
642+
643+
avro_blocks = _bq_to_avro_blocks(bq_blocks, avro_schema)
644+
mock_gapic_client.read_rows.return_value = iter(avro_blocks)
645+
646+
reader = class_under_test(mock_gapic_client, "name", 0, {})
647+
df = reader.to_dataframe()
648+
649+
assert len(df) == 4
650+
assert df["name"].tolist() == ["a", "b", "c", "d"]
651+
652+
653+
def test_to_dataframe_empty_stream_no_session(class_under_test, mock_gapic_client):
654+
"""Verify that to_dataframe() handles empty streams without a session safely.
655+
See: https://github.com/googleapis/google-cloud-python/issues/14900
656+
"""
657+
mock_gapic_client.read_rows.return_value = iter([])
658+
659+
reader = class_under_test(mock_gapic_client, "name", 0, {})
660+
df = reader.to_dataframe()
661+
assert len(df) == 0
662+
assert isinstance(df, pandas.DataFrame)
663+
664+
665+
def test_to_dataframe_empty_stream_with_session(class_under_test, mock_gapic_client):
666+
"""Verify that to_dataframe() handles empty streams with a session correctly.
667+
See: https://github.com/googleapis/google-cloud-python/issues/14900
668+
"""
669+
bq_columns = [{"name": "name", "type": "string"}]
670+
avro_schema = _bq_to_avro_schema(bq_columns)
671+
read_session = _generate_avro_read_session(avro_schema)
672+
673+
mock_gapic_client.read_rows.return_value = iter([])
674+
675+
reader = class_under_test(mock_gapic_client, "name", 0, {})
676+
it = reader.rows(read_session=read_session)
677+
df = it.to_dataframe()
678+
679+
assert len(df) == 0
680+
assert df.columns.tolist() == ["name"]
681+
682+
683+
def test_to_arrow_avro_consumes_first_page(class_under_test, mock_gapic_client):
684+
"""Verify that to_arrow() consumes the first page of an Avro stream if format is unknown.
685+
See: https://github.com/googleapis/google-cloud-python/issues/14900
686+
"""
687+
bq_columns = [{"name": "name", "type": "string"}]
688+
avro_schema = _bq_to_avro_schema(bq_columns)
689+
bq_blocks = [[{"name": "a"}], [{"name": "b"}]]
690+
avro_blocks = _bq_to_avro_blocks(bq_blocks, avro_schema)
691+
692+
mock_gapic_client.read_rows.return_value = iter(avro_blocks)
693+
694+
reader = class_under_test(mock_gapic_client, "name", 0, {})
695+
it = reader.rows()
696+
697+
with pytest.raises(NotImplementedError):
698+
it.to_arrow()
699+
700+
# Since read_session was not provided, to_arrow() had to consume the first message
701+
# to find out it was Avro. So offset should be 1.
702+
assert reader._offset == 1

packages/google-cloud-bigquery-storage/tests/unit/test_reader_v1_arrow.py

Lines changed: 32 additions & 0 deletions

0 commit comments

Comments
 (0)