-
Notifications
You must be signed in to change notification settings - Fork 125
feat: `to_gbq` uses Parquet by default; use `api_method="load_csv"` for the old behavior
#413
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
a794a33
64bae49
80d2cbc
1b29ced
0fea9fa
a2ea621
4aece99
ea3b8cc
9a798d4
96cc97f
fd52dff
f790cfe
845ff32
9150471
88d8676
7eb4387
0b158d3
93f851c
0d5d5c5
a26e73b
171230f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,9 @@ | |
"""Helper methods for loading data into BigQuery""" | ||
|
||
import io | ||
from typing import Any, Dict, Optional | ||
|
||
import pandas | ||
from google.cloud import bigquery | ||
|
||
from pandas_gbq.features import FEATURES | ||
|
@@ -52,45 +54,110 @@ def split_dataframe(dataframe, chunksize=None): | |
yield remaining_rows, chunk | ||
|
||
|
||
def load_chunks( | ||
client, | ||
dataframe, | ||
destination_table_ref, | ||
chunksize=None, | ||
schema=None, | ||
location=None, | ||
def load_parquet( | ||
client: bigquery.Client, | ||
dataframe: pandas.DataFrame, | ||
destination_table_ref: bigquery.TableReference, | ||
location: Optional[str], | ||
schema: Optional[Dict[str, Any]], | ||
): | ||
job_config = bigquery.LoadJobConfig() | ||
job_config.write_disposition = "WRITE_APPEND" | ||
job_config.source_format = "PARQUET" | ||
|
||
if schema is not None: | ||
schema = pandas_gbq.schema.remove_policy_tags(schema) | ||
job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) | ||
|
||
client.load_table_from_dataframe( | ||
dataframe, destination_table_ref, job_config=job_config, location=location, | ||
).result() | ||
|
||
|
||
def load_csv( | ||
client: bigquery.Client, | ||
dataframe: pandas.DataFrame, | ||
destination_table_ref: bigquery.TableReference, | ||
location: Optional[str], | ||
chunksize: Optional[int], | ||
schema: Optional[Dict[str, Any]], | ||
): | ||
job_config = bigquery.LoadJobConfig() | ||
job_config.write_disposition = "WRITE_APPEND" | ||
job_config.source_format = "CSV" | ||
job_config.allow_quoted_newlines = True | ||
|
||
# Explicit schema? Use that! | ||
if schema is not None: | ||
schema = pandas_gbq.schema.remove_policy_tags(schema) | ||
job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) | ||
# If not, let BigQuery determine schema unless we are encoding the CSV files ourselves. | ||
elif not FEATURES.bigquery_has_from_dataframe_with_csv: | ||
|
||
chunks = split_dataframe(dataframe, chunksize=chunksize) | ||
for remaining_rows, chunk in chunks: | ||
yield remaining_rows | ||
|
||
client.load_table_from_dataframe( | ||
chunk, destination_table_ref, job_config=job_config, location=location, | ||
).result() | ||
|
||
|
||
def load_csv_from_file( | ||
client: bigquery.Client, | ||
dataframe: pandas.DataFrame, | ||
destination_table_ref: bigquery.TableReference, | ||
location: Optional[str], | ||
chunksize: Optional[int], | ||
schema: Optional[Dict[str, Any]], | ||
): | ||
job_config = bigquery.LoadJobConfig() | ||
job_config.write_disposition = "WRITE_APPEND" | ||
job_config.source_format = "CSV" | ||
job_config.allow_quoted_newlines = True | ||
|
||
if schema is None: | ||
schema = pandas_gbq.schema.generate_bq_schema(dataframe) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Review comment: This may introduce a failure if the schema is None and the parquet conversion may be successful, but the actual BQ table schema type may not match the resultant conversion.

Reply: Interesting that our tests wouldn't have caught that. Do you have an example of a dataframe that demonstrates this?

Reply: FWIW, the reason we don't have this here is that the |
||
schema = pandas_gbq.schema.remove_policy_tags(schema) | ||
job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) | ||
|
||
schema = pandas_gbq.schema.remove_policy_tags(schema) | ||
job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) | ||
|
||
chunks = split_dataframe(dataframe, chunksize=chunksize) | ||
for remaining_rows, chunk in chunks: | ||
yield remaining_rows | ||
|
||
if FEATURES.bigquery_has_from_dataframe_with_csv: | ||
client.load_table_from_dataframe( | ||
chunk, destination_table_ref, job_config=job_config, location=location, | ||
try: | ||
chunk_buffer = encode_chunk(chunk) | ||
client.load_table_from_file( | ||
chunk_buffer, | ||
destination_table_ref, | ||
job_config=job_config, | ||
location=location, | ||
).result() | ||
finally: | ||
chunk_buffer.close() | ||
|
||
|
||
def load_chunks( | ||
client, | ||
dataframe, | ||
destination_table_ref, | ||
chunksize=None, | ||
schema=None, | ||
location=None, | ||
api_method="load_parquet", | ||
): | ||
if api_method == "load_parquet": | ||
load_parquet(client, dataframe, destination_table_ref, location, schema) | ||
# TODO: yield progress depending on result() with timeout | ||
return [0] | ||
elif api_method == "load_csv": | ||
if FEATURES.bigquery_has_from_dataframe_with_csv: | ||
return load_csv( | ||
client, dataframe, destination_table_ref, location, chunksize, schema | ||
) | ||
else: | ||
try: | ||
chunk_buffer = encode_chunk(chunk) | ||
client.load_table_from_file( | ||
chunk_buffer, | ||
destination_table_ref, | ||
job_config=job_config, | ||
location=location, | ||
).result() | ||
finally: | ||
chunk_buffer.close() | ||
return load_csv_from_file( | ||
client, dataframe, destination_table_ref, location, chunksize, schema | ||
) | ||
else: | ||
raise ValueError( | ||
f"got unexpected api_method: {api_method!r}, expected one of 'load_parquet', 'load_csv'" | ||
plamut marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) |
Uh oh!
There was an error while loading. Please reload this page.