I have a list of CSV files, and I want to copy their rows and push them to BigQuery sequentially. At the moment, I am using pandas to read the CSV files and the `to_gbq`
method to get the data into BigQuery. However, since the files are big (a few GB each), I want to ingest the data in batch mode to avoid memory errors.
I have written the following updated function; the BigQuery client (`bq-client`) seems to be faster than `to_gbq`.
from google.cloud import bigquery
import pandas
from tqdm import tqdm
import warnings
# Surface every (Pending)DeprecationWarning instead of Python's default
# "show once per location" policy, so upcoming API removals stay visible.
# simplefilter() prepends, so applying Pending first leaves the
# DeprecationWarning entry in front — same final order as the original.
for _warn_category in (PendingDeprecationWarning, DeprecationWarning):
    warnings.simplefilter("always", category=_warn_category)
def df_to_bq(df, table_id, table_schema, batch_size=None):
    """Load a DataFrame into a BigQuery table, optionally in row batches.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to load.
    table_id : str
        Fully qualified target table, "project.dataset.table".
    table_schema : list[bigquery.SchemaField]
        Explicit schema for the load job.
    batch_size : int or None
        When given, load consecutive slices of this many rows as separate
        jobs; when None, load the whole frame in a single job.

    Returns
    -------
    bigquery.LoadJob or None
        The last completed load job (None only if `df` is empty and
        `batch_size` is set, so no job was submitted).
    """
    client = bigquery.Client(project='high-theme-12435')
    # NOTE(review): SourceFormat.CSV makes the client serialize the frame to
    # CSV before upload; the default (parquet) path is usually faster —
    # confirm CSV is intentional.
    job_config = bigquery.LoadJobConfig(
        schema=table_schema,
        source_format=bigquery.SourceFormat.CSV,
    )
    job = None
    if batch_size is None:  # `is None`, not `== None` (PEP 8)
        job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
        job.result()  # block until the load finishes so errors surface here
        return job
    n_batches = (len(df) + batch_size - 1) // batch_size  # ceiling division
    for batch_no, start in tqdm(enumerate(range(0, len(df), batch_size)), total=n_batches):
        batch_df = df.iloc[start:start + batch_size]
        job = client.load_table_from_dataframe(batch_df, table_id, job_config=job_config)
        # Wait for each batch before submitting the next: the original never
        # called .result(), so failed loads went unnoticed and jobs piled up
        # concurrently instead of running sequentially.
        job.result()
        print(f"### DUMP to BQ done for batch {batch_no}. ({start} to {start + len(batch_df)}.) ###")
    return job
# Target-table schema as (column name, BigQuery type) pairs, expanded into
# SchemaField objects for the load job.
_SCHEMA_SPEC = [
    ("col1", "INTEGER"),
    ("col2", "STRING"),
    ("col3", "TIMESTAMP"),
    ("col4", "FLOAT"),
]
table_schema = [bigquery.SchemaField(col_name, bq_type) for col_name, bq_type in _SCHEMA_SPEC]
import pandas as pd
import datetime as dt
from dateutil import parser
# NOTE(review): read_csv loads the entire file into memory, which defeats
# the stated goal of avoiding memory errors on multi-GB files — consider
# read_csv('test.csv', chunksize=...) and pushing each chunk separately.
df = pd.read_csv('test.csv')
def from_iso_date(date_str):
    """Parse an ISO-8601 date string with dateutil; map missing values to None.

    Bug fixed: pandas.read_csv represents empty cells as float('nan'), which
    is *truthy*, so the original `if not date_str` check let NaN through to
    `parser.parse`, raising TypeError. None, NaN and '' now all return None.
    """
    if date_str is None:
        return None
    # NaN is the only value not equal to itself — catches the float('nan')
    # pandas uses for blank CSV cells without an extra import.
    if isinstance(date_str, float) and date_str != date_str:
        return None
    if not date_str:
        return None
    return parser.parse(date_str)
# NOTE(review): pd.to_datetime('now') is deprecated in recent pandas
# (its meaning shifted from UTC "now" to local "now");
# pd.Timestamp.now() / pd.Timestamp.utcnow() is the explicit replacement —
# confirm which timezone the TIMESTAMP column expects.
df['timecol'] = pd.to_datetime('now')
# Row-wise parse of col3 into datetimes. pd.to_datetime(df['col3']) would be
# vectorized and faster, but .apply keeps dateutil's lenient parsing.
df['col3'] = df['col3'].apply(from_iso_date)
table_id = 'high-theme-12435.test.test_table'
# Push the frame in 1000-row batches to the target table.
df_to_bq(df, table_id, table_schema, batch_size = 1000)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.