I need some assistance figuring out how to query over the 10,000 row limit in the Google Analytics Reporting API v4. This is what I have so far:
{
"reportRequests":
[
{
"viewId": VIEW_ID,
"pageSize": "10000",
"dateRanges": [
{"startDate": "182daysAgo", "endDate": "today"}
],
"metrics": [
{"expression": "ga:sessions"}
],
"dimensions": [{"name": "ga:pagePath"}],
"dimensionFilterClauses": [
{
"filters": [
{
"dimensionName": "ga:medium",
"operator": "EXACT",
"expressions": ["organic"]
}
]
}
]
}
]
}
I've tried increasing the pageSize to 100000 but no dice. I did some additional research and I know I need to do something with nextPageToken but I'm not sure how to set it up. I'm trying to get the GA api to return at least 50,000 rows of data.
Any help would be much appreciated.
Cheers
Jordan,
The pagination is described in the Google Analytics Reporting API V4 documentation. I believe this StackOverflow answer might be relevant as well.
Thanks, Ilya
The following class may be useful to you. In particular, the function query_reporting_api_v4 loops over each page of the report until all the data is retrieved.
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.errors import HttpError
import logging
import pandas as pd
import time
import random
# OAuth scope: read-only access to Google Analytics reporting data.
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
# Path to the service-account JSON key file used for authentication.
KEY_FILE_LOCATION = 'gcp_client_secrets.json'
def initialize_analytics_reporting():
    """Build an authorized Analytics Reporting API V4 service object.

    Loads the service-account key from ``KEY_FILE_LOCATION`` and scopes
    the credentials with ``SCOPES``.

    Useful links:
    https://oauth2client.readthedocs.io/en/latest/source/oauth2client.service_account.html

    Returns:
        An authorized Analytics Reporting API V4 service object.
    """
    creds = ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE_LOCATION,
                                                             SCOPES)
    # Build and return the service object in one step.
    return build('analyticsreporting', 'v4', credentials=creds)
class GoogleAnalytics:
    """Interact with Google Analytics (GA) via the Reporting API v4.

    Wraps the ``reports().batchGet`` endpoint and transparently pages
    through results larger than the per-request row limit by following
    ``nextPageToken``, so callers can retrieve an arbitrary number of rows
    in a single call.
    """
    # GA API v4 will never return more than 100,000 rows in a single response.
    MAXRETURNEDROWS = 100000
    # GA API v4 will only allow you to pull 10 metrics in one report.
    MAXMETRICS = 10
    # GA API v4 will only allow you to pull 9 dimensions in one report.
    MAXDIMENSIONS = 9

    def __init__(self, view_id: str):
        """Create a GoogleAnalytics object.

        Initializes a GoogleAnalytics object and creates a service to use
        for interacting with your Google Analytics account.

        Args:
            view_id (str): id of the view in Google Analytics

        Example:
            ga = GoogleAnalytics(view_id='12345')
        """
        self.service = initialize_analytics_reporting()
        # The int() round-trip validates that the view id is numeric.
        self.view_id = str(int(view_id))

    def query_reporting_api_v4(self,
                               start_date: str,
                               end_date: str,
                               metrics_names_cs_list: str,
                               dimensions_names_cs_list: str,
                               dimension_filters_dict: dict = None) -> tuple:
        """Query the Google Analytics Core Reporting API (v4).

        Pages through the report by following ``nextPageToken`` until the
        API reports no further pages, so more than MAXRETURNEDROWS rows can
        be retrieved.

        Useful links:
            batchGet method: https://developers.google.com/analytics/devguides/reporting/core/v4/rest/v4/reports/batchGet
            Dimensions & Metrics Explorer: https://developers.google.com/analytics/devguides/reporting/core/dimsmets
            Limits & Quotas: https://developers.google.com/analytics/devguides/reporting/core/v4/limits-quotas
            https://developers.google.com/analytics/devguides/reporting/core/v4/resource-based-quota#sampling_threshold
            Data sampling: https://support.google.com/analytics/answer/2637192#zippy=%2Cin-this-article

        GA only allows you to pull 10 metrics and 9 dimensions at once.

        Args:
            start_date: (str) start date
            end_date: (str) end date
            metrics_names_cs_list: (str) metric names, comma-separated list
            dimensions_names_cs_list: (str) dimension names, comma-separated list
            dimension_filters_dict: (dict) dictionary of dimension filters where
                the keys are the dimension names to filter and the values are a
                list of REGEX search terms

        Returns:
            tuple of (reports_dict, data_df):
                reports_dict: (dict) raw API responses keyed 'dataset0', 'dataset1', ...
                data_df: (pandas.DataFrame) all pages concatenated into one dataframe
        """
        # Prepare metrics, dimensions, and dimension filters for the API call.
        metrics_body_list = self.prepare_metrics_body_list(metrics_names_cs_list)
        dimensions_body_list = self.prepare_dimensions_body_list(dimensions_names_cs_list)
        dimension_filters_body_list = self.prepare_dimension_filters_body_list(dimension_filters_dict)
        page_token_i = '0'
        reports_dict = dict()
        page_frames = []             # one DataFrame per page, concatenated at the end
        response_metadata_list = []  # data-quality metadata, kept for future reference
        i = 0
        while True:
            # GA API v4 outputs a list of dictionaries - one dict for each report.
            # This method only requests one report per call.
            reports_dict_i = self._batch_get_with_retry(start_date, end_date,
                                                        metrics_body_list,
                                                        dimensions_body_list,
                                                        page_token_i,
                                                        dimension_filters_body_list)
            # Turn the dict response into a pandas DataFrame.
            data_df_i = self.api_response_to_dataframe_v4(reports_dict_i)
            # Log possible issues with the data set pulled from GA and store
            # the metadata for future reference.
            response_metadata_dict_i = self.check_response_data_quality(reports_dict_i,
                                                                        start_date, end_date,
                                                                        metrics_names_cs_list,
                                                                        dimensions_names_cs_list,
                                                                        str(safe_shape(data_df_i)),
                                                                        dimension_filters_dict,
                                                                        page_token_i)
            response_metadata_list.append(response_metadata_dict_i)
            reports_dict["dataset{i}".format(i=i)] = reports_dict_i
            page_frames.append(data_df_i)
            # Follow the pagination token; its absence means this was the
            # last page (more reliable than comparing row counts against
            # MAXRETURNEDROWS, which breaks when the last page is exactly full).
            next_page_token = self.get_next_page_token(reports_dict_i)
            if not next_page_token:
                break
            log_msg = LogMessage()
            log_msg.add('message', 'Google Analytics API v4 returned a nextPageToken. The method will loop until we get all the data.')
            log_msg.add('i', i)
            log_msg.add('page_token_i', page_token_i)
            logging.warning(str(log_msg))
            page_token_i = next_page_token
            i = i + 1
        # DataFrame.append was removed in pandas 2.0; concatenate pages instead.
        data_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
        return (reports_dict, data_df)

    def _batch_get_with_retry(self, start_date, end_date,
                              metrics_body_list, dimensions_body_list,
                              page_token_i, dimension_filters_body_list,
                              max_retry_iter: int = 4):
        """Call reporting_api_batch_get with exponential back-off retries.

        Retries up to ``max_retry_iter`` times after the first failure
        (5 attempts total by default) and re-raises the last error when
        every attempt fails.

        Returns:
            (dict) raw batchGet response for one page
        """
        log_msg = LogMessage()
        for attempt in range(max_retry_iter + 1):
            try:
                return self.reporting_api_batch_get(start_date, end_date,
                                                    metrics_body_list,
                                                    dimensions_body_list,
                                                    page_token_i,
                                                    dimension_filters_body_list)
            except Exception as error:  # HttpError is a subclass of Exception
                log_msg.add('message', 'There has been an HTTP error trying to request data from the API. Will retry up to 5 times then fail if no retries are successful.')
                log_msg.add('has_failed_iter', attempt)
                log_msg.add('exception_message', str(error))
                logging.warning(str(log_msg))
                log_msg.log_message = ""
                if attempt == max_retry_iter:
                    raise
                time.sleep((2 ** attempt) + random.random())  # exponential back-off

    def prepare_metrics_body_list(self, metrics_names_cs_list: str) -> list:
        """Prepare the metrics body list for the API call.

        Args:
            metrics_names_cs_list: (str) metric names, comma-separated list

        Returns:
            metrics_body_list: (list) list of dictionaries for the metrics body
        """
        metrics_body_list = [{'expression': metric_name}
                             for metric_name in metrics_names_cs_list.split(",")]
        if len(metrics_body_list) > self.MAXMETRICS:
            log_msg = LogMessage()
            log_msg.add('message',
                        f"GA only allows you to pull {self.MAXMETRICS} metrics at once and you are asking for {len(metrics_body_list)}.")
            logging.warning(log_msg)
        return metrics_body_list

    def prepare_dimensions_body_list(self, dimensions_names_cs_list: str) -> list:
        """Prepare the dimensions body list for the API call.

        Args:
            dimensions_names_cs_list: (str) dimension names, comma-separated list

        Returns:
            dimensions_body_list: (list) list of dictionaries for the dimensions body
        """
        dimensions_body_list = [{'name': dimension_name}
                                for dimension_name in dimensions_names_cs_list.split(",")]
        if len(dimensions_body_list) > self.MAXDIMENSIONS:
            log_msg = LogMessage()
            log_msg.add('message',
                        f"GA only allows you to pull {self.MAXDIMENSIONS} dimensions at once and you are asking for {len(dimensions_body_list)}.")
            logging.warning(log_msg)
        return dimensions_body_list

    def prepare_dimension_filters_body_list(self, dimension_filters_dict: dict) -> list:
        """Prepare the dimension filters body list for the API call.

        Args:
            dimension_filters_dict: (dict) dimension filters, keyed by dimension
                name, value is a list of REGEX expressions

        Returns:
            dimension_filters_body_list: (list) list of dictionaries for the
                dimension filters body (empty when no filters are given)
        """
        if not dimension_filters_dict:
            return []
        return [{'dimensionName': dimension_name,
                 'operator': 'REGEXP',
                 'expressions': expressions}
                for dimension_name, expressions in dimension_filters_dict.items()]

    def reporting_api_batch_get(self, start_date, end_date,
                                metrics_body_list, dimensions_body_list,
                                page_token_i, dimension_filters_body_list):
        """Pull one page of data from the Google Analytics API v4.

        Returns:
            (dict) raw batchGet response containing a single report
        """
        reports_dict_i = self.service.reports().batchGet(
            body={
                'reportRequests': [
                    {
                        'viewId': self.view_id,
                        'dateRanges': [{'startDate': start_date, 'endDate': end_date}],
                        'metrics': metrics_body_list,
                        'dimensions': dimensions_body_list,
                        'pageSize': self.MAXRETURNEDROWS,
                        'pageToken': page_token_i,
                        'dimensionFilterClauses': [{
                            'filters': dimension_filters_body_list
                        }],
                        'samplingLevel': "LARGE"
                    }]
            }
        ).execute()
        # Log the raw response at debug level instead of printing to stdout.
        logging.debug(reports_dict_i)
        return reports_dict_i

    def api_response_to_dataframe_v4(self, response):
        """Convert a GA API v4 response to a dataframe.

        Args:
            response: (dict) response from GA

        Returns:
            df: (pandas.DataFrame) response from GA as a dataframe
        """
        data_list = []
        # Get report data.
        for report in response.get('reports', []):
            # Set column headers.
            column_header = report.get('columnHeader', {})
            dimension_headers = column_header.get('dimensions', [])
            metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
            rows = report.get('data', {}).get('rows', [])
            for row in rows:
                # Create a dict for each row.
                row_dict = {}
                dimensions = row.get('dimensions', [])
                date_range_values = row.get('metrics', [])
                # Fill dict with dimension header (key) and dimension value (value).
                for header, dimension in zip(dimension_headers, dimensions):
                    row_dict[header] = dimension
                # Fill dict with metric header (key) and metric value (value).
                for i, values in enumerate(date_range_values):
                    for metric, value in zip(metric_headers, values.get('values')):
                        # GA returns all metric values as strings; coerce to
                        # int when possible, then float, else keep the raw
                        # string (the old "',' in value -> float()" check
                        # crashed on locale-formatted values like '1,234').
                        try:
                            row_dict[metric.get('name')] = int(value)
                        except ValueError:
                            try:
                                row_dict[metric.get('name')] = float(value)
                            except ValueError:
                                row_dict[metric.get('name')] = value
                data_list.append(row_dict)
        df = pd.DataFrame(data_list)
        return df

    def get_next_page_token(self, response):
        """Get the next page token of a GA response.

        GA only returns results one "page" at a time. In order to get all
        results from a query, you need to run the API call until the next
        page token is absent.

        Returns:
            (str or None) the nextPageToken of the first report, or None
            when the response has no further pages.
        """
        # .get avoids the KeyError the old direct lookup raised when the
        # last page happened to be exactly full.
        return response['reports'][0].get('nextPageToken')

    def check_response_data_quality(self, api_response,
                                    start_date, end_date,
                                    metric_names_list,
                                    dimension_names_list,
                                    dataframe_shape,
                                    dimension_filters_dict=None,
                                    page_token=0):
        """Check the data quality of the response the GA API returned.

        Logs the following data quality issues:
        1) Golden data: indicates if the response to this request is golden or not.
           Data is golden when the exact same request will not produce any new
           results if asked at a later point in time.
        2) Sampling - based on https://developers.google.com/analytics/devguides/reporting/core/v4/basics#sampling
           samplesReadCounts[]: total number of samples read, one entry per date range
           samplingSpaceSizes[]: total number of samples present, one entry per date range

        Args:
            api_response (dict): response from GA API

        Returns:
            (dict) metadata describing the request and its data-quality flags
            (view id, dates, dimensions/metrics, golden/sampling status, shape)
        """
        # Initialize return variables.
        log_msg = LogMessage()
        is_data_golden = None
        samples_read_counts = None
        sampling_space_sizes = None
        # Check to see if the data is golden (if it is not, this means it
        # could change over time).
        try:
            is_data_golden = api_response['reports'][0]['data']['isDataGolden']
        except KeyError as key_ex:
            log_msg.add("method", "GoogleAnalytics.check_response_data_quality")
            log_msg.add("message", "The isDataGolden key does not exist")
            log_msg.add("exception_msg", str(key_ex))
            logging.warning(log_msg)
            log_msg.log_message = ""
        else:
            if not is_data_golden:
                log_msg.add("message", "This data set is not golden (data is golden when the exact same request will not produce any new results if asked at a later point in time).")
                logging.warning(log_msg)
                log_msg.log_message = ""
        # Check to see if the data set is sampled.
        try:
            samples_read_counts = api_response['reports'][0]['data']['samplesReadCounts']
            sampling_space_sizes = api_response['reports'][0]['data']['samplingSpaceSizes']
        except KeyError:  # narrowed from a bare except: only missing keys are expected
            log_msg.add('message', 'This data set is not sampled! Yay!!! :)')
            logging.debug(log_msg)
            log_msg.log_message = ""
        else:
            log_msg.add('message', 'This data set is sampled!')
            log_msg.add('samples_read_counts', str(samples_read_counts))
            log_msg.add('sampling_space_sizes', str(sampling_space_sizes))
            logging.warning(log_msg)
        return {"ga_view_id": self.view_id,
                "start_date": start_date,
                "end_date": end_date,
                "dimension_names_list": dimension_names_list,
                "metric_names_list": metric_names_list,
                "dimension_filters": dimension_filters_dict,
                "page_token": page_token,
                "is_data_golden": is_data_golden,
                "samples_read_counts": samples_read_counts,
                "sampling_space_sizes": sampling_space_sizes,
                "dataframe_shape": dataframe_shape}
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. Any questions, please contact: yoyou2525@163.com.