First, don't use symbols as field names; use the word percentage rather than the symbol %.
Second, your mapping table (table2) should probably store the lower and upper bounds to make things simpler later on... (You can compute them with window functions if you can't correct the mapping table.)
Then you can use window functions on your data to number each row by its position within its own group.
Once done, it becomes a relatively simple join...
-- Assigns every Table1 row to one Table2 variable so that, inside each
-- (State, Region) group, variables are handed out in proportion to their
-- percentage.
--
-- percentage is treated as a 0-100 value (50 means 50%), which is how the
-- sample mapping rows on this page store it; the bounds are therefore divided
-- by 100.0 before being scaled by the group size. Without that division the
-- first mapping row of each group would claim every data row.
WITH
map AS
(
  SELECT
    *,
    -- Running total of percentage within the group, ordered by Variable.
    -- The explicit ROWS frame keeps the total strictly per-row even if two
    -- mapping rows share the same Variable (the default RANGE frame would give
    -- both rows the same total and make their bands overlap).
    SUM(percentage) OVER (
      PARTITION BY State, Region
      ORDER BY Variable
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS upper_bound
  FROM
    Table2  -- lower_bound is just upper_bound - percentage
),
data AS
(
  SELECT
    *,
    -- Zero-based position of the row inside its group. No ORDER BY is given,
    -- so which row receives which position is left to the engine.
    ROW_NUMBER() OVER (PARTITION BY State, Region) - 1 AS group_row_number,
    COUNT(*) OVER (PARTITION BY State, Region) AS group_size
  FROM
    Table1
)
-- NOTE(review): SELECT * returns every column of both CTEs, so State and
-- Region appear twice; narrow the list if the engine rejects duplicate names.
SELECT
  *
FROM
  data
INNER JOIN
  map
    ON  data.Region = map.Region
    AND data.State  = map.State
    -- Half-open band [lower, upper) expressed in row positions; 100.0 forces
    -- fractional division on engines that would otherwise divide as integers.
    AND data.group_row_number >= data.group_size * (map.upper_bound - map.percentage) / 100.0
    AND data.group_row_number <  data.group_size * map.upper_bound / 100.0
The following is for BigQuery Standard SQL.
A non-orthodox version that uses the RANGE_BUCKET function:
#standardSQL
-- Assigns a variable to every table1 row by bucketing the row's relative
-- position (0-100) within its (state, region) group against the cumulative
-- percentages of that group's variables.
WITH running_totals AS (
  -- Cumulative percentage per (state, region), accumulated in variable order.
  -- Multiplying by 1.0 makes the running total a floating-point value.
  SELECT
    state,
    region,
    variable,
    SUM(1.0 * percentage) OVER (
      PARTITION BY state, region
      ORDER BY variable
    ) AS cumulative_percentage
  FROM table2
),
buckets AS (
  -- One row per group: variables sorted by name, and a parallel array of
  -- their cumulative percentages used as bucket boundaries.
  SELECT
    state,
    region,
    ARRAY_AGG(variable ORDER BY variable) AS variables,
    ARRAY_AGG(cumulative_percentage ORDER BY variable) AS bins
  FROM running_totals
  GROUP BY state, region
)
SELECT
  user,
  state,
  region,
  variables[OFFSET(
    RANGE_BUCKET(
      -- zero-based row position, scaled to a 0-100 value within the group
      (ROW_NUMBER() OVER (PARTITION BY state, region) - 1)
        / (COUNT(*) OVER (PARTITION BY state, region)) * 100,
      bins
    )
  )] AS variable
FROM table1
INNER JOIN buckets
  USING (state, region)
-- ORDER BY user
Applied to the sample data from your question, the output is:
Row user state region variable
1 1 ORD 1 ABC
2 2 ORD 1 ABC
3 3 ORD 1 ABC
4 4 ORD 1 XYZ
5 5 ORD 1 XYZ
6 6 ORD 1 XYZ
7 7 IAD 2 ABC
8 8 IAD 2 ABC
9 9 IAD 2 ABC
10 10 IAD 2 ABC
11 11 IAD 2 AED
12 12 IAD 2 AED
13 13 IAD 2 XYZ
14 14 IAD 2 XYZ
Below is a more traditional version (for the sample data it produces the same output as the first version above):
#standardSQL
-- Assigns a variable to every table1 row with a plain range join: each
-- variable owns a band of row positions proportional to its percentage
-- within the (state, region) group.
WITH buckets AS (
  -- bin is the cumulative percentage up to and including this variable, so
  -- the variable covers [bin - percentage, bin) on a 0-100 scale.
  SELECT
    state,
    region,
    variable,
    percentage,
    SUM(percentage) OVER (PARTITION BY state, region ORDER BY variable) AS bin
  FROM table2
), table1_with_stats AS (
  -- position: zero-based index of the row inside its group (no ORDER BY is
  -- given, so the engine decides which row gets which index);
  -- size: number of rows in the group.
  SELECT
    user,
    state,
    region,
    ROW_NUMBER() OVER (win) - 1 AS position,
    COUNT(*) OVER (win) AS size
  FROM table1
  WINDOW win AS (PARTITION BY state, region)
)
SELECT
  user,
  state,
  region,
  variable
FROM table1_with_stats
INNER JOIN buckets
  USING (state, region)
-- Half-open band [lower, upper). A closed "BETWEEN lower AND upper - 1" only
-- works when size * percentage / 100 is a whole number; otherwise the row
-- sitting between two fractional boundaries matches no bucket and is dropped
-- (e.g. 7 rows split 50/50 loses position 3).
WHERE position >= size * (bin - percentage) / 100
  AND position <  size * bin / 100
-- ORDER BY user
You can test and experiment with the queries above using the CTEs below:
-- Sample data: table1 holds the users to be labelled, table2 holds the
-- percentage of each variable per (state, region).
WITH table1 AS (
  SELECT user, state, region
  FROM UNNEST([
    STRUCT(1 AS user, 'ORD' AS state, 1 AS region),
    STRUCT(2, 'ORD', 1),
    STRUCT(3, 'ORD', 1),
    STRUCT(4, 'ORD', 1),
    STRUCT(5, 'ORD', 1),
    STRUCT(6, 'ORD', 1),
    STRUCT(7, 'IAD', 2),
    STRUCT(8, 'IAD', 2),
    STRUCT(9, 'IAD', 2),
    STRUCT(10, 'IAD', 2),
    STRUCT(11, 'IAD', 2),
    STRUCT(12, 'IAD', 2),
    STRUCT(13, 'IAD', 2),
    STRUCT(14, 'IAD', 2)
  ])
), table2 AS (
  SELECT state, region, variable, percentage
  FROM UNNEST([
    STRUCT('ORD' AS state, 1 AS region, 'ABC' AS variable, 50 AS percentage),
    STRUCT('ORD', 1, 'XYZ', 50),
    STRUCT('IAD', 2, 'ABC', 50),
    STRUCT('IAD', 2, 'XYZ', 25),
    STRUCT('IAD', 2, 'AED', 25)
  ])
)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.