modifying sql to do a join with the pg table after creating a temp table #146

Merged · 4 commits · May 2, 2024
45 changes: 33 additions & 12 deletions core/replay/report_gen.py
@@ -369,19 +369,40 @@ def unload(unload_location, iam_role, cluster, user, replay):
     query = re.sub(r"{{END_TIME}}", f"'{cluster.get('end_time')}'", query)
     parsed_location = f"s3://{unload_location.get('bucket_name')}/{unload_location.get('prefix')}"
 
-    # format unload query with actual query from sql/
-    unload_query = (
-        f"unload ($${query}$$) to '{parsed_location}analysis/{replay}/raw_data/"
-        f"{query_name}' iam_role '{iam_role}' CSV header allowoverwrite parallel off;"
-    )
-    try:
-        cursor.execute(unload_query)  # execute unload
-    except Exception as e:
-        logger.error(
-            f"Could not unload {query_name} results. Confirm IAM permissions include UNLOAD "
-            f"access for Redshift. {e}"
-        )
-        exit(-1)
+    # create temp table first for queries that join with pg_user
+    if "CREATE TEMP TABLE" in query:
+        query_split = query.split('\n\n')
+        try:
+            # first statement: CREATE TEMP TABLE ... AS (...)
+            cursor.execute(query_split[0])
+
+            # second statement: the SELECT that joins the temp table with pg_user
+            unload_query = (
+                f"unload ($${query_split[1]}$$) to '{parsed_location}analysis/{replay}/raw_data/"
+                f"{query_name}' iam_role '{iam_role}' CSV header allowoverwrite parallel off;"
+            )
+            cursor.execute(unload_query)
+
+        except Exception as e:
+            logger.error(
+                f"Could not execute query {query_name}. "
+                f"Error: {e}"
+            )
+            exit(-1)
+
+    else:
+        # format unload query with actual query from sql/
+        unload_query = (
+            f"unload ($${query}$$) to '{parsed_location}analysis/{replay}/raw_data/"
+            f"{query_name}' iam_role '{iam_role}' CSV header allowoverwrite parallel off;"
+        )
+        try:
+            cursor.execute(unload_query)  # execute unload
+        except Exception as e:
+            logger.error(
+                f"Could not unload {query_name} results. Confirm IAM permissions include UNLOAD "
+                f"access for Redshift. {e}"
+            )
+            exit(-1)
 
     logger.info(f"Query results available in {unload_location.get('url')}")
     return queries
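For readers skimming the hunk: the new branch relies on a strict file-format contract. A minimal sketch of that contract (not code from this PR; split_temp_table_query and the sample SQL are illustrative stand-ins):

# Sketch of the convention the new branch depends on: a report .sql file that
# needs pg_user contains exactly two statements separated by a single blank
# line -- the CREATE TEMP TABLE, then the SELECT to be unloaded.
def split_temp_table_query(query: str) -> tuple[str, str]:
    parts = query.split("\n\n")  # same split report_gen.py performs
    if len(parts) != 2:
        raise ValueError("expected DDL and SELECT separated by one blank line")
    return parts[0], parts[1]

sample = (
    "CREATE TEMP TABLE t AS (SELECT user_id FROM sys_query_history);\n"
    "\n"
    "SELECT b.usename FROM t LEFT JOIN pg_user b ON t.user_id = b.usesysid;"
)
ddl, report_select = split_temp_table_query(sample)

An extra blank line anywhere in the file would push part of the SELECT into query_split[2] and break the unload, which is why the reworked .sql files below keep each statement free of internal blank lines.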
54 changes: 34 additions & 20 deletions core/sql/aborted_queries.sql
@@ -1,20 +1,34 @@
-SELECT
-    q.user_id as "userid",
-    TRIM(u.usename) AS usename
-    ,case when q.result_cache_hit = 't' then 'Result Cache' else 'Default queue' end as "queue"
-    ,date_trunc('hour', q.start_time) as "period"
-    ,q.transaction_id as "xid"
-    ,q.query_id as "query"
-    ,q.query_text::char(50) as "querytxt"
-    ,q.queue_time / 1000000.00 as "queue_s"
-    ,q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
-    ,case when q.status = 'failed' then 1 else 0 end "aborted"
-    ,q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
-FROM sys_query_history q
-LEFT JOIN pg_user u ON u.usesysid = q.user_id
-WHERE q.user_id > 1
-  AND q.start_time >={{START_TIME}}
-  AND q.start_time <={{END_TIME}}
-  AND q.query_text LIKE '%replay_start%'
-  AND q.status = 'failed'
-ORDER BY total_elapsed_s DESC;
+/*AbortedQueries*/
+CREATE TEMP TABLE aborted_queries AS (
+    SELECT q.user_id as "userid"
+         , case when q.result_cache_hit = 't' then 'Result Cache' else 'Default queue' end as "queue"
+         , date_trunc('hour', q.start_time) as "period"
+         , q.transaction_id as "xid"
+         , q.query_id as "query"
+         , q.query_text::char(50) as "querytxt"
+         , q.queue_time / 1000000.00 as "queue_s"
+         , q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
+         , case when q.status = 'failed' then 1 else 0 end "aborted"
+         , q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
+    FROM sys_query_history q
+    WHERE q.user_id > 1
+      AND q.start_time >={{START_TIME}}
+      AND q.start_time <={{END_TIME}}
+      AND q.query_text LIKE '%replay_start%'
+      AND q.status = 'failed'
+);
+
+SELECT a.userid,
+       b.usename,
+       a.queue,
+       a.period,
+       a.xid,
+       a.query,
+       a.querytxt,
+       a.queue_s,
+       a.exec_time_s,
+       a.aborted,
+       a.total_elapsed_s
+FROM aborted_queries a
+LEFT JOIN pg_user b ON a.userid = b.usesysid
+ORDER BY a.total_elapsed_s DESC;
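Because the file is now two statements, both must run on the same session: the temp table is session-scoped, so the second statement's join against pg_user only resolves if the same cursor created aborted_queries. A hedged sketch of running this file end to end (connection parameters and timestamps are illustrative assumptions; redshift_connector is assumed available, and any DB-API cursor behaves the same way):

import re
import redshift_connector  # assumed driver, not pinned by this repo snippet

conn = redshift_connector.connect(
    host="example.abc123.us-east-1.redshift.amazonaws.com",  # hypothetical
    database="dev",        # hypothetical
    user="awsuser",        # hypothetical
    password="...",        # hypothetical
)
cursor = conn.cursor()

sql = open("core/sql/aborted_queries.sql").read()
# report_gen.py substitutes the placeholders the same way before executing
sql = re.sub(r"{{START_TIME}}", "'2024-05-01 00:00:00'", sql)
sql = re.sub(r"{{END_TIME}}", "'2024-05-02 00:00:00'", sql)

ddl, report_select = sql.split("\n\n")
cursor.execute(ddl)            # CREATE TEMP TABLE aborted_queries AS (...)
cursor.execute(report_select)  # join the session's temp table with pg_user
rows = cursor.fetchall()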
132 changes: 66 additions & 66 deletions core/sql/cluster_level_metrics.sql
@@ -1,70 +1,70 @@
 /*ClusterLevelMetrics*/
 WITH queries AS
-(
-select
-    q.user_id as "userid"
-    ,date_trunc('hour', q.start_time) as "period"
-    ,q.transaction_id as "xid"
-    ,q.query_id as "query"
-    ,q.query_text::char(50) as "querytxt"
-    ,q.queue_time / 1000000.00 as "queue_s"
-    ,q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
-    ,case when q.status = 'failed' then 1 else 0 end "aborted"
-    ,q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
-FROM sys_query_history q
-WHERE q.user_id > 1
-  AND q.start_time >= {{START_TIME}}
-  AND q.start_time <= {{END_TIME}}
-  AND q.query_text LIKE '%replay_start%'
-  AND q.status != 'failed'
-),
-elapsed_time AS
-(
-SELECT 'Query Latency' AS measure_type,
-       COUNT(*) AS query_count,
-       ROUND(PERCENTILE_CONT (0.25) WITHIN GROUP(ORDER BY total_elapsed_s),2) AS p25_s,
-       ROUND(PERCENTILE_CONT (0.50) WITHIN GROUP(ORDER BY total_elapsed_s),2) AS p50_s,
-       ROUND(PERCENTILE_CONT (0.75) WITHIN GROUP(ORDER BY total_elapsed_s),2) AS p75_s,
-       ROUND(PERCENTILE_CONT (0.90) WITHIN GROUP(ORDER BY total_elapsed_s),2) AS p90_s,
-       ROUND(PERCENTILE_CONT (0.95) WITHIN GROUP(ORDER BY total_elapsed_s),2) AS p95_s,
-       ROUND(PERCENTILE_CONT (0.99) WITHIN GROUP(ORDER BY total_elapsed_s),2) AS p99_s,
-       MAX(total_elapsed_s) AS max_s,
-       AVG(total_elapsed_s) AS avg_s,
-       stddev(total_elapsed_s) AS std_s
-FROM queries
-GROUP BY 1
-),
-exec_time AS
-(
-SELECT 'Execution Time' AS measure_type,
-       COUNT(*) AS query_count,
-       ROUND(PERCENTILE_CONT (0.25) WITHIN GROUP(ORDER BY exec_time_s),2) AS p25_s,
-       ROUND(PERCENTILE_CONT (0.50) WITHIN GROUP(ORDER BY exec_time_s),2) AS p50_s,
-       ROUND(PERCENTILE_CONT (0.75) WITHIN GROUP(ORDER BY exec_time_s),2) AS p75_s,
-       ROUND(PERCENTILE_CONT (0.90) WITHIN GROUP(ORDER BY exec_time_s),2) AS p90_s,
-       ROUND(PERCENTILE_CONT (0.95) WITHIN GROUP(ORDER BY exec_time_s),2) AS p95_s,
-       ROUND(PERCENTILE_CONT (0.99) WITHIN GROUP(ORDER BY exec_time_s),2) AS p99_s,
-       MAX(exec_time_s) AS max_s,
-       AVG(exec_time_s) AS avg_s,
-       stddev(exec_time_s) AS std_s
-FROM queries
-GROUP BY 1
-),
-queue_time AS
-(
-SELECT 'Queue Time' AS measure_type,
-       COUNT(*) AS query_count,
-       ROUND(PERCENTILE_CONT (0.25) WITHIN GROUP(ORDER BY queue_s),2) AS p25_s,
-       ROUND(PERCENTILE_CONT (0.50) WITHIN GROUP(ORDER BY queue_s),2) AS p50_s,
-       ROUND(PERCENTILE_CONT (0.75) WITHIN GROUP(ORDER BY queue_s),2) AS p75_s,
-       ROUND(PERCENTILE_CONT (0.90) WITHIN GROUP(ORDER BY queue_s),2) AS p90_s,
-       ROUND(PERCENTILE_CONT (0.95) WITHIN GROUP(ORDER BY queue_s),2) AS p95_s,
-       ROUND(PERCENTILE_CONT (0.99) WITHIN GROUP(ORDER BY queue_s),2) AS p99_s,
-       MAX(queue_s) AS max_s,
-       AVG(queue_s) AS avg_s,
-       stddev(queue_s) AS std_s
-FROM queries
-GROUP BY 1
-)
+(
+select q.user_id as "userid"
+     , date_trunc('hour', q.start_time) as "period"
+     , q.transaction_id as "xid"
+     , q.query_id as "query"
+     , q.query_text::char(50) as "querytxt"
+     , q.queue_time / 1000000.00 as "queue_s"
+     , q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
+     , case when q.status = 'failed' then 1 else 0 end "aborted"
+     , q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
+FROM sys_query_history q
+WHERE q.user_id > 1
+  AND q.start_time >= {{START_TIME}}
+  AND q.start_time <= {{END_TIME}}
+  AND q.query_text LIKE '%replay_start%'
+  AND q.status != 'failed'
+),
+elapsed_time AS
+(
+SELECT 'Query Latency' AS measure_type,
+       COUNT(*) AS query_count,
+       ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p25_s,
+       ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p50_s,
+       ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p75_s,
+       ROUND(PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p90_s,
+       ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p95_s,
+       ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p99_s,
+       MAX(total_elapsed_s) AS max_s,
+       AVG(total_elapsed_s) AS avg_s,
+       stddev(total_elapsed_s) AS std_s
+FROM queries
+GROUP BY 1
+),
+exec_time AS
+(
+SELECT 'Execution Time' AS measure_type,
+       COUNT(*) AS query_count,
+       ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p25_s,
+       ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p50_s,
+       ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p75_s,
+       ROUND(PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p90_s,
+       ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p95_s,
+       ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p99_s,
+       MAX(exec_time_s) AS max_s,
+       AVG(exec_time_s) AS avg_s,
+       stddev(exec_time_s) AS std_s
+FROM queries
+GROUP BY 1
+),
+queue_time AS
+(
+SELECT 'Queue Time' AS measure_type,
+       COUNT(*) AS query_count,
+       ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY queue_s), 2) AS p25_s,
+       ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY queue_s), 2) AS p50_s,
+       ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY queue_s), 2) AS p75_s,
+       ROUND(PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY queue_s), 2) AS p90_s,
+       ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY queue_s), 2) AS p95_s,
+       ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY queue_s), 2) AS p99_s,
+       MAX(queue_s) AS max_s,
+       AVG(queue_s) AS avg_s,
+       stddev(queue_s) AS std_s
+FROM queries
+GROUP BY 1
+)
 SELECT measure_type,
        query_count,
        p25_s,
105 changes: 53 additions & 52 deletions core/sql/latency_distribution.sql
@@ -1,61 +1,62 @@
 /*LatencyDistribution*/
 WITH queries AS
-(
-SELECT q.query_id
-    ,q.elapsed_time / 1000000.00 as total_elapsed_s
-FROM sys_query_history q
-WHERE q.user_id > 1
-  AND q.start_time >= {{START_TIME}}
-  AND q.start_time <= {{END_TIME}}
-  AND q.query_text LIKE '%replay_start%'
-  AND status != 'failed'
-)
-,
-pct AS
-(
-SELECT ROUND(PERCENTILE_CONT (0.98) WITHIN GROUP(ORDER BY q1.total_elapsed_s),2) AS p98_s,
-       COUNT(*) AS query_count,
-       MAX(q1.total_elapsed_s) max_s,
-       MIN(q1.total_elapsed_s) min_s,
-       MIN(CASE WHEN q1.total_elapsed_s = 0.00 THEN NULL ELSE q1.total_elapsed_s END) min_2s
-FROM queries q1
-),
-bucket_count AS
-(
-SELECT CASE
-         WHEN query_count > 100 THEN 40
-         ELSE 5
-       END AS b_count
-FROM pct
-),
-buckets AS
-(
-SELECT (min_2s +((n)*(p98_s / b_count))) AS sec_end,
-       n,
-       (min_2s +((n -1)*(p98_s / b_count))) AS sec_start
-FROM (SELECT ROW_NUMBER() OVER () n FROM pg_class LIMIT 39),
-     bucket_count,
-     pct
-WHERE sec_end <= p98_s
-UNION ALL
-SELECT min_2s AS sec_end,
-       0 AS n,
-       0.00 AS sec_start
-FROM pct
-UNION ALL
-SELECT (max_s +0.01) AS sec_end,
-       b_count AS n,
-       p98_s AS sec_start
-FROM pct,
-     bucket_count
-)
+(
+SELECT q.query_id
+     , q.elapsed_time / 1000000.00 as total_elapsed_s
+FROM sys_query_history q
+WHERE q.user_id > 1
+  AND q.start_time >= {{START_TIME}}
+  AND q.start_time <= {{END_TIME}}
+  AND q.query_text LIKE '%replay_start%'
+  AND status != 'failed'
+)
+,
+pct AS
+(
+SELECT ROUND(PERCENTILE_CONT(0.98) WITHIN GROUP (ORDER BY q1.total_elapsed_s), 2) AS p98_s,
+       COUNT(*) AS query_count,
+       MAX(q1.total_elapsed_s) max_s,
+       MIN(q1.total_elapsed_s) min_s,
+       MIN(CASE WHEN q1.total_elapsed_s = 0.00 THEN NULL ELSE q1.total_elapsed_s END) min_2s
+FROM queries q1
+),
+bucket_count AS
+(
+SELECT CASE
+         WHEN query_count > 100 THEN 40
+         ELSE 5
+       END AS b_count
+FROM pct
+),
+buckets AS
+(
+SELECT (min_2s + ((n) * (p98_s / b_count))) AS sec_end,
+       n,
+       (min_2s + ((n - 1) * (p98_s / b_count))) AS sec_start
+FROM (SELECT ROW_NUMBER() OVER () n FROM pg_class LIMIT 39),
+     bucket_count,
+     pct
+WHERE sec_end <= p98_s
+UNION ALL
+SELECT min_2s AS sec_end,
+       0 AS n,
+       0.00 AS sec_start
+FROM pct
+UNION ALL
+SELECT (max_s + 0.01) AS sec_end,
+       b_count AS n,
+       p98_s AS sec_start
+FROM pct,
+     bucket_count
+)
 SELECT sec_end,
        n,
        sec_start,
        COUNT(query_id)
 FROM buckets
-LEFT JOIN queries
-ON total_elapsed_s >= sec_start
-AND total_elapsed_s < sec_end
+LEFT JOIN queries
+     ON total_elapsed_s >= sec_start
+    AND total_elapsed_s < sec_end
 GROUP BY 1,
          2,
          3
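The buckets CTE is the densest part of this file: it builds a histogram whose equal-width buckets (40 of them, or 5 for small samples) run from the smallest nonzero latency up to p98, plus an underflow bucket starting at 0 and a catch-all bucket out past the max. A plain-Python rendering of the same edge computation (not from the repo; the inputs below are illustrative, whereas the SQL derives them from the pct and bucket_count CTEs):

def bucket_edges(p98_s: float, max_s: float, min_2s: float, query_count: int):
    b_count = 40 if query_count > 100 else 5   # bucket_count CTE
    step = p98_s / b_count
    edges = [(0.00, min_2s)]                   # underflow bucket (n = 0)
    for n in range(1, 40):                     # ROW_NUMBER() ... LIMIT 39
        start = min_2s + (n - 1) * step
        end = min_2s + n * step
        if end <= p98_s:                       # WHERE sec_end <= p98_s
            edges.append((start, end))
    edges.append((p98_s, max_s + 0.01))        # catch-all bucket (n = b_count)
    return edges

# e.g. 500 queries, p98 = 12 s, max = 95 s, smallest nonzero latency 0.05 s
print(bucket_edges(p98_s=12.0, max_s=95.0, min_2s=0.05, query_count=500))

The final LEFT JOIN then counts each query into the bucket whose half-open range [sec_start, sec_end) contains its total_elapsed_s.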