本文提供基于python notebook,根据query_log查询日志,分析系统的高频SQL、查询慢SQL、查询慢的SQL在客户端的分布情况,为下一步的优化提供依据。
查询高频SQL
因为query_log记录的SQL是完整的SQL,查询条件不同,则SQL不同的,需要将SQL中的具体参数值转化为?
因为系统中的查询条件基于都放在’'中,如is_delete = '0'
, 所以使用replaceRegexpAll函数将''
中的内容替换为?
: replaceRegexpAll(query,'\\'(.*?)\\'','?')
sql='select ip,sql,sum(total) as sumTotal,ceil(sum(elapsedTime)/sum(total)) as ts,ceil(sum(read_rows)/sum(total)) as read_rows from ('
sub_sql='''
select substr(IPv6NumToString(address),8) as ip,replaceRegexpAll(query,'\\'(.*?)\\'','?') as sql,
count() as total,sum(query_duration_ms) as elapsedTime,sum(read_rows) as read_rows
from remote('$ip','system','query_log')
where event_time>'{startTime}' and event_time<'{endTime}'
and is_initial_query=1 and query_duration_ms>{queryTime}
and type = 'QueryFinish' and query_kind='Select'
group by ip,sql order by total desc limit {topN}
'''
ips = ['172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx']
for i, ip in enumerate(ips):
sql+=sub_sql.replace('$ip',ip)
if(i<(len(ips)-1)):
sql+= 'union all'
sql+=' ) a group by ip,sql order by sumTotal desc limit {topN}'
params = {"startTime": "2021-12-17 07:00:00", "endTime": "2021-12-17 10:00:00","queryTime":0,"topN":30}
sql.format(**params)
i=0
for row in engine.execute(sql.format(**params)).fetchall():
i+=1
print('序号:{},ip:{},执行数量:{},平均耗时:{}ms,平均读取记录行数:{}------SQL:{}'.format(i,row[0],row[2],row[3],row[4],row[1]))
print('=====================================================')
查询慢SQL
可以查询SQL运行耗时大于指定时间,且包含字符串的慢SQL
sql='select ip,query_kind,event_time,query,read_rows,written_rows,result_rows,query_duration_ms,memory_usage from ('
sub_sql='''
select substr(IPv6NumToString(address),8) as ip,query_kind,event_time,query,read_rows,written_rows,result_rows,
query_duration_ms,memory_usage
from remote('$ip','system','query_log')
where event_time>'{startTime}' and event_time<'{endTime}' and is_initial_query=1 and query_duration_ms>{queryTime}
and type = 'QueryFinish' and position(query,'{query}')>0 order by event_time desc limit {topN}
'''
ips = ['172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx']
for i, ip in enumerate(ips):
sql+=sub_sql.replace('$ip',ip)
if(i<(len(ips)-1)):
sql+= 'union all'
sql+=' ) a order by event_time desc limit {topN} '
params = {"startTime": "2021-12-17 11:30:00", "endTime": "2021-12-17 12:00:00","query":" begin_time desc ","queryTime":1000,"topN":10}
i=0
for row in engine.execute(sql.format(**params)).fetchall():
i+=1
print('序号:{},执行时间:{},IP:{},read_rows:{},written_rows:{},result_rows:{},耗时:{},占用内存:{}------SQL:{}'.format(i,row[2],row[0],row[4],row[5],row[6],row[7],row[8],row[3]))
按客户端IP统计慢SQL数量
sql='select ip,count(*) as total from ('
sub_sql='''
select substr(IPv6NumToString(address),8) as ip
from remote('$ip','system','query_log')
where event_time>'{startTime}' and event_time<'{endTime}' and is_initial_query=1 and query_duration_ms>{queryTime}
and type = 'QueryFinish' and position(query,'{query}')>0 order by event_time desc limit {topN}
'''
ips = ['172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx','172.16.xx.xx']
for i, ip in enumerate(ips):
sql+=sub_sql.replace('$ip',ip)
if(i<(len(ips)-1)):
sql+= 'union all'
sql+=' ) a group by ip order by total desc '
print(sql)
params = {"startTime": "2021-12-17 11:30:00", "endTime": "2021-12-17 12:00:00","query":"","queryTime":0,"topN":10}
i=0
for row in engine.execute(sql.format(**params)).fetchall():
i+=1
print('IP:{}\tSQL数量:{}'.format(row[0],row[1]))
附录
notebook导入的依赖和函数
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sqlalchemy import create_engine, MetaData
import sqlalchemy
import time
import datetime
#设置显示格式
pd.set_option('display.max_rows',10000)
pd.set_option('display.max_columns',500)
pd.set_option('display.width',1000)
pd.set_option('display.float_format', lambda x: '{:.0f}'.format(x))
def printDF(df,pos=slice(1,2)):
for row in df.itertuples():
print(row[pos])
def printRes(sql):
for row in engine.execute(sql).fetchall():
print(row[0])
def executeSQL(sql):
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))
start =time.time()
df = pd.read_sql(sql,con=engine)
end = time.time()
print('Running time: %s Seconds'%(end-start))
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))
return df
def showCreateTable(tblName):
for col in engine.execute('show create table dbName.{}'.format(tblName)).fetchall()[0]:
print(col)
for col in engine.execute('show create table dbName_01.{}_LOCAL'.format(tblName)).fetchall()[0]:
print(col)
for col in engine.execute('show create table dbName_02.{}_LOCAL'.format(tblName)).fetchall()[0]:
print(col)
uri = 'clickhouse+native://default:@172.16.xx.xx:19000/xx'
engine =create_engine(uri,encoding='utf-8')
日期时间相关函数使用示例
date_sub(now(), interval 7 day)
toHour(now()
parseDateTimeBestEffort('2021-12-16 03:36:40')
addDays(toStartOfDay(now()),-2)
toDateTime('2021-12-16 05:10:00')