Clickhouse SQL 统计脚本
CREATE TABLE IF NOT EXISTS field_empty_stats (id UInt32 AUTO_INCREMENT,database String,table_name String,column_name String,column_type String,empty_count UInt64,total_rows UInt64,empty_percent Float64,run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
WITH tables AS (SELECT database,name AS table_nameFROM system.tables WHERE database = 'your_database'AND table_name IN ('table1', 'table2')
)
SELECT t.database,t.table_name,c.name AS column_name,c.type AS column_type,CASE WHEN c.type LIKE 'String%' THEN(SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') FROM {db:Identifier}.{table:Identifier})WHEN c.type IN ('UInt8','UInt16','UInt32','UInt64','Int8','Int16','Int32','Int64','Float32','Float64','Decimal') THEN(SELECT sum(IFNULL({col:Identifier}, 0) = 0) FROM {db:Identifier}.{table:Identifier})ELSE (SELECT sum({col:Identifier} IS NULL) FROM {db:Identifier}.{table:Identifier})END AS empty_count,(SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows,ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent
FROM system.columns c
JOIN tables t ON c.database = t.database AND c.table = t.table_name
SETTINGS allow_experimental_analyzer = 1,format_template = 'SELECT \'{database}\', \'{table_name}\', \'{column_name}\', \'{column_type}\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'
C++ 控制台程序
#include <clickhouse/client.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <iomanip>
#include <sstream>using namespace clickhouse;
std::string currentDateTime() {auto now = std::time(nullptr);auto tm = *std::localtime(&now);std::ostringstream oss;oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");return oss.str();
}int main(int argc, char* argv[]) {if (argc < 3) {std::cerr << "Usage: " << argv[0] << " <database> <table1> [table2 ...]\n";return 1;}const std::string database = argv[1];std::vector<std::string> tables;for (int i = 2; i < argc; ++i) {tables.push_back(argv[i]);}ClientOptions opts;opts.SetHost("localhost");opts.SetPort(9000);opts.SetUser("default");opts.SetPassword("");Client client(opts);std::ostringstream sql;sql << "WITH tables AS ("<< " SELECT database, name AS table_name "<< " FROM system.tables "<< " WHERE database = '" << database << "' "<< " AND table_name IN (";for (size_t i = 0; i < tables.size(); ++i) {sql << "'" << tables[i] << "'";if (i < tables.size() - 1) sql << ", ";}sql << ") )"<< "INSERT INTO field_empty_stats "<< "(database, table_name, column_name, column_type, empty_count, total_rows, empty_percent) "<< "SELECT "<< " t.database, "<< " t.table_name, "<< " c.name AS column_name, "<< " c.type AS column_type, "<< " CASE "<< " WHEN c.type LIKE 'String%' THEN "<< " (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') "<< " FROM {db:Identifier}.{table:Identifier}) "<< " WHEN c.type IN ( "<< " 'UInt8','UInt16','UInt32','UInt64', "<< " 'Int8','Int16','Int32','Int64', "<< " 'Float32','Float64','Decimal' "<< " ) THEN "<< " (SELECT sum(IFNULL({col:Identifier}, 0) = 0) "<< " FROM {db:Identifier}.{table:Identifier}) "<< " ELSE "<< " (SELECT sum({col:Identifier} IS NULL) "<< " FROM {db:Identifier}.{table:Identifier}) "<< " END AS empty_count, "<< " (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows, "<< " ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent "<< "FROM system.columns c "<< "JOIN tables t ON c.database = t.database AND c.table = t.table_name "<< "SETTINGS "<< " allow_experimental_analyzer = 1, "<< " format_template = "<< "'SELECT \\'{database}\\', \\'{table_name}\\', \\'{column_name}\\', "<< "\\'{column_type}\\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'";try {client.Execute(sql.str());std::cout << "Successfully analyzed " << tables.size() << " tables in database: " << database << "\n";std::cout << "Results written to field_empty_stats at: " << currentDateTime() << std::endl;} catch (const std::exception& e) {std::cerr << "Error: " << e.what() << std::endl;return 1;}return 0;
}
使用说明
- 创建结果表:
CREATE TABLE IF NOT EXISTS field_empty_stats (id UInt32 AUTO_INCREMENT,database String,table_name String,column_name String,column_type String,empty_count UInt64,total_rows UInt64,empty_percent Float64,run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
- 编译程序:
g++ -std=c++17 -o field_analyzer field_analyzer.cpp -lclickhouse-cpp-lib
- 运行程序:
./field_analyzer your_database table1 table2 table3
处理逻辑说明
字段类型 | 空值判定逻辑 | 统计内容 |
---|
String | TRIM(IFNULL(column, '')) = '' | 空字符串 |
数值类型 | IFNULL(column, 0) = 0 | 零值 |
其他类型 | column IS NULL | NULL值 |
结果表结构
列名 | 类型 | 说明 |
---|
id | UInt32 | 自增ID |
database | String | 数据库名称 |
table_name | String | 表名称 |
column_name | String | 字段名称 |
column_type | String | 字段类型 |
empty_count | UInt64 | 空值/零值/NULL记录数 |
total_rows | UInt64 | 表总行数 |
empty_percent | Float64 | 空值百分比(保留两位小数) |
run_time | DateTime | 分析执行时间 |
注意事项
- 确保Clickhouse服务器运行在默认端口9000(或修改代码中的端口)
- 需要安装clickhouse-cpp客户端库
- 程序自动处理以下数据类型:
- 字符串类型:空字符串统计
- 数值类型:零值统计
- 其他类型:NULL值统计
- 结果表使用MergeTree引擎,按分析时间排序
- 百分比计算保留两位小数
这个解决方案实现了:
- 动态分析指定数据库的多个表
- 自动识别字段类型并应用相应的空值逻辑
- 结果写入包含自增ID和时间戳的统计表
- 命令行参数指定数据库和表名
- 完整的错误处理机制