当前位置: 首页 > news >正文

Linux下GCC和C++实现统计Clickhouse数据仓库指定表中各字段的空值、空字符串或零值比例

Clickhouse SQL 统计脚本

/* 创建结果表(如果不存在) */
CREATE TABLE IF NOT EXISTS field_empty_stats (id UInt32 AUTO_INCREMENT,database String,table_name String,column_name String,column_type String,empty_count UInt64,total_rows UInt64,empty_percent Float64,run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);/* 字段空值统计查询 */
WITH tables AS (SELECT database,name AS table_nameFROM system.tables WHERE database = 'your_database'AND table_name IN ('table1', 'table2')  -- 替换为实际表名
)
SELECT t.database,t.table_name,c.name AS column_name,c.type AS column_type,CASE WHEN c.type LIKE 'String%' THEN(SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') FROM {db:Identifier}.{table:Identifier})WHEN c.type IN ('UInt8','UInt16','UInt32','UInt64','Int8','Int16','Int32','Int64','Float32','Float64','Decimal') THEN(SELECT sum(IFNULL({col:Identifier}, 0) = 0) FROM {db:Identifier}.{table:Identifier})ELSE (SELECT sum({col:Identifier} IS NULL) FROM {db:Identifier}.{table:Identifier})END AS empty_count,(SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows,ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent
FROM system.columns c
JOIN tables t ON c.database = t.database AND c.table = t.table_name
SETTINGS allow_experimental_analyzer = 1,format_template = 'SELECT \'{database}\', \'{table_name}\', \'{column_name}\', \'{column_type}\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'

C++ 控制台程序

#include <clickhouse/client.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <iomanip>
#include <sstream>using namespace clickhouse;// 获取当前时间戳字符串
std::string currentDateTime() {auto now = std::time(nullptr);auto tm = *std::localtime(&now);std::ostringstream oss;oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");return oss.str();
}int main(int argc, char* argv[]) {if (argc < 3) {std::cerr << "Usage: " << argv[0] << " <database> <table1> [table2 ...]\n";return 1;}const std::string database = argv[1];std::vector<std::string> tables;for (int i = 2; i < argc; ++i) {tables.push_back(argv[i]);}// 连接ClickhouseClientOptions opts;opts.SetHost("localhost");opts.SetPort(9000);opts.SetUser("default");opts.SetPassword("");Client client(opts);// 构建动态SQLstd::ostringstream sql;sql << "WITH tables AS ("<< "  SELECT database, name AS table_name "<< "  FROM system.tables "<< "  WHERE database = '" << database << "' "<< "    AND table_name IN (";for (size_t i = 0; i < tables.size(); ++i) {sql << "'" << tables[i] << "'";if (i < tables.size() - 1) sql << ", ";}sql << ") )"<< "INSERT INTO field_empty_stats "<< "(database, table_name, column_name, column_type, empty_count, total_rows, empty_percent) "<< "SELECT "<< "  t.database, "<< "  t.table_name, "<< "  c.name AS column_name, "<< "  c.type AS column_type, "<< "  CASE "<< "    WHEN c.type LIKE 'String%' THEN "<< "      (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') "<< "       FROM {db:Identifier}.{table:Identifier}) "<< "    WHEN c.type IN ( "<< "      'UInt8','UInt16','UInt32','UInt64', "<< "      'Int8','Int16','Int32','Int64', "<< "      'Float32','Float64','Decimal' "<< "    ) THEN "<< "      (SELECT sum(IFNULL({col:Identifier}, 0) = 0) "<< "      FROM {db:Identifier}.{table:Identifier}) "<< "    ELSE "<< "      (SELECT sum({col:Identifier} IS NULL) "<< "       FROM {db:Identifier}.{table:Identifier}) "<< "  END AS empty_count, "<< "  (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows, "<< "  ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent "<< "FROM system.columns c "<< "JOIN tables t ON c.database = t.database AND c.table = t.table_name "<< "SETTINGS "<< "  allow_experimental_analyzer = 1, "<< "  format_template = "<< "'SELECT \\'{database}\\', \\'{table_name}\\', \\'{column_name}\\', "<< "\\'{column_type}\\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'";try {// 执行查询client.Execute(sql.str());std::cout << "Successfully analyzed " << tables.size() << " tables in database: " << database << "\n";std::cout << "Results written to field_empty_stats at: " << currentDateTime() << std::endl;} catch (const std::exception& e) {std::cerr << "Error: " << e.what() << std::endl;return 1;}return 0;
}

使用说明

  1. 创建结果表
CREATE TABLE IF NOT EXISTS field_empty_stats (id UInt32 AUTO_INCREMENT,database String,table_name String,column_name String,column_type String,empty_count UInt64,total_rows UInt64,empty_percent Float64,run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
  1. 编译程序
g++ -std=c++17 -o field_analyzer field_analyzer.cpp -lclickhouse-cpp-lib
  1. 运行程序
./field_analyzer your_database table1 table2 table3

处理逻辑说明

字段类型空值判定逻辑统计内容
StringTRIM(IFNULL(column, '')) = ''空字符串
数值类型IFNULL(column, 0) = 0零值
其他类型column IS NULLNULL值

结果表结构

列名类型说明
idUInt32自增ID
databaseString数据库名称
table_nameString表名称
column_nameString字段名称
column_typeString字段类型
empty_countUInt64空值/零值/NULL记录数
total_rowsUInt64表总行数
empty_percentFloat64空值百分比(保留两位小数)
run_timeDateTime分析执行时间

注意事项

  1. 确保Clickhouse服务器运行在默认端口9000(或修改代码中的端口)
  2. 需要安装clickhouse-cpp客户端库
  3. 程序自动处理以下数据类型:
    • 字符串类型:空字符串统计
    • 数值类型:零值统计
    • 其他类型:NULL值统计
  4. 结果表使用MergeTree引擎,按分析时间排序
  5. 百分比计算保留两位小数

这个解决方案实现了:

  • 动态分析指定数据库的多个表
  • 自动识别字段类型并应用相应的空值逻辑
  • 结果写入包含自增ID和时间戳的统计表
  • 命令行参数指定数据库和表名
  • 完整的错误处理机制
http://www.xdnf.cn/news/930745.html

相关文章:

  • Go基本语法——go语言中的四种变量定义方法
  • 【Java学习笔记】BigInteger 和 BigDecimal 类
  • 【Go语言基础【19】】接口:灵活实现多态的核心机制
  • 基于RT-DETR算法的夜间交通车辆与行人目标检测
  • FPGA静态功耗
  • 2025年与2030年AI及AI智能体 (Agent) 市场份额分析报告
  • 网络之交换机
  • 【题解-洛谷】B3626 跳跃机器人
  • JavaWeb——登录(14/16):登录校验-Interceptor-详解(使用细节、拦截路径的配置、匹配规则、执行流程、拦截器与过滤器的区别)
  • 【华为云Astro 轻应用】组装“待处理工单”页面示例
  • C语言基础面试问答
  • 【人工智能 | 项目开发】Python Flask实现本地AI大模型可视化界面
  • 苍穹外卖-day01
  • 用 DeepSeek 高效完成数据分析与挖掘
  • Bootstrap Table开源的企业级数据表格集成
  • 大数据学习(133)-Hive数据分析2
  • 论文笔记:Large Language Models for Next Point-of-Interest Recommendation
  • 云原生监控体系建设:Prometheus+Grafana的企业级实践
  • 作为点的对象CenterNet论文阅读
  • 【论文阅读30】Bi-LSTM(2024)
  • Spring Boot + Flink + FlinkCDC 实现 MySQL 同步到 MySQL
  • 【Java学习笔记】Arrays类
  • 视频音频去掉开头结尾 视频去掉前n秒后n秒 电视剧去掉开头歌曲
  • 408第一季 - 数据结构 - 图
  • 数据结构排序
  • AU音频软件|Audition 2025网盘下载与安装教程指南
  • AURA智能助手在物联网(IoT)和数字化改造领域的使用
  • Linux运维新人自用笔记(乌班图apt命令和dpkg命令、两系统指令区别,rpm解决路径依赖、免安装配置java环境)
  • 机器学习用于算法交易(Matlab实现)
  • 在VSCode中使用Ultralytics扩展