当前位置：首页 > web > 正文

CASIA-HWDB的gnt转换为png图片

web 2025/8/12 20:28:38

手写汉字数据集简介

HWDB（Handwritten Chinese Character Database）：
- 由中科院自动化所（CASIA）发布，包含离线手写汉字图像（如HWDB1.0-1.2），涵盖约7,000个汉字类别，每个字符由不同书写者多次书写。
OLHWDB（Online Handwritten Chinese Character Database）：
- 在线手写数据，记录笔尖的运动轨迹（坐标序列和笔画顺序），如OLHWDB1.0-1.2，类别与HWDB一致。

网站地址：

https://nlpr.ia.ac.cn/databases/handwriting/Download.html

gnt转换为png图片

import os
import struct
import numpy as np
from PIL import Image
import glob
import re


def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in file names removed.

    The characters stripped are backslash, slash, asterisk, question mark,
    colon, double quote, angle brackets, pipe and the NUL character.
    """
    return re.sub(r'[\\/*?:"<>|\x00]', '', filename)


# Fixed-size record header read by gnt_to_png: little-endian uint32 sample
# size, 2-byte tag, uint16 width, uint16 height (10 bytes, no padding).
_GNT_HEADER = struct.Struct('<I2sHH')


def gnt_to_png(input_path, output_dir):
    """Convert every sample stored in a ``.gnt`` file into a PNG image.

    Each record is parsed as a 10-byte header (sample size, 2-byte tag,
    width, height) followed by ``width * height`` bytes of 8-bit grayscale
    pixels. Images are written to *output_dir* as
    ``<source name>_<tag>_<index>.png``.

    Args:
        input_path: Path of the ``.gnt`` file to read.
        output_dir: Directory that receives the PNG files; created if missing.

    Returns:
        The number of PNG files that were written successfully.

    Processing is best-effort: a truncated record or an unexpected error
    stops the conversion of this file after printing a message, and a
    sample whose file cannot be saved is skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The source file name is the same for every sample, so sanitize it once.
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    safe_base = sanitize_filename(base_name)

    index = 0  # counter that keeps output file names unique
    with open(input_path, 'rb') as f:
        while True:
            try:
                header = f.read(_GNT_HEADER.size)
                if not header:
                    break  # clean end of file
                if len(header) < _GNT_HEADER.size:
                    print(f"文件截断: {input_path} (样本头不完整)")
                    break

                # The stored sample size is not needed: the payload length
                # is derived from width and height below.
                _sample_size, tag_bytes, width, height = _GNT_HEADER.unpack(header)

                # Decode the label and drop NUL padding bytes.
                tag = tag_bytes.decode('GBK', errors='replace').replace('\x00', '')

                pixel_count = width * height
                pixel_data = f.read(pixel_count)
                if len(pixel_data) != pixel_count:
                    print(f"文件截断: {input_path} (像素数据不完整)")
                    break

                # Rows are stored one after another, hence (height, width).
                image = np.frombuffer(pixel_data, dtype=np.uint8).reshape(height, width)
                img = Image.fromarray(image, mode='L')

                safe_tag = sanitize_filename(tag)
                output_path = os.path.join(
                    output_dir, f"{safe_base}_{safe_tag}_{index}.png"
                )

                # A failed save skips this sample without consuming an index.
                try:
                    img.save(output_path)
                    index += 1
                except OSError as e:
                    print(f"跳过非法文件名: {output_path} (原因: {e})")
            except Exception as e:
                # Best-effort boundary: report and stop processing this file.
                print(f"处理失败: {input_path} (错误: {e})")
                break
    return index


# Batch-process every .gnt file in the input directory
# Example locations; point these at the folder holding the .gnt files and
# at the folder that should receive the PNG images.
input_dir = r"E:\code_python\1"
output_dir = r"E:\code_python\2"

# Convert each .gnt file found directly inside input_dir.
for gnt_file in glob.glob(os.path.join(input_dir, "*.gnt")):
    gnt_to_png(gnt_file, output_dir)