CASIA-HWDB的gnt转换为png图片
手写汉字数据集简介
- HWDB(Handwritten Chinese Character Database):
  由中科院自动化所(CASIA)发布,包含离线手写汉字图像(如HWDB1.0-1.2),涵盖约7,000个汉字类别,每个字符由不同书写者多次书写。
- OLHWDB(Online Handwritten Chinese Character Database):
  在线手写数据,记录笔尖的运动轨迹(坐标序列和笔画顺序),如OLHWDB1.0-1.2,类别与HWDB一致。

网站地址:
https://nlpr.ia.ac.cn/databases/handwriting/Download.html
gnt转换为png图片
import os
import struct
import numpy as np
from PIL import Image
import glob
import re

# Characters that are removed from generated file names: path separators,
# wildcard/reserved punctuation and the NUL character.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|\x00]')

# Fixed-size record header, little-endian: uint32 sample size, 2-byte tag
# code, uint16 width, uint16 height.
_GNT_HEADER = struct.Struct('<I2sHH')


def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in file names removed."""
    return _ILLEGAL_FILENAME_CHARS.sub('', filename)


def gnt_to_png(input_path, output_dir):
    """Convert every sample stored in a .gnt file into a grayscale PNG image.

    Each record is read as a 10-byte header (4-byte sample size, 2-byte tag
    decoded as GBK, 2-byte width, 2-byte height) followed by
    ``width * height`` bytes of 8-bit pixel data. Images are written to
    ``output_dir`` as ``<gnt base name>_<tag>_<index>.png``.

    Args:
        input_path: Path of the .gnt file to read.
        output_dir: Directory that receives the PNG files; created if missing.

    Processing is best effort: a truncated or unreadable record is reported
    on stdout and ends the conversion of this file, while a failed save only
    skips that single image.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The base name does not change between records, so compute it once.
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    safe_base = sanitize_filename(base_name)

    index = 0  # counter that keeps output file names unique
    with open(input_path, 'rb') as f:
        while True:
            try:
                header = f.read(_GNT_HEADER.size)
                if not header:
                    break  # clean end of file
                if len(header) < _GNT_HEADER.size:
                    print(f"处理失败: {input_path} (错误: 文件头不完整)")
                    break

                # The stored sample size is not needed: the bitmap length
                # is derived from width * height.
                _sample_size, tag_bytes, width, height = _GNT_HEADER.unpack(header)

                # Single-byte tags are padded with NUL, which must not end
                # up in the label.
                tag = tag_bytes.decode('GBK', errors='replace').replace('\x00', '')

                pixel_count = width * height
                pixel_data = f.read(pixel_count)
                if len(pixel_data) != pixel_count:
                    print(f"处理失败: {input_path} (错误: 像素数据不完整)")
                    break

                # A 2-D uint8 array is interpreted as 8-bit grayscale ('L').
                image = np.frombuffer(pixel_data, dtype=np.uint8).reshape(height, width)
                img = Image.fromarray(image)

                safe_tag = sanitize_filename(tag)
                output_path = os.path.join(
                    output_dir, f"{safe_base}_{safe_tag}_{index}.png"
                )

                try:
                    img.save(output_path)
                except OSError as e:
                    # Skip only this image; the remaining records are still
                    # converted.
                    print(f"跳过非法文件名: {output_path} (原因: {e})")
                else:
                    index += 1
            except Exception as e:
                # Per-file boundary: report the failure and stop reading
                # this file, since the stream position is no longer reliable.
                print(f"处理失败: {input_path} (错误: {e})")
                break


# Batch-process every .gnt file in the input directory
# Directory that is scanned (non-recursively) for *.gnt files.
input_dir = r"E:\code_python\1"
# Directory that receives the generated PNG images.
output_dir = r"E:\code_python\2"

for gnt_file in glob.glob(os.path.join(input_dir, "*.gnt")):
    gnt_to_png(gnt_file, output_dir)