当前位置：首页 > news >正文

数据库存储大量的json文件怎么样高效的读取和分页，利用文件缓存办法不占用内存

news 2025/9/6 8:03:09

利用文件缓存的优势：

内存友好：使用文件流处理，不将整个大文件加载到内存
可扩展性：支持超大型JSON文件（GB级别）
缓存效率：文件缓存可以持久化，服务器重启后仍然有效
并发安全：每个分页请求生成独立的缓存文件
灵活性：可以根据需要调整缓存策略和分页大小

首先假设我的数据库里面有大量的json文档，我们现在把它们导出来，以json格式写入到一个文件内

const fs=require('fs');
const ClientDB=require('../db/db');
const db=new ClientDB('employees','mylog');
const path = require('path');

/**
 * Lazily yields the text chunks of a JSON array built from `docs`, so the
 * whole serialized array never has to exist as one string.
 *
 * @param {Iterable<object>} docs Documents to serialize.
 * @returns {Generator<string>} `[`, the objects separated by `,\n`, then `]`.
 */
function* serializeDocs(docs) {
  yield '[';
  let isFirst = true;
  for (const doc of docs) {
    if (!isFirst) {
      yield ',\n';
    }
    // Only the exported fields are copied into the output object.
    // NOTE(review): `doc.i` is kept exactly as in the original; it looks like
    // it may have been meant to be `doc.id` or `doc._id` — confirm against the
    // document schema.
    yield JSON.stringify({
      id: doc.i,
      username: doc.username,
      age: doc.age,
      createAt: doc.created,
    });
    isFirst = false;
  }
  yield ']';
}

/**
 * Exports the documents returned by `db.find({})` into a randomly named JSON
 * file under `../public` (relative to this script), written as one JSON array.
 *
 * The returned promise settles only after the file has been fully written:
 * `pipeline` honours write-stream backpressure and rejects on stream errors,
 * so the caller never receives the name of a half-written file.
 *
 * The database connection is always closed via `db.disconnect()`.
 *
 * @returns {Promise<string|undefined>} The temporary file name (not the JSON
 *   data), or `undefined` when `db.find` returns `false`.
 */
async function testBigJson() {
  // Required locally so this block does not rely on new file-level imports.
  const { Readable } = require('stream');
  const { pipeline } = require('stream/promises');

  try {
    const docs = await db.find({});
    if (docs === false) {
      return undefined;
    }

    // Temporary file name and its absolute location.
    const outputFile = getRandomFileName();
    const outfile = path.join(__dirname, '..', 'public', outputFile);

    await pipeline(
      Readable.from(serializeDocs(docs)),
      fs.createWriteStream(outfile),
    );

    // Return the temporary file name, not the JSON data.
    return outputFile;
  } finally {
    await db.disconnect();
  }
}
// Generates a random temporary JSON file name.
/**
 * Builds a random name for a temporary JSON export file.
 *
 * @returns {string} A name of the form `output_<suffix>.json`, where the
 *   suffix is up to seven base-36 characters derived from `Math.random()`
 *   (not cryptographically secure).
 */
function getRandomFileName() {
  const suffix = Math.random().toString(36).substring(2, 9);
  return `output_${suffix}.json`;
}

// Run the export. The callback must be passed to `then`; the original
// `.then(console.log())` invoked `console.log` immediately and handed
// `undefined` to `then`, so the file name was never printed.
testBigJson()
  .then((fileName) => console.log(fileName))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

第2步使用按字节偏移量的定位读取（fs.read）加缓冲区操作，并为每条记录建立位置索引，这才是真正高效的文件读取方案（注意：下面的代码并没有用到真正的内存映射 mmap）

const fs=require('fs');
const {promisify}=require('util');
const open=promisify(fs.open);
const read=promisify(fs.read);
const close=promisify(fs.close);
const stat = promisify(fs.stat);

/**
 * Paginates a file that contains one top-level JSON array of objects without
 * loading the whole file: a byte-offset index of every top-level object is
 * built once (and persisted next to the data file as `<file>.index`), after
 * which a page is served by reading only the byte ranges of its items.
 *
 * Usage: `await p.init()`, any number of `paginateEfficiently` calls, then
 * `await p.close()`.
 */
class HighJsonPaginator {
  /**
   * @param {string} filePath Path of the JSON file.
   */
  constructor(filePath) {
    this.filePath = filePath;
    this.fd = null;
    this.fileSize = 0;
    this.fileMtimeMs = 0;
    this.buffer = Buffer.alloc(64 * 1024); // 64KB scan buffer
  }

  /**
   * Opens the file and records its size and modification time.
   *
   * @returns {Promise<void>}
   */
  async init() {
    this.fd = await open(this.filePath, 'r');
    try {
      const stats = await stat(this.filePath);
      this.fileSize = stats.size;
      this.fileMtimeMs = stats.mtimeMs;
    } catch (error) {
      // Do not leak the descriptor when the file cannot be inspected.
      await this.close();
      throw error;
    }
  }

  /**
   * Closes the file descriptor if one is open. Safe to call repeatedly.
   *
   * @returns {Promise<void>}
   */
  async close() {
    // Compare against null: 0 is a valid descriptor value.
    if (this.fd !== null) {
      const fd = this.fd;
      this.fd = null;
      await close(fd);
    }
  }

  /**
   * Locates the byte range between the array's opening `[` and closing `]`.
   *
   * Only the first and last 1024 bytes of the file are inspected. The closing
   * bracket is the LAST `]` of the tail, so brackets inside the data (nested
   * arrays, string values) are not mistaken for the end of the array.
   *
   * @returns {Promise<{dataStart: number, dataEnd: number}>} `dataStart` is
   *   the offset just after `[`; `dataEnd` is the offset of `]` (exclusive
   *   end of the data).
   * @throws {Error} When the brackets cannot be found.
   */
  async findArrayBounds() {
    const probeSize = 1024;
    if (this.fileSize === 0) {
      throw new Error(`JSON array brackets not found in ${this.filePath}`);
    }

    // Clamp both probes so files smaller than the probe size work too.
    const headLength = Math.min(probeSize, this.fileSize);
    const headBuffer = Buffer.alloc(headLength);
    const head = await read(this.fd, headBuffer, 0, headLength, 0);
    const startIndex = headBuffer.subarray(0, head.bytesRead).indexOf('[');

    const tailPosition = Math.max(0, this.fileSize - probeSize);
    const tailLength = this.fileSize - tailPosition;
    const tailBuffer = Buffer.alloc(tailLength);
    const tail = await read(this.fd, tailBuffer, 0, tailLength, tailPosition);
    const endIndex = tailBuffer.subarray(0, tail.bytesRead).lastIndexOf(']');

    const dataStart = startIndex + 1; // skip '['
    const dataEnd = tailPosition + endIndex; // offset of ']'
    if (startIndex === -1 || endIndex === -1 || dataEnd < dataStart) {
      throw new Error(`JSON array brackets not found in ${this.filePath}`);
    }
    return { dataStart, dataEnd };
  }

  /**
   * Returns one page of parsed items.
   *
   * @param {number} [page=1] 1-based page number.
   * @param {number} [pageSize=10] Items per page.
   * @returns {Promise<{data: object[], pagination: {page: number,
   *   pageSize: number, total: number, totalPages: number}}>} Items that fail
   *   to parse are skipped. A page past the end yields `data: []`.
   * @throws {RangeError} When `page` or `pageSize` is not a positive integer.
   */
  async paginateEfficiently(page = 1, pageSize = 10) {
    const pageNumber = Number(page);
    const size = Number(pageSize);
    if (!Number.isInteger(pageNumber) || pageNumber < 1
      || !Number.isInteger(size) || size < 1) {
      throw new RangeError('page and pageSize must be positive integers');
    }

    const bounds = await this.findArrayBounds();
    // Create the index if it does not exist yet.
    const index = await this.createOrLoadIndex(bounds);
    const total = index.positions.length;
    const pagination = {
      page,
      pageSize,
      total,
      totalPages: Math.ceil(total / size),
    };

    const startItem = (pageNumber - 1) * size;
    if (startItem >= total) {
      return { data: [], pagination };
    }

    const endItem = Math.min(startItem + size, total);
    const items = [];
    for (let i = startItem; i < endItem; i++) {
      // Each index entry is the byte range of one `{...}` object.
      const { start, end } = index.positions[i];
      const item = await this.readJsonItem(start, end);
      if (item) {
        items.push(item);
      }
    }
    return { data: items, pagination };
  }

  /**
   * Loads the persisted index, or builds and saves a new one.
   *
   * A persisted index is reused only when it was built for the data file's
   * current size and modification time; otherwise its byte offsets could
   * point into different content, so it is rebuilt.
   *
   * @param {{dataStart: number, dataEnd: number}} bounds Result of
   *   `findArrayBounds`.
   * @returns {Promise<{positions: Array<{start: number, end: number}>}>}
   */
  async createOrLoadIndex(bounds) {
    const indexFile = this.filePath + '.index';
    const existing = await this.loadIndex(indexFile);
    if (existing !== null) {
      return existing;
    }
    const index = await this.buildIndex(bounds);
    await fs.promises.writeFile(indexFile, JSON.stringify(index));
    return index;
  }

  /**
   * Reads a persisted index file.
   *
   * @param {string} indexFile Path of the index file.
   * @returns {Promise<object|null>} The index, or `null` when it is missing,
   *   unreadable, malformed, or was built for a different file state.
   */
  async loadIndex(indexFile) {
    let parsed;
    try {
      parsed = JSON.parse(await fs.promises.readFile(indexFile, 'utf8'));
    } catch (error) {
      // A missing or corrupt index is not fatal: fall back to rebuilding.
      return null;
    }
    const isCurrent = parsed !== null
      && typeof parsed === 'object'
      && Array.isArray(parsed.positions)
      && parsed.fileSize === this.fileSize
      && parsed.mtimeMs === this.fileMtimeMs;
    return isCurrent ? parsed : null;
  }

  /**
   * Scans the data range and records the byte range of every top-level object.
   *
   * The scan works on raw bytes: JSON structural characters are ASCII, and
   * every byte of a multi-byte UTF-8 sequence is >= 0x80, so they can never
   * be confused. Braces inside string values are ignored by tracking string
   * and backslash-escape state, which is carried across chunk boundaries.
   *
   * @param {{dataStart: number, dataEnd: number}} bounds Range to scan.
   * @returns {Promise<{fileSize: number, mtimeMs: number,
   *   positions: Array<{start: number, end: number}>}>}
   */
  async buildIndex(bounds) {
    const QUOTE = 0x22; // "
    const BACKSLASH = 0x5c; // \
    const OPEN_BRACE = 0x7b; // {
    const CLOSE_BRACE = 0x7d; // }

    const index = {
      fileSize: this.fileSize,
      mtimeMs: this.fileMtimeMs,
      positions: [],
    };
    let position = bounds.dataStart;
    let depth = 0; // nesting depth of `{}` outside strings
    let itemStart = position; // offset of the current top-level object
    let inString = false;
    let escaped = false;

    while (position < bounds.dataEnd) {
      const bytesToRead = Math.min(this.buffer.length, bounds.dataEnd - position);
      const { bytesRead } = await read(this.fd, this.buffer, 0, bytesToRead, position);
      if (bytesRead === 0) {
        // The file ended earlier than expected; stop instead of spinning.
        break;
      }
      for (let i = 0; i < bytesRead; i++) {
        const byte = this.buffer[i];
        if (inString) {
          if (escaped) {
            escaped = false;
          } else if (byte === BACKSLASH) {
            escaped = true;
          } else if (byte === QUOTE) {
            inString = false;
          }
        } else if (byte === QUOTE) {
          inString = true;
        } else if (byte === OPEN_BRACE) {
          if (depth === 0) {
            itemStart = position + i;
          }
          depth++;
        } else if (byte === CLOSE_BRACE && depth > 0) {
          depth--;
          if (depth === 0) {
            index.positions.push({ start: itemStart, end: position + i + 1 });
          }
        }
      }
      // Advance the file offset by what was actually read, so the next read
      // continues where this chunk ended.
      position += bytesRead;
    }
    return index;
  }

  /**
   * Reads and parses the single JSON item stored in bytes `[start, end)`.
   *
   * @param {number} start Offset of the first byte.
   * @param {number} end Offset just past the last byte.
   * @returns {Promise<object|null>} The parsed value, or `null` (after
   *   logging) when the bytes are not valid JSON.
   */
  async readJsonItem(start, end) {
    const length = end - start;
    const buffer = Buffer.alloc(length);
    await read(this.fd, buffer, 0, length, start);
    try {
      return JSON.parse(buffer.toString('utf-8'));
    } catch (error) {
      console.error('解析json失败：', error);
      return null;
    }
  }
}

module.exports = HighJsonPaginator;

下面就是测试，比用 readline=require('readline') 按行读取json文件快10-100倍，读取1000个文档大约在30毫秒左右（具体取决于你用的是什么硬盘），读100个以下几乎就是几毫秒

const HighJsonPaginator = require('./highJsonPaginator');

/**
 * Times one paginated read (page 1, 30 items) of the test file, including
 * opening and closing the file, then prints the page.
 *
 * The paginator is closed in `finally`, so the file descriptor is released
 * even when `init` or the read fails.
 *
 * @returns {Promise<void>}
 */
async function benchmark() {
  // Resolved against the process working directory, not this script's folder.
  const testFile = '../public/output_vzqfri8.json';

  console.time('高效二进制读取');
  const efficientPaginator = new HighJsonPaginator(testFile);
  let result;
  try {
    await efficientPaginator.init();
    result = await efficientPaginator.paginateEfficiently(1, 30);
  } finally {
    await efficientPaginator.close();
  }
  console.timeEnd('高效二进制读取');
  console.log(result);
}

// Report failures instead of leaving the promise rejection unhandled.
benchmark().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

到这里就结束了，如果你前端要调用读取分页可以直接用第2步的HighJsonPaginator{}类，
如果你想要再中间写一个缓存也是可以的，下面的方案只是作为参考

首先创建一个文件映射，隐藏真实的json文件名
我这里是直接写进fileMap中，你在用的时候可以把这个文件导入第1个代码中，把返回的临时文件名用set写入这个fileMap中

// config/files-map.js

// Registry keyed by the real file name. Each entry carries the random public
// ID, a display name, the path on disk and an access flag.
const fileMap = new Map([
  [
    'output_vzqfri8.json',
    {
      id: 'a1b2c3d4e5',
      name: 'large-data.json',
      path: './data/output_vzqfri8.json',
      accessible: true,
    },
  ],
  [
    'sensitive-data.json',
    {
      id: 'f6g7h8i9j0',
      name: 'sensitive-data.json',
      path: './data/sensitive-data.json',
      accessible: false, // access restricted
    },
  ],
]);
// Look up file info by ID.
/**
 * Finds the registry entry whose public `id` equals `fileId`.
 *
 * Entries are checked in the map's insertion order and the first match wins.
 *
 * @param {string} fileId Public identifier to look for.
 * @returns {object|null} The matching entry, or `null` when none matches.
 */
function getFileById(fileId) {
  const match = Array.from(fileMap.values()).find((entry) => entry.id === fileId);
  return match === undefined ? null : match;
}
// Look up file info by name.
/**
 * Returns the registry entry stored under the real file name.
 *
 * @param {string} filename Key used when the entry was added to `fileMap`.
 * @returns {object|undefined} The entry, or `undefined` when the name is not
 *   registered (unlike `getFileById`, which returns `null`).
 */
function getFileByName(filename) {
  const entry = fileMap.get(filename);
  return entry;
}

module.exports = { fileMap, getFileById, getFileByName };

第2步是供前端调用的后端路由（Express），这里只写了个大概

const { getFileById } = require('../config/files-map');

/**
 * Parses a query-string value as a base-10 integer that is at least 1.
 *
 * @param {*} value Raw query value.
 * @param {number} fallback Value used when `value` is not numeric. Without
 *   this, `Math.max(1, NaN)` would propagate `NaN` into the pagination.
 * @returns {number} The parsed integer clamped to >= 1, or `fallback`.
 */
function toPositiveInt(value, fallback) {
  const parsed = Number.parseInt(value, 10);
  return Number.isNaN(parsed) ? fallback : Math.max(1, parsed);
}

/**
 * GET /api/optimized-data/:fileId?page=&pageSize=
 *
 * Serves one page of a registered file addressed by its public ID.
 * Responds 404 for an unknown ID, 403 when the entry is not accessible, and
 * 500 on any processing error. `page` defaults to 1; `pageSize` defaults to
 * 20 and is capped at 200.
 *
 * Example: /api/optimized-data/a1b2c3d4e5?page=2&pageSize=50
 * The client only ever sees the ID, never the real file name.
 */
router.get('/api/optimized-data/:fileId', async (req, res) => {
  try {
    const { fileId } = req.params;
    const { page = '1', pageSize = '20' } = req.query;

    // Resolve the public ID to the registry entry.
    const fileInfo = getFileById(fileId);
    if (!fileInfo) {
      return res.status(404).json({ error: '文件不存在或无权访问' });
    }
    if (!fileInfo.accessible) {
      return res.status(403).json({ error: '无权访问该文件' });
    }

    const pageNum = toPositiveInt(page, 1);
    const pageSizeNum = Math.min(toPositiveInt(pageSize, 20), 200);
    // Not consumed yet in this sketch; kept for the caching step.
    const cacheKey = `data:${fileId}:${pageNum}:${pageSizeNum}`;

    // ... remaining processing logic
    const result = await processFile(fileInfo.path, pageNum, pageSizeNum);
    res.json(result);
  } catch (error) {
    console.error('文件处理错误:', error);
    res.status(500).json({ error: '内部服务器错误' });
  }
});

或者用固定文件名，或jwt令牌也可以的

// Fixed public keys are exposed instead of the real file names.
const allowedFiles = {
  'dataset-1': './data/large-data.json',
  'report-2024': './data/sensitive-report.json',
  'user-stats': './data/user-statistics.json',
};

/**
 * GET /api/optimized-data/:fileKey
 *
 * Resolves a fixed public key to its file path. Responds 404 for an unknown
 * key and 500 on any processing error.
 *
 * Example: /api/optimized-data/dataset-1?page=2&pageSize=50
 * The client sees `dataset-1`, not `large-data.json`.
 */
router.get('/api/optimized-data/:fileKey', async (req, res) => {
  try {
    const { fileKey } = req.params;

    // Own-property check: a plain `allowedFiles[fileKey]` lookup would also
    // accept inherited keys such as `constructor` or `__proto__`, which come
    // straight from the URL.
    if (!Object.prototype.hasOwnProperty.call(allowedFiles, fileKey)) {
      return res.status(404).json({ error: '文件不存在' });
    }
    const filePath = allowedFiles[fileKey];

    // ... processing logic (must send the response for the success path)
  } catch (error) {
    // Always answer the request; an empty handler would leave it hanging.
    console.error('文件处理错误:', error);
    res.status(500).json({ error: '内部服务器错误' });
  }
});

下面是文件缓存管理

const crypto = require('crypto');
const fs = require('fs').promises;
const path = require('path');

/**
 * File-backed cache: each key maps to `<cacheDir>/<sha256(key)>.cache`, which
 * stores the value as JSON. An entry is considered fresh while the file's
 * modification time is within the TTL.
 */
class OptimizedFileCache {
  /**
   * @param {string} [cacheDir='./cache'] Directory holding the cache files.
   * @param {number} [ttlMs=3600000] Entry lifetime in milliseconds (1 hour).
   */
  constructor(cacheDir = './cache', ttlMs = 3600000) {
    this.cacheDir = cacheDir;
    this.ttlMs = ttlMs;
    this.indexCache = new Map(); // in-memory index cache (not used by this class's methods)
  }

  /**
   * Returns the cached value for `key`, or generates, stores and returns it.
   *
   * @param {string} key Cache key.
   * @param {function(): (Promise<*>|*)} generator Produces the value on a miss.
   * @returns {Promise<*>} The cached or freshly generated value.
   */
  async getCache(key, generator) {
    const cachePath = this.getCachePath(key);

    // Use the explicit hit flag: testing the value's truthiness would treat
    // cached `0`, `false`, `''` or `null` as a miss and regenerate forever.
    const entry = await this.readCacheEntry(cachePath);
    if (entry.hit) {
      return entry.data;
    }

    // Generate new data and cache it.
    const data = await generator();
    await this.writeCache(cachePath, data);
    return data;
  }

  /**
   * Maps a key to its cache file path.
   *
   * @param {string} key Cache key.
   * @returns {string} `<cacheDir>/<sha256 hex of key>.cache`.
   */
  getCachePath(key) {
    const hash = crypto.createHash('sha256').update(key).digest('hex');
    return path.join(this.cacheDir, `${hash}.cache`);
  }

  /**
   * Reads a cache file and reports whether it was a usable hit.
   *
   * @param {string} filePath Cache file path.
   * @returns {Promise<{hit: boolean, data: *}>} `hit` is false when the file
   *   is missing, unreadable, expired or not valid JSON.
   */
  async readCacheEntry(filePath) {
    try {
      // Check freshness first so an expired file is never read.
      const stats = await fs.stat(filePath);
      if (Date.now() - stats.mtimeMs > this.ttlMs) {
        return { hit: false, data: null };
      }
      const raw = await fs.readFile(filePath, 'utf8');
      return { hit: true, data: JSON.parse(raw) };
    } catch {
      // Best effort: any read problem is treated as a cache miss.
      return { hit: false, data: null };
    }
  }

  /**
   * Reads a cache file.
   *
   * @param {string} filePath Cache file path.
   * @returns {Promise<*>} The parsed value, or `null` on a miss. A cached
   *   `null` is indistinguishable here; use `readCacheEntry` for that.
   */
  async readCache(filePath) {
    const entry = await this.readCacheEntry(filePath);
    return entry.hit ? entry.data : null;
  }

  /**
   * Writes `data` as JSON to `filePath`, creating the directory if needed.
   *
   * @param {string} filePath Cache file path.
   * @param {*} data Value to store.
   * @returns {Promise<void>}
   */
  async writeCache(filePath, data) {
    const payload = JSON.stringify(data);
    if (payload === undefined) {
      // `undefined`/functions have no JSON form; there is nothing to store.
      return;
    }
    await fs.mkdir(path.dirname(filePath), { recursive: true });
    await fs.writeFile(filePath, payload);
  }
}

这里是使用文件预读和缓冲区池

/**
 * Pool of equally sized Buffers that are reused instead of reallocated.
 */
class BufferPool {
  /**
   * @param {number} [poolSize=10] Buffers allocated up front.
   * @param {number} [bufferSize=65536] Size of every buffer in bytes.
   * @param {number} [maxPoolSize=20] Upper bound on idle buffers kept by
   *   `release`.
   */
  constructor(poolSize = 10, bufferSize = 64 * 1024, maxPoolSize = 20) {
    this.pool = [];
    this.bufferSize = bufferSize;
    this.maxPoolSize = maxPoolSize;
    for (let i = 0; i < poolSize; i++) {
      this.pool.push(Buffer.alloc(bufferSize));
    }
  }

  /**
   * Takes a buffer from the pool, allocating a new one when it is empty.
   * A reused buffer still holds whatever its previous user wrote.
   *
   * @returns {Buffer} A buffer of `bufferSize` bytes.
   */
  acquire() {
    return this.pool.pop() || Buffer.alloc(this.bufferSize);
  }

  /**
   * Returns a buffer to the pool.
   *
   * Ignored when the value is not a Buffer of `bufferSize` bytes (it would
   * break the size `acquire` promises), when the same buffer is already
   * pooled (it would be handed to two users at once), or when the pool
   * already holds `maxPoolSize` buffers.
   *
   * @param {Buffer} buffer Buffer previously obtained from `acquire`.
   * @returns {void}
   */
  release(buffer) {
    if (!Buffer.isBuffer(buffer) || buffer.length !== this.bufferSize) {
      return;
    }
    if (this.pool.includes(buffer)) {
      return;
    }
    if (this.pool.length < this.maxPoolSize) { // cap the pool size
      this.pool.push(buffer);
    }
  }
}
// Pre-read optimization.
/**
 * Reads up to `length` bytes at `position` using a pooled scratch buffer.
 *
 * The bytes are copied out of the scratch buffer before it goes back to the
 * pool. Returning a slice of the pooled buffer would alias memory that the
 * next `acquire()` caller overwrites.
 *
 * @param {number} fd Open file descriptor.
 * @param {number} position Byte offset in the file to read from.
 * @param {number} length Requested byte count; at most one pooled buffer's
 *   worth is read per call.
 * @param {{acquire: function(): Buffer, release: function(Buffer): void}}
 *   bufferPool Pool that supplies the scratch buffer.
 * @returns {Promise<Buffer>} An independent buffer holding the bytes read.
 */
async function preReadOptimization(fd, position, length, bufferPool) {
  const buffer = bufferPool.acquire();
  try {
    const bytesToRead = Math.min(length, buffer.length);
    const { bytesRead } = await read(fd, buffer, 0, bytesToRead, position);
    return Buffer.from(buffer.subarray(0, bytesRead));
  } finally {
    // Hand the scratch buffer back even when the read fails.
    bufferPool.release(buffer);
  }
}