当前位置: 首页 > backend >正文

Java实现word、pdf转html保留格式

一、word转html

依赖:

<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>

<!--word转html-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>${poi.version}</version>
</dependency>
<!--word转html-->
<dependency><groupId>fr.opensagres.xdocreport</groupId><artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId><version>${xhtml.version}</version>
</dependency>
<!--处理office文档表格相关 2007+版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>${poi.version}</version>
</dependency>
<!--处理office文档表格相关 2003版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>${poi.version}</version>
</dependency>

代码:

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;public class WordUtil {public static String wordToHtml(String fileUrl,String fileSuffix) throws Exception {URL url = new URL(fileUrl);try (InputStream inputStream = url.openStream()) {if(fileSuffix.equals(".docx") || fileSuffix.equals(".DOCX")){return word2007ToHtml(inputStream);} else if (fileSuffix.equals(".doc") || fileSuffix.equals(".DOC")) {return word2003ToHtml(inputStream);}else{throw new RuntimeException("错误的文件后缀");}} catch (RuntimeException e) {throw new RuntimeException(e.getMessage());}}/*** word2007转换成html* 对于docx,可以用下面这种方式:* @throws Exception*/public static String word2007ToHtml(InputStream inputStream) {try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();XWPFDocument docxDocument = new XWPFDocument(inputStream)) {XHTMLOptions options = XHTMLOptions.create();// 是否忽略未使用的样式options.setIgnoreStylesIfUnused(false);// 设置片段模式,<div>标签包裹options.setFragment(true);// 图片转base64options.setImageManager(new Base64EmbedImgManager());// 转换htm1XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);return htmlStream.toString();} catch (Exception e) {System.out.println("Word转Html过程出现异常!");throw new RuntimeException(e.getMessage());}}/*** word2003转换成html* 对于doc,可以用下面这种方式:* @throws Exception*/public static String word2003ToHtml(InputStream inputStream ) throws Exception {try (StringWriter writer = new StringWriter();HWPFDocument document = new HWPFDocument(inputStream)) {WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());//将图片转成base64的格式wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes));wordToHtmlConverter.processDocument(document);org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();DOMSource domSource = new DOMSource(htmlDocument);TransformerFactory factory = TransformerFactory.newInstance();Transformer serializer = 
factory.newTransformer();serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");serializer.setOutputProperty(OutputKeys.INDENT, "yes");serializer.setOutputProperty(OutputKeys.METHOD, "html");serializer.transform(domSource, new StreamResult(writer));return writer.toString();} catch (Exception e) {System.out.println("Word转Html过程出现异常!");throw new RuntimeException(e.getMessage());}}}

来源博客:Java实现word转html_java word转html-CSDN博客

二、pdf转html

依赖:

        <!-- 注:以下依赖未写 <version>,需由父 POM 的 dependencyManagement 提供,或自行补充版本号 -->
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
        </dependency>
        <dependency>
            <groupId>net.mabboud.fontverter</groupId>
            <artifactId>FontVerter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.reflections</groupId>
            <artifactId>reflections</artifactId>
        </dependency>
        <!--pdf转文本-->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
        </dependency>

 代码:

import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;import java.io.*;
import java.net.URL;public class PDFUtil {public static String pdfToHtml(String fileUrl) throws IOException {URL url = new URL(fileUrl);try (InputStream inputStream = url.openStream()){return pdfToHtml(inputStream);}catch (Exception e){throw new IOException(e.getMessage());}}public static String pdfToHtml(InputStream inputStream) throws IOException {String outFilePath = "mypdf.html";String pdfContent = "";PDDocument document = PDDocument.load(inputStream);Writer writer = new PrintWriter(outFilePath, "UTF-8");new PDFDomTree().writeText(document, writer);writer.close();document.close();// 获取html内容try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) {StringBuilder htmlContent = new StringBuilder();String line;while ((line = reader.readLine()) != null) {htmlContent.append(line).append("\n"); // 追加每一行内容,并添加换行符}pdfContent = String.valueOf(htmlContent);return pdfContent;} catch (IOException e) {e.printStackTrace();System.err.println("读取 HTML 文件时出错。");}return null;}
}

 来源博客:使用Java实现PDF到HTML的转换_java pdf转html-CSDN博客

http://www.xdnf.cn/news/15468.html

相关文章:

  • HTTP性能优化实战技术
  • 【电脑】显卡(GPU)的基础知识
  • 暑期算法训练.1
  • 【解决】联想电脑亮度调节
  • 行为模式-状态模式
  • 前端打包自动压缩为zip--archiver
  • MongoDB数据问题说明
  • 大模型在1型糖尿病肾病V期预测及治疗方案制定中的应用研究
  • 《大数据技术原理与应用》实验报告五 熟悉 Hive 的基本操作
  • 用uniapp开发鸿蒙应用(暂停更新-根据项目更新,现在项目未开始)
  • LangChain智能体开发实战:从零构建企业级AI助手
  • 17、鸿蒙Harmony Next开发:状态管理(组件拥有的状态和应用拥有的状态)
  • 3种添加视频水印的加密方式,守护视频安全!
  • OpenCV 对比度拉伸图像增强函数contrastStretching()
  • 基于UDP/IP网络游戏加速高级拥塞控制算法(示意:一)
  • 21-C#的委托简单使用-1
  • 【zynq7020】PS的“Hello World”
  • Android弹窗
  • 【C++】初识C++(1)
  • 映美打印机-URL页面打印
  • AI产品经理面试宝典第20天:AI+金融场景相关面试题及回答指导
  • 缓存穿透的“黑暗森林”假说——当攻击者学会隐藏恶意流量
  • 02 51单片机之LED闪烁
  • Knife4j快速入门
  • Java-IO流
  • Redis单线程详解
  • 多线程--单例模式and工厂模式
  • 2025-7-14-C++ 学习 排序(2)
  • C#——数据与变量
  • 力扣454.四数相加Ⅱ