Java实现word、pdf转html保留格式
一、word转html
依赖:
<properties><poi.version>5.2.3</poi.version><xhtml.version>2.0.4</xhtml.version>
</properties><!--word转html-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>${poi.version}</version>
</dependency>
<!--word转html-->
<dependency><groupId>fr.opensagres.xdocreport</groupId><artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId><version>${xhtml.version}</version>
</dependency>
<!--处理office文档表格相关 2007+版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>${poi.version}</version>
</dependency>
<!--处理office文档表格相关 2003版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>${poi.version}</version>
</dependency>
代码:
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;public class WordUtil {public static String wordToHtml(String fileUrl,String fileSuffix) throws Exception {URL url = new URL(fileUrl);try (InputStream inputStream = url.openStream()) {if(fileSuffix.equals(".docx") || fileSuffix.equals(".DOCX")){return word2007ToHtml(inputStream);} else if (fileSuffix.equals(".doc") || fileSuffix.equals(".DOC")) {return word2003ToHtml(inputStream);}else{throw new RuntimeException("错误的文件后缀");}} catch (RuntimeException e) {throw new RuntimeException(e.getMessage());}}/*** word2007转换成html* 对于docx,可以用下面这种方式:* @throws Exception*/public static String word2007ToHtml(InputStream inputStream) {try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();XWPFDocument docxDocument = new XWPFDocument(inputStream)) {XHTMLOptions options = XHTMLOptions.create();// 是否忽略未使用的样式options.setIgnoreStylesIfUnused(false);// 设置片段模式,<div>标签包裹options.setFragment(true);// 图片转base64options.setImageManager(new Base64EmbedImgManager());// 转换htm1XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);return htmlStream.toString();} catch (Exception e) {System.out.println("Word转Html过程出现异常!");throw new RuntimeException(e.getMessage());}}/*** word2003转换成html* 对于doc,可以用下面这种方式:* @throws Exception*/public static String word2003ToHtml(InputStream inputStream ) throws Exception {try (StringWriter writer = new StringWriter();HWPFDocument document = new HWPFDocument(inputStream)) {WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());//将图片转成base64的格式wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes));wordToHtmlConverter.processDocument(document);org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();DOMSource domSource = new DOMSource(htmlDocument);TransformerFactory factory = TransformerFactory.newInstance();Transformer serializer = factory.newTransformer();serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");serializer.setOutputProperty(OutputKeys.INDENT, "yes");serializer.setOutputProperty(OutputKeys.METHOD, "html");serializer.transform(domSource, new StreamResult(writer));return writer.toString();} catch (Exception e) {System.out.println("Word转Html过程出现异常!");throw new RuntimeException(e.getMessage());}}}
来源博客:Java实现word转html_java word转html-CSDN博客
二、pdf转html
依赖:
<dependency><groupId>net.sf.cssbox</groupId><artifactId>pdf2dom</artifactId></dependency><dependency><groupId>net.mabboud.fontverter</groupId><artifactId>FontVerter</artifactId></dependency><dependency><groupId>org.reflections</groupId><artifactId>reflections</artifactId></dependency><!--pdf转文本--><dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId></dependency>
代码:
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;import java.io.*;
import java.net.URL;public class PDFUtil {public static String pdfToHtml(String fileUrl) throws IOException {URL url = new URL(fileUrl);try (InputStream inputStream = url.openStream()){return pdfToHtml(inputStream);}catch (Exception e){throw new IOException(e.getMessage());}}public static String pdfToHtml(InputStream inputStream) throws IOException {String outFilePath = "mypdf.html";String pdfContent = "";PDDocument document = PDDocument.load(inputStream);Writer writer = new PrintWriter(outFilePath, "UTF-8");new PDFDomTree().writeText(document, writer);writer.close();document.close();// 获取html内容try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) {StringBuilder htmlContent = new StringBuilder();String line;while ((line = reader.readLine()) != null) {htmlContent.append(line).append("\n"); // 追加每一行内容,并添加换行符}pdfContent = String.valueOf(htmlContent);return pdfContent;} catch (IOException e) {e.printStackTrace();System.err.println("读取 HTML 文件时出错。");}return null;}
}
来源博客:使用Java实现PDF到HTML的转换_java pdf转html-CSDN博客