使用pdf box去水印
使用pdf box去水印
使用 PDFBox 去除水印的思路:先解析文档,把 PDF 页面内容流中的操作符(operator)及其操作数解析出来,再根据解析结果判断哪些操作符属于水印,并编写相应的去水印代码。
不同的文档去除的水印有一点点区别,但是都是大同小异,万变不离其宗。
以下给几个案例给大家参考
1.使用pdf box去除可复制文本水印
/*** 去除可复制文本水印* @param filename 去除水印的文件名称* @param keyword 要去除的水印名称* @return* @throws IOException*/
public String removeTextWatermark(String filename, String keyword) throws IOException {String filePath = uploadFilesDir + File.separator + filename;PDDocument document = PDDocument.load(new File(filePath));for (PDPage page : document.getPages()) {PDFStreamParser parser = new PDFStreamParser(page);parser.parse();List<Object> tokens = parser.getTokens(); //使用PDFBox的PDFStreamParser解析页面内容为token列表List<Object> newTokens = new ArrayList<>();for (int i = 0; i < tokens.size(); i++) {Object token = tokens.get(i);if (token instanceof Operator) {Operator op = (Operator) token;if (op.getName().equals("Tj")) { //文本显示操作符"Tj"COSString previous = (COSString) newTokens.remove(newTokens.size() - 1);if (!previous.getString().contains(keyword)) {newTokens.add(previous);newTokens.add(op);}} else if (op.getName().equals("TJ")) { //文本数组显示操作符"TJ" 检查文本数组拼接后的内容是否包含关键字,不包含则保留COSArray previous = (COSArray) newTokens.remove(newTokens.size() - 1);StringBuilder text = new StringBuilder();for (COSBase base : previous) {if (base instanceof COSString) {text.append(((COSString) base).getString());}}if (!text.toString().contains(keyword)) {newTokens.add(previous);newTokens.add(op);}} else {newTokens.add(op);}} else {newTokens.add(token);}}PDStream newStream = new PDStream(document);OutputStream out = newStream.createOutputStream(COSName.FLATE_DECODE);ContentStreamWriter writer = new ContentStreamWriter(out);writer.writeTokens(newTokens);out.close();page.setContents(newStream);}String textWatermarkRemoveFilename = filename.replace(".pdf", "_cleaned.pdf");document.save(uploadFilesDir + File.separator+ textWatermarkRemoveFilename);document.close();return textWatermarkRemoveFilename;}
2.去除文本不可复制的水印 && 去除路径绘制的水印
/*** 去除不可复制文本水印 也可以去除通过绘制路径生成的水印(比如此项目生成的水印)* extractPartOperators----[COSFloat{0.70711} COSFloat{0.70711} COSFloat{-0.70711} COSFloat{0.70711} COSFloat{297.64999} COSFloat{420.95001} cm* , COSFloat{0.70711} COSFloat{0.70711} COSFloat{-0.70711} COSFloat{0.70711} COSFloat{476.23999} COSFloat{673.52002} cm* , COSName{GS13} gs* , COSName{GS1} gs* , COSFloat{0.70711} COSFloat{0.70711} COSFloat{-0.70711} COSFloat{0.70711} COSFloat{119.06} COSFloat{168.38} cm* , COSInt{1} COSInt{0} COSInt{0} COSInt{-1} COSInt{0} COSFloat{841.9} cm* ]* @param filename* @param* @return* @throws IOException*/
public String removeUncopyTextWatermark(String filename ) throws IOException {String filePath = uploadFilesDir + File.separator + filename;PDDocument document = PDDocument.load(new File(filePath));// Set<String> xObjectNames = PdfOperatorInspector.extractPartOperators(filePath);for (PDPage page : document.getPages()) {PDFStreamParser parser = new PDFStreamParser(page);parser.parse();List<Object> tokens = parser.getTokens();List<Object> newTokens = new ArrayList<>();List<Object> tempBuffer = new ArrayList<>();boolean inWatermarkBlock = false;for (int i = 0; i < tokens.size(); i++) {Object token = tokens.get(i);if (token instanceof Operator) {Operator op = (Operator) token;String opName = op.getName();if ("cm".equals(opName)) {// 预判断:变换矩阵可疑(角度/位置)if (i >= 6 && tokens.get(i - 6) instanceof COSFloat) {COSFloat v = (COSFloat) tokens.get(i - 1); // 最后一个是 Yif (v.floatValue() > 100 || v.floatValue() < 700) { // 你可以调这个阈值,看对应的 COSFloat 参数inWatermarkBlock = true;tempBuffer.clear();}}}if (inWatermarkBlock) {tempBuffer.add(token);// 路径绘制结束if ("S".equals(opName) || "f".equals(opName) || "B".equals(opName)) {// 整段疑似水印块跳过(不添加)inWatermarkBlock = false;tempBuffer.clear();}} else {// 正常添加newTokens.addAll(tempBuffer);tempBuffer.clear();newTokens.add(token);}} else {if (inWatermarkBlock) {tempBuffer.add(token); // 暂存} else {newTokens.add(token);}}}// 重写页面内容流PDStream newContents = new PDStream(document);try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {ContentStreamWriter writer = new ContentStreamWriter(out);writer.writeTokens(newTokens);}page.setContents(newContents);}String textWatermarkRemoveFilename = filename.replace(".pdf", "_cleaned.pdf");document.save(uploadFilesDir + File.separator+ textWatermarkRemoveFilename);document.close();return textWatermarkRemoveFilename;}
3.去除使用图片生成的水印
/*** 去除图片水印,不会误删图片* 水印和正文图片都是图片,很难通过判断图片是否翻转,是否有透明度来验证是图片还是水印,* 所以最终方案是通过abcdef分组,里面数量最大的那个就是水印* 一般图片很少会abcdef都相等的,如果真有,再根据实际情况判断* @param filename* @return* @throws IOException*/public String removeImgWatermark(String filename) throws IOException {String filePath = uploadFilesDir + File.separator + filename;PDDocument document = PDDocument.load(new File(filePath));// Map<MatrixKey, Set<ImageName>>Map<String, Set<String>> matrixToImageNames = new HashMap<>();// System.out.println("文件解析:" + extractAllOperators(filePath));for (PDPage page : document.getPages()) {PDFStreamParser parser = new PDFStreamParser(page);parser.parse();List<Object> tokens = parser.getTokens();for (int i = 0; i < tokens.size(); i++) {Object token = tokens.get(i);if (token instanceof Operator) {Operator op = (Operator) token;if ("cm".equals(op.getName()) && i >= 6) {String matrixKey = extractMatrixKey(tokens, i - 6);if (matrixKey != null && i + 2 < tokens.size()) {Operator nextOp = null;COSName imageName = null;// 试图找到 cm 后面紧跟 Do 的图像调用for (int j = i + 1; j < tokens.size(); j++) {Object nextToken = tokens.get(j);if (nextToken instanceof Operator) {nextOp = (Operator) nextToken;break;} else if (nextToken instanceof COSName) {imageName = (COSName) nextToken;}}if (nextOp != null && "Do".equals(nextOp.getName()) && imageName != null) {matrixToImageNames.computeIfAbsent(matrixKey, k -> new HashSet<>()).add(imageName.getName());}}}}}}// 找出数量最多的 matrixKey(认为是水印)String targetMatrix = matrixToImageNames.entrySet().stream().max(Comparator.comparingInt(e -> e.getValue().size())).map(Map.Entry::getKey).orElse(null);if (targetMatrix == null) {document.close();return filename;}Set<String> watermarkImageNames = matrixToImageNames.get(targetMatrix);System.out.println("识别为水印图像名:" + watermarkImageNames);// 第二次遍历,删除目标图片for (PDPage page : document.getPages()) {PDFStreamParser parser = new PDFStreamParser(page);parser.parse();List<Object> tokens = parser.getTokens();List<Object> newTokens = new ArrayList<>();for (int 
i = 0; i < tokens.size(); i++) {Object token = tokens.get(i);if (token instanceof Operator && "Do".equals(((Operator) token).getName())) {if (i >= 1 && tokens.get(i - 1) instanceof COSName) {COSName name = (COSName) tokens.get(i - 1);if (watermarkImageNames.contains(name.getName())) {newTokens.remove(newTokens.size() - 1); // remove namecontinue; // skip Do}}}newTokens.add(token);}PDStream newStream = new PDStream(document);try (OutputStream out = newStream.createOutputStream(COSName.FLATE_DECODE)) {ContentStreamWriter writer = new ContentStreamWriter(out);writer.writeTokens(newTokens);}page.setContents(newStream);}String cleanedFile = filename.replace(".pdf", "_cleaned.pdf");document.save(uploadFilesDir + File.separator + cleanedFile);document.close();return cleanedFile;}private String extractMatrixKey(List<Object> tokens, int startIdx) {try {//a=d(等比例缩放),b=c=0(无倾斜),e=f=0(无位移)float a = ((COSNumber) tokens.get(startIdx)).floatValue();float b = ((COSNumber) tokens.get(startIdx + 1)).floatValue();float c = ((COSNumber) tokens.get(startIdx + 2)).floatValue();float d = ((COSNumber) tokens.get(startIdx + 3)).floatValue();float e = ((COSNumber) tokens.get(startIdx + 4)).floatValue();float f = ((COSNumber) tokens.get(startIdx + 5)).floatValue();
//
// float a = ((COSNumber) tokens.get(startIdx - 6)).floatValue(); //水平缩放 2.0 水平放大2倍
// float b = ((COSNumber) tokens.get(startIdx - 5)).floatValue(); //水平倾斜 0.5 垂直方向倾斜(斜率0.5)
// float c = ((COSNumber) tokens.get(startIdx - 4)).floatValue(); //垂直倾斜 -0.3 水平方向倾斜(斜率-0.3)
// float d = ((COSNumber) tokens.get(startIdx - 3)).floatValue(); // 垂直缩放 0.5 垂直缩小50%
// float e = ((COSNumber) tokens.get(startIdx - 2)).floatValue(); // 参数 e((X轴位移)水平平移) 正数=向右,负数=向左
// float f = ((COSNumber) tokens.get(startIdx - 1)).floatValue(); // 参数 f((Y轴位移)垂直平移)正数=向下,负数=向上(PDF坐标系Y轴向下为正)
// //通过这些数据观察,如果页数大于2,水印数量也大于2,那么属于水印的这些abcdef肯定是相等的//普通图片的abcdef很难说是一致的,一般应该都是不一样的,所以可以通过这个思路去判断System.out.println( startIdx+"--cm matrix a="+a+ " && b=" + b + "&&c=" + c+" && d = "+ d+ "&&e=" +e +"&&f="+ f+" \n ");// 保留3位小数用于分组防止浮点误差return String.format("%.3f,%.3f,%.3f,%.3f,%.3f,%.3f", a, b, c, d, e, f);} catch (Exception e) {return null;}}
最后贴一下我的pom.xml 文件的主要内容
<!-- Build settings: Java 8 source/target, UTF-8 sources, one shared PDFBox version. -->
<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <pdfbox.version>2.0.30</pdfbox.version>
</properties>

<dependencies>
    <!-- Spring Boot web starter (no version is declared here) -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>

    <!-- PDFBox core library -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>${pdfbox.version}</version>
    </dependency>

    <!-- PDFBox command-line tools -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox-tools</artifactId>
        <version>${pdfbox.version}</version>
    </dependency>

    <!-- Development-time tooling; optional, so it is not passed on transitively -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-devtools</artifactId>
        <optional>true</optional>
    </dependency>
</dependencies>