using System;
using System.IO;
using System.Linq;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
public static string GetWordContentByOpenXml(string path, string password)
{try{using (var document = WordprocessingDocument.Open(path, false, new OpenSettings(){Password = password})){if (document.MainDocumentPart?.Document?.Body == null)return null;var contentBuilder = new StringBuilder();var body = document.MainDocumentPart.Document.Body;ExtractBodyContent(body, contentBuilder);string contentWithoutHeaderFooter = contentBuilder.ToString();string content = CleanContent(contentWithoutHeaderFooter);int index = content.LastIndexOf("限公司第");if (index > 0){return content.Substring(0, index).Trim();}else{return content;}}}catch (Exception ex){LogManager.WriteError("GetWordContentByOpenXml()", ex.StackTrace?.ToString());return null;}
}
private static void ExtractBodyContent(Body body, StringBuilder contentBuilder)
{foreach (var element in body.Elements()){ExtractElementContent(element, contentBuilder);}
}
private static void ExtractElementContent(OpenXmlElement element, StringBuilder contentBuilder)
{switch (element){case Paragraph paragraph:ExtractParagraphContent(paragraph, contentBuilder);contentBuilder.AppendLine(); break;case Table table:ExtractTableContent(table, contentBuilder);break;case SectionProperties _:break;default:foreach (var childElement in element.Elements()){ExtractElementContent(childElement, contentBuilder);}break;}
}
private static void ExtractParagraphContent(Paragraph paragraph, StringBuilder contentBuilder)
{foreach (var run in paragraph.Elements<Run>()){foreach (var text in run.Elements<Text>()){contentBuilder.Append(text.Text);}foreach (var tab in run.Elements<TabChar>()){contentBuilder.Append("\t");}foreach (var br in run.Elements<Break>()){contentBuilder.AppendLine();}}
}
private static void ExtractTableContent(Table table, StringBuilder contentBuilder)
{foreach (var row in table.Elements<TableRow>()){foreach (var cell in row.Elements<TableCell>()){foreach (var paragraph in cell.Elements<Paragraph>()){ExtractParagraphContent(paragraph, contentBuilder);}contentBuilder.Append("\t"); }contentBuilder.AppendLine(); }
}
private static string CleanContent(string content)
{if (string.IsNullOrEmpty(content))return string.Empty;content = System.Text.RegularExpressions.Regex.Replace(content, @"\s+", " ");content = content.Trim();content = System.Text.RegularExpressions.Regex.Replace(content, @"\n\s*\n", "\n");content = content.Replace("EvaluationOnly.CreatedwithAspose.Words.Copyright2003-2024AsposePtyLtd.", "");content = System.Text.RegularExpressions.Regex.Replace(content, @"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "");return content.Trim();
}
public static bool IsPasswordRequired(string path)
{try{using (var document = WordprocessingDocument.Open(path, false)){return false;}}catch (OpenXmlPackageException ex){return ex.Message.Contains("password") || ex.Message.Contains("encrypted") || ex.Message.Contains("protected");}catch{return true;}
}
public static string GetWordContentByOpenXmlAdvanced(string path, string password, bool includeHyperlinks = false, bool includeFootnotes = false)
{try{using (var document = WordprocessingDocument.Open(path, false, new OpenSettings(){Password = password})){if (document.MainDocumentPart?.Document?.Body == null)return null;var contentBuilder = new StringBuilder();var body = document.MainDocumentPart.Document.Body;ExtractBodyContentAdvanced(body, contentBuilder, includeHyperlinks);if (includeFootnotes && document.MainDocumentPart.FootnotesPart != null){ExtractFootnotesContent(document.MainDocumentPart.FootnotesPart, contentBuilder);}string contentWithoutHeaderFooter = contentBuilder.ToString();string content = CleanContent(contentWithoutHeaderFooter);int index = content.LastIndexOf("公司第");if (index > 0){return content.Substring(0, index).Trim();}else{return content;}}}catch (Exception ex){LogManager.WriteError("GetWordContentByOpenXmlAdvanced()", ex.StackTrace?.ToString());return null;}
}
private static void ExtractBodyContentAdvanced(Body body, StringBuilder contentBuilder, bool includeHyperlinks)
{foreach (var element in body.Elements()){if (element is Paragraph paragraph){ExtractParagraphContentAdvanced(paragraph, contentBuilder, includeHyperlinks);contentBuilder.AppendLine();}else if (element is Table table){ExtractTableContentAdvanced(table, contentBuilder, includeHyperlinks);}else if (!(element is SectionProperties)){foreach (var childElement in element.Elements()){ExtractBodyContentAdvanced(new Body(childElement), contentBuilder, includeHyperlinks);}}}
}
private static void ExtractParagraphContentAdvanced(Paragraph paragraph, StringBuilder contentBuilder, bool includeHyperlinks)
{foreach (var element in paragraph.Elements()){if (element is Run run){foreach (var text in run.Elements<Text>()){contentBuilder.Append(text.Text);}}else if (element is Hyperlink hyperlink && includeHyperlinks){foreach (var run2 in hyperlink.Elements<Run>()){foreach (var text in run2.Elements<Text>()){contentBuilder.Append(text.Text);}}}}
}
private static void ExtractTableContentAdvanced(Table table, StringBuilder contentBuilder, bool includeHyperlinks)
{foreach (var row in table.Elements<TableRow>()){foreach (var cell in row.Elements<TableCell>()){foreach (var paragraph in cell.Elements<Paragraph>()){ExtractParagraphContentAdvanced(paragraph, contentBuilder, includeHyperlinks);}contentBuilder.Append("\t");}contentBuilder.AppendLine();}
}
private static void ExtractFootnotesContent(FootnotesPart footnotesPart, StringBuilder contentBuilder)
{if (footnotesPart.Footnotes != null){contentBuilder.AppendLine("\n--- 脚注 ---");foreach (var footnote in footnotesPart.Footnotes.Elements<Footnote>()){foreach (var paragraph in footnote.Elements<Paragraph>()){ExtractParagraphContent(paragraph, contentBuilder);contentBuilder.AppendLine();}}}
}