我试图使用 C# 中的 PDFSharp 遍历现有 PDF 文档中的 PdfItem 对象树.
我想创建所有对象的层次结构 - 类似于"PDF Explorer"示例所做的那样 - 但我希望它是一棵树,而不是所有对象的平面列表.
根节点是document.Internals.Catalog.我想要浏览所有document.Internals.Catalog.Elements,直到我访问过每个元素.
我遇到的一个问题是树中有循环引用,我无法弄清楚如何检测它们.
有代码样本吗?
marihanzo在PDFSharp论坛上的这篇文章对我们有用:
http://forum.pdfsharp.net/viewtopic.php?f=2&t=527&p=1603
我们遇到的唯一问题是处理包含 \r\n 的字段.这是代码的副本,以防论坛帖子丢失.
PDFParser.cs
/// <summary>
/// Extracts plain text from an uncompressed PDF content stream by scanning it
/// one byte at a time and reacting to a small set of text operators.
/// </summary>
/// <remarks>
/// Operators the scanner looks for:
/// BT = beginning of a text object, ET = end of a text object,
/// TD / Td = move to the start of the next line, ' / T* / " = next-line operators,
/// Tj = show text. Text-rise operators (5 Ts = superscript, -5 Ts = subscript)
/// are not interpreted.
/// </remarks>
public class PDFParser
{
    #region Fields

    /// <summary>
    /// The number of most recent characters kept while scanning, so that an
    /// operator that has just been read can be recognised by looking back.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Token groups are allocated once instead of once per scanned byte.
    private static readonly string[] BeginTextTokens = { "BT" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] MoveTextTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };

    #endregion

    #region ExtractTextFromPDFBytes

    /// <summary>
    /// Processes an uncompressed PDF content stream and extracts the text that
    /// appears inside parentheses within BT ... ET text objects.
    /// </summary>
    /// <param name="input">The uncompressed content stream bytes; may be null or empty.</param>
    /// <returns>
    /// The extracted text. An empty string is returned for null or empty input,
    /// and also if any exception is raised while scanning.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        try
        {
            // A builder avoids the quadratic cost of repeated string concatenation.
            System.Text.StringBuilder result = new System.Text.StringBuilder();

            // True while the scanner is between BT and ET.
            bool inTextObject = false;

            // True when the previous character inside a string was an unescaped
            // backslash, so the current character is taken literally
            // (e.g. "\\" yields '\' and "\(" yields '(').
            // NOTE(review): escapes such as \n, \r or octal codes are not decoded;
            // the character following the backslash is emitted as-is.
            bool nextLiteral = false;

            // Parenthesis nesting level (0 or 1). Text appears inside ().
            int bracketDepth = 0;

            // Sliding window over the most recently read characters.
            char[] previousCharacters = new char[NumberOfCharsToKeep];
            for (int j = 0; j < NumberOfCharsToKeep; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside a string: turn positioning operators into separators.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveTextTokens, previousCharacters))
                        {
                            // NOTE(review): "\n\r" is kept byte-for-byte from the
                            // original; it is not the conventional "\r\n" order.
                            result.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object: leave text mode and emit a single space.
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text.
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text.
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Swallow the backslash; the next character is literal.
                            nextLiteral = true;
                        }
                        else
                        {
                            // Only printable ASCII and the 128..254 range are kept.
                            if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
                            {
                                result.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the look-back window one position and store the current character.
                for (int j = 0; j < NumberOfCharsToKeep - 1; j++)
                {
                    previousCharacters[j] = previousCharacters[j + 1];
                }

                previousCharacters[NumberOfCharsToKeep - 1] = c;

                // Start of a text object (checked after storing c, so BT takes
                // effect from the next character on).
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return result.ToString();
        }
        catch
        {
            // Best-effort contract kept from the original: callers get an empty
            // string rather than an exception.
            return "";
        }
    }

    #endregion

    #region CheckToken

    /// <summary>
    /// Checks whether any of the given operator tokens has just been read, i.e.
    /// whether the look-back window ends with: delimiter, token, delimiter.
    /// </summary>
    /// <param name="tokens">The tokens to search for.</param>
    /// <param name="recent">The look-back window of recently read characters.</param>
    /// <returns>True if at least one token matches; otherwise false.</returns>
    /// <remarks>
    /// Every token in the list is tested. Previously a one-character token
    /// aborted the whole search, so "'", "T*" and "\"" were never recognised.
    /// </remarks>
    private static bool CheckToken(string[] tokens, char[] recent)
    {
        foreach (string token in tokens)
        {
            if (EndsWithToken(token, recent))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Tests a single token against the tail of the look-back window.
    /// </summary>
    private static bool EndsWithToken(string token, char[] recent)
    {
        int length = token.Length;

        // The window must hold the token plus one delimiter on each side.
        if (length == 0 || length > NumberOfCharsToKeep - 2)
        {
            return false;
        }

        int trailing = NumberOfCharsToKeep - 1;
        int first = trailing - length;
        int leading = first - 1;

        if (!IsDelimiter(recent[trailing]) || !IsDelimiter(recent[leading]))
        {
            return false;
        }

        for (int k = 0; k < length; k++)
        {
            if (recent[first + k] != token[k])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns true for the characters treated as operator delimiters:
    /// space, carriage return (0x0d) and line feed (0x0a).
    /// </summary>
    private static bool IsDelimiter(char value)
    {
        return value == ' ' || value == 0x0d || value == 0x0a;
    }

    #endregion
}
和调用代码:
/// <summary>
/// Opens the PDF located at <c>_sDirectory + _sFileName</c> in read-only mode and
/// concatenates the text extracted from every content stream of every page.
/// </summary>
/// <returns>
/// The extracted text. If an exception is raised, it is wrapped in a
/// <c>PDF_ParseException</c>, logged and written out via <c>ToPdf</c>, and the
/// text gathered up to that point (possibly empty) is returned.
/// </returns>
public override String ExtractText()
{
    // A builder avoids re-copying the accumulated text for every stream.
    System.Text.StringBuilder outputText = new System.Text.StringBuilder();

    try
    {
        PdfDocument inputDocument = PdfReader.Open(this._sDirectory + this._sFileName, PdfDocumentOpenMode.ReadOnly);

        // ExtractTextFromPDFBytes keeps all of its scan state in locals, so one
        // parser instance can serve every stream.
        PDFParser parser = new PDFParser();

        foreach (PdfPage page in inputDocument.Pages)
        {
            for (int index = 0; index < page.Contents.Elements.Count; index++)
            {
                PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;

                // NOTE(review): the parser is documented as expecting uncompressed
                // data; confirm that stream.Value is already decoded for the
                // documents this is used with.
                outputText.Append(parser.ExtractTextFromPDFBytes(stream.Value));
            }
        }
    }
    catch (Exception e)
    {
        // Report the failure, then fall through and return what was collected so far.
        PDF_ParseException oEx = new PDF_ParseException(this, e);
        oEx.Log();
        oEx.ToPdf(this._sDirectoryException);
    }

    return outputText.ToString();
}