我找到了这个问题,但其中的做法依赖命令行;我不想通过子进程在命令行中调用 Python 脚本,再解析生成的 HTML 文件来获取字体信息。
我想把 PDFMiner 当作库来使用。我也找到了这个问题,但其中的回答只是提取纯文本,没有提供字体名称、字体大小等其他信息。
#!/usr/bin/env python
# Print the font name of every character found on the first page of a PDF,
# using pdfminer as a library (no subprocess call, no HTML parsing).
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer


def createPDFDoc(fpath):
    """Open the PDF at *fpath* and return it as a ``PDFDocument``.

    Args:
        fpath: Path of the PDF file to open.

    Returns:
        The ``PDFDocument`` built from the file (opened with an empty
        password).

    Raises:
        ValueError: If the document does not allow text extraction.
    """
    fp = open(fpath, 'rb')
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password='')
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            # A bare string is not a valid exception object; raising one
            # fails with a TypeError instead of reporting the real problem.
            raise ValueError("Not extractable")
    except Exception:
        # Do not leak the file handle when the document cannot be used.
        fp.close()
        raise
    # NOTE(review): the file is intentionally left open on success because
    # the returned document is presumably still read from it while pages
    # are processed -- confirm before adding a close here.
    return document


def createDeviceInterpreter():
    """Build the layout-aggregating device and its page interpreter.

    Returns:
        A ``(device, interpreter)`` tuple. After
        ``interpreter.process_page(page)``, ``device.get_result()`` yields
        the layout of that page.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return device, interpreter


def parse_obj(objs):
    """Print ``fontname <name>`` for every character in *objs*.

    Text boxes are scanned line by line (lines that contain only
    whitespace are skipped); figures are treated as containers and
    searched recursively. Every other layout object is ignored.

    Args:
        objs: Iterable of pdfminer layout objects, e.g. a page layout.
    """
    for obj in objs:
        if isinstance(obj, pdfminer.layout.LTTextBox):
            for line in obj:
                if not isinstance(line, pdfminer.layout.LTTextLine):
                    continue
                if not line.get_text().strip():
                    continue
                for char in line:
                    if isinstance(char, pdfminer.layout.LTChar):
                        # Single-argument form works as a statement on
                        # Python 2 and as a function call on Python 3.
                        print("fontname %s" % char.fontname)
        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj)


def main(fpath="/tmp/simple.pdf"):
    """Print the font names used on the first page of the PDF at *fpath*."""
    document = createPDFDoc(fpath)
    device, interpreter = createDeviceInterpreter()
    pages = PDFPage.create_pages(document)
    # next(it, default) works on Python 2 and 3 and avoids an unhandled
    # StopIteration when the document has no pages.
    first_page = next(pages, None)
    if first_page is None:
        return
    interpreter.process_page(first_page)
    layout = device.get_result()
    parse_obj(layout)


if __name__ == "__main__":
    main()