converter.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252
  1. import io
  2. import logging
  3. import re
  4. from typing import (
  5. BinaryIO,
  6. Dict,
  7. Generic,
  8. List,
  9. Optional,
  10. Sequence,
  11. TextIO,
  12. Tuple,
  13. TypeVar,
  14. Union,
  15. cast,
  16. )
  17. import concurrent.futures
  18. import mtranslate as translator
  19. import unicodedata
  20. import tqdm.auto
  21. from tenacity import retry
  22. from pdf2zh import cache
  def remove_control_characters(s: str) -> str:
      """Return ``s`` without characters whose Unicode category starts with "C".

      A character is dropped when the first letter of the value returned by
      ``unicodedata.category`` is ``"C"``; every other character is kept, in its
      original order.
      """
      return "".join(
          ch for ch in s if not unicodedata.category(ch).startswith("C")
      )
  25. from pdf2zh import utils
  26. from pdf2zh.image import ImageWriter
  27. from pdf2zh.layout import (
  28. LAParams,
  29. LTAnno,
  30. LTChar,
  31. LTComponent,
  32. LTContainer,
  33. LTCurve,
  34. LTFigure,
  35. LTImage,
  36. LTItem,
  37. LTLayoutContainer,
  38. LTLine,
  39. LTPage,
  40. LTRect,
  41. LTText,
  42. LTTextBox,
  43. LTTextBoxVertical,
  44. LTTextGroup,
  45. LTTextLine,
  46. TextGroupElement,
  47. )
  48. from pdf2zh.pdfcolor import PDFColorSpace
  49. from pdf2zh.pdfdevice import PDFTextDevice
  50. from pdf2zh.pdfexceptions import PDFValueError
  51. from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
  52. from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
  53. from pdf2zh.pdfpage import PDFPage
  54. from pdf2zh.pdftypes import PDFStream
  55. from pdf2zh.utils import (
  56. AnyIO,
  57. Matrix,
  58. PathSegment,
  59. Point,
  60. Rect,
  61. apply_matrix_pt,
  62. bbox2str,
  63. enc,
  64. make_compat_str,
  65. mult_matrix,
  66. )
  # Module-level logger, named after this module.
  log = logging.getLogger(__name__)
  class PDFLayoutAnalyzer(PDFTextDevice):
      """Text device that collects rendered page content into layout objects.

      Each page is gathered into an ``LTPage`` held in ``cur_item``; nested
      figures are tracked through ``_stack``.  When a page ends, the finished
      ``LTPage`` is passed to ``receive_layout``, which subclasses override.
      """

      cur_item: LTLayoutContainer
      ctm: Matrix

      def __init__(
          self,
          rsrcmgr: PDFResourceManager,
          pageno: int = 1,
          laparams: Optional[LAParams] = None,
      ) -> None:
          """Store the page counter and layout parameters.

          :param rsrcmgr: resource manager forwarded to ``PDFTextDevice``.
          :param pageno: page counter, incremented at the end of every page.
          :param laparams: layout parameters; stored but not applied here.
          """
          PDFTextDevice.__init__(self, rsrcmgr)
          self.pageno = pageno
          self.laparams = laparams
          # Containers suspended while a nested figure is being filled.
          self._stack: List[LTLayoutContainer] = []

      def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
          """Create the ``LTPage`` for ``page``, sized from its transformed media box."""
          (x0, y0, x1, y1) = page.mediabox
          (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
          (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
          mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
          self.cur_item = LTPage(page.pageno, mediabox)

      def end_page(self, page: PDFPage):
          """Finish the current page and return the result of ``receive_layout``."""
          assert not self._stack, str(len(self._stack))
          assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
          # The default layout analysis is intentionally disabled:
          # if self.laparams is not None:
          #     self.cur_item.analyze(self.laparams)
          self.pageno += 1
          return self.receive_layout(self.cur_item)

      def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
          """Suspend the current container and start collecting into a new figure."""
          self._stack.append(self.cur_item)
          self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

      def end_figure(self, _: str) -> None:
          """Close the current figure and add it to the container it was opened in."""
          fig = self.cur_item
          assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
          self.cur_item = self._stack.pop()
          self.cur_item.add(fig)

      def render_image(self, name: str, stream: PDFStream) -> None:
          """Add an ``LTImage`` covering the bounding box of the current figure."""
          assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
          item = LTImage(
              name,
              stream,
              (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
          )
          self.cur_item.add(item)

      @staticmethod
      def _is_closed_rectangle(pts: Sequence[Point]) -> bool:
          """Tell whether five path points form a closed, axis-aligned rectangle."""
          (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
          is_closed_loop = pts[0] == pts[4]
          has_square_coordinates = (
              x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
          ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
          return is_closed_loop and has_square_coordinates

      def paint_path(
          self,
          gstate: PDFGraphicState,
          stroke: bool,
          fill: bool,
          evenodd: bool,
          path: Sequence[PathSegment],
      ) -> None:
          """Paint paths described in section 4.4 of the PDF reference manual.

          A path is added to the current container as an ``LTLine`` (single
          segment), an ``LTRect`` (closed axis-aligned rectangle) or an
          ``LTCurve`` (anything else).  Paths holding several subpaths are split
          and painted one subpath at a time.
          """
          shape = "".join(x[0] for x in path)

          if shape[:1] != "m":
              # Per PDF Reference Section 4.4.1, "path construction operators may
              # be invoked in any sequence, but the first one invoked must be m
              # or re to begin a new subpath." `re` (rectangle) operators are
              # already converted to their equivalent `mlllh` representation
              # before they reach this method, so a path that does not begin
              # with `m` is invalid and is ignored.
              return

          if shape.count("m") > 1:
              # Recurse once per subpath when the shape holds several m's.
              for m in re.finditer(r"m[^m]+", shape):
                  subpath = path[m.start(0) : m.end(0)]
                  self.paint_path(gstate, stroke, fill, evenodd, subpath)
              return

          # Although the 'h' command does not literally provide a
          # point-position, its position is (by definition) equal to the
          # subpath's starting point.
          #
          # And, per Section 4.4's Table 4.9, all other path commands place
          # their point-position in their final two arguments. (Any preceding
          # arguments represent control points on Bezier curves.)
          raw_pts = [
              cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
          ]
          pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]

          operators = [str(operation[0]) for operation in path]
          transformed_points = [
              [
                  apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
                  for operand1, operand2 in zip(operation[1::2], operation[2::2])
              ]
              for operation in path
          ]
          transformed_path = [
              cast(PathSegment, (o, *p))
              for o, p in zip(operators, transformed_points)
          ]

          item: LTCurve
          if shape in {"mlh", "ml"}:
              # Single line segment.  'ml' is a frequent anomaly that is
              # deliberately supported.
              item = LTLine(
                  gstate.linewidth,
                  pts[0],
                  pts[1],
                  stroke,
                  fill,
                  evenodd,
                  gstate.scolor,
                  gstate.ncolor,
                  original_path=transformed_path,
                  dashing_style=gstate.dash,
              )
          elif shape in {"mlllh", "mllll"} and self._is_closed_rectangle(pts):
              item = LTRect(
                  gstate.linewidth,
                  (*pts[0], *pts[2]),
                  stroke,
                  fill,
                  evenodd,
                  gstate.scolor,
                  gstate.ncolor,
                  transformed_path,
                  gstate.dash,
              )
          else:
              item = LTCurve(
                  gstate.linewidth,
                  pts,
                  stroke,
                  fill,
                  evenodd,
                  gstate.scolor,
                  gstate.ncolor,
                  transformed_path,
                  gstate.dash,
              )
          self.cur_item.add(item)

      def render_char(
          self,
          matrix: Matrix,
          font: PDFFont,
          fontsize: float,
          scaling: float,
          rise: float,
          cid: int,
          ncs: PDFColorSpace,
          graphicstate: PDFGraphicState,
      ) -> float:
          """Add one character to the current container and return its advance.

          When the font has no Unicode mapping for ``cid`` the text comes from
          ``handle_undefined_char`` instead.
          """
          try:
              text = font.to_unichr(cid)
              assert isinstance(text, str), str(type(text))
          except PDFUnicodeNotDefined:
              text = self.handle_undefined_char(font, cid)
          textwidth = font.char_width(cid)
          textdisp = font.char_disp(cid)
          item = LTChar(
              matrix,
              font,
              fontsize,
              scaling,
              rise,
              text,
              textwidth,
              textdisp,
              ncs,
              graphicstate,
          )
          self.cur_item.add(item)
          # Hack: keep the original character id on the layout object so that
          # later stages can read it back.
          item.cid = cid
          return item.adv

      def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
          """Return the placeholder text used for a character without a mapping."""
          # log.debug("undefined: %r, %r", font, cid)
          return "(cid:%d)" % cid

      def receive_layout(self, ltpage: LTPage) -> None:
          """Hook called with each finished page; does nothing in this class."""
          pass
  class PDFPageAggregator(PDFLayoutAnalyzer):
      """Layout analyzer that keeps the most recently finished page."""

      def __init__(
          self,
          rsrcmgr: PDFResourceManager,
          pageno: int = 1,
          laparams: Optional[LAParams] = None,
      ) -> None:
          """Initialise the analyzer with no stored page."""
          PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
          self.result: Optional[LTPage] = None

      def receive_layout(self, ltpage: LTPage) -> None:
          """Remember ``ltpage`` as the current result."""
          self.result = ltpage

      def get_result(self) -> LTPage:
          """Return the stored page; asserts that a page has been received."""
          assert self.result is not None
          return self.result
  # Some PDFConverter children support only binary I/O
  IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)


  class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
      """Layout analyzer bound to an output stream and a codec."""

      def __init__(
          self,
          rsrcmgr: PDFResourceManager,
          outfp: IOType,
          codec: str = "utf-8",
          pageno: int = 1,
          laparams: Optional[LAParams] = None,
      ) -> None:
          """Store the output stream and record whether it is binary.

          :param outfp: stream that subclasses write their output to.
          :param codec: codec name stored for subclasses to use when writing.
          """
          PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
          self.outfp: IOType = outfp
          self.codec = codec
          self.outfp_binary = self._is_binary_stream(self.outfp)

      @staticmethod
      def _is_binary_stream(outfp: AnyIO) -> bool:
          """Test if a stream is binary or not.

          A ``mode`` attribute decides when present ("b" means binary).
          Otherwise the stream's type is inspected, and any unrecognised object
          is treated as binary.
          """
          if "b" in getattr(outfp, "mode", ""):
              return True
          if hasattr(outfp, "mode"):
              # The output stream has a mode, but it does not contain 'b'.
              return False
          if isinstance(outfp, io.BytesIO):
              return True
          if isinstance(outfp, (io.StringIO, io.TextIOBase)):
              return False
          return True
  class TextConverter(PDFConverter[AnyIO]):
      """Converter that re-typesets a page with translated text.

      ``receive_layout`` groups the characters of a page into paragraphs and
      formulas, translates the paragraphs and returns a string of PDF content
      stream operators that draws the translated text, the original formula
      glyphs and the collected lines.

      ``self.fontmap`` is read while encoding text but is never assigned in
      this class.  NOTE(review): presumably the caller attaches it before pages
      are processed -- confirm.
      """

      def __init__(
          self,
          rsrcmgr: PDFResourceManager,
          outfp: AnyIO,
          codec: str = "utf-8",
          pageno: int = 1,
          laparams: Optional[LAParams] = None,
          showpageno: bool = False,
          imagewriter: Optional[ImageWriter] = None,
          vfont: Optional[str] = None,
          vchar: Optional[str] = None,
          thread: int = 0,
          layout=None,
      ) -> None:
          """Store the translation and layout options.

          :param vfont: regular expression matched against font names to detect
              formula fonts; a built-in pattern is used when empty.
          :param vchar: regular expression matched against character text to
              detect formula characters.
          :param thread: number of translation worker threads; values below one
              are treated as one.
          :param layout: mapping from page id to the formula boxes of that
              page.  Defaults to a fresh empty dict.
          """
          super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
          self.showpageno = showpageno
          self.imagewriter = imagewriter
          self.vfont = vfont
          self.vchar = vchar
          self.thread = thread
          # Test against None (not truthiness) so that an empty dict supplied
          # by the caller is kept by identity and can still be filled later,
          # while the default is no longer one dict shared by all instances.
          self.layout = {} if layout is None else layout

      def write_text(self, text: str) -> None:
          """Write ``text`` to the output stream, encoding it for binary streams."""
          text = utils.compatible_encode_method(text, self.codec, "ignore")
          if self.outfp_binary:
              cast(BinaryIO, self.outfp).write(text.encode())
          else:
              cast(TextIO, self.outfp).write(text)

      def _is_formula_char(self, font: str, char: str) -> bool:
          """Tell whether a font name / character text pair belongs to a formula."""
          if self.vfont:
              if re.match(self.vfont, font):
                  return True
          elif re.match(
              r'.*\+(CM.*|MS.*|XY.*|MT.*|BL.*|.*0700|.*0500|.*Italic)', font
          ):
              return True
          if self.vchar and re.match(self.vchar, char):
              return True
          return False

      def _hex_string(self, font_id, text: str) -> str:
          """Hex-encode ``text``: four digits per character for CID fonts, else two."""
          width = 4 if isinstance(self.fontmap[font_id], PDFCIDFont) else 2
          return "".join(f"{ord(c):0{width}x}" for c in text)

      def _translate_paragraphs(self, paragraphs: List[str]) -> List[str]:
          """Translate every paragraph, in order, using a thread pool and the cache."""
          hash_key = cache.deterministic_hash("PDFMathTranslate")
          cache.create_cache(hash_key)

          # NOTE(review): a bare @retry is kept from the original; it appears to
          # retry without a stop condition, so a permanently failing translator
          # would never return -- confirm that this is intended.
          @retry
          def worker(s: str) -> str:
              try:
                  # Strings with at most one lowercase letter are left as they are.
                  if sum(map(str.islower, s)) <= 1:
                      return s
                  hash_key_paragraph = cache.deterministic_hash(s)
                  new = cache.load_paragraph(hash_key, hash_key_paragraph)
                  if new is None:
                      new = translator.translate(s, 'zh-CN', 'en')
                      new = remove_control_characters(new)
                      cache.write_paragraph(hash_key, hash_key_paragraph, new)
                  return new
              except Exception as e:
                  log.exception(e, exc_info=False)
                  raise

          # ThreadPoolExecutor rejects max_workers <= 0, which is what the
          # default ``thread=0`` used to pass.
          workers = max(1, self.thread or 0)
          with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
              return list(executor.map(worker, paragraphs))

      def receive_layout(self, ltpage: LTPage) -> str:
          """Build the content-stream operators that redraw ``ltpage`` translated.

          The page is processed in three steps: characters are grouped into
          paragraphs (with ``$vN$`` markers standing in for formulas), the
          paragraphs are translated, and the translated text is laid out again
          together with the original formula glyphs and lines.

          :return: a string of the form ``"BT ... ET "``.
          """
          # ---- step 1: split the page into paragraphs, formulas and lines ----
          xt = None        # previously processed character
          xt_ind = False   # whether ``xt`` was inside a formula box of the layout
          lt = None        # character currently defining the paragraph's left edge
          rt = None        # character currently defining the paragraph's right edge
          sstk = []        # paragraph strings, formulas replaced by $vN$ markers
          pstk = []        # per paragraph: [y0, x0, left, right, size, font, has_linebreak]
          vstk = []        # characters of the formula being collected
          vlstk = []       # lines of the formula being collected
          var = []         # finished formulas, each a list of characters
          varl = []        # lines of each finished formula
          lstk = []        # lines that belong to the page, not to a formula

          try:
              formula_boxes = self.layout[ltpage.pageid]
          except (KeyError, IndexError):
              # No layout information for this page.
              formula_boxes = ()

          for child in list(ltpage):
              if isinstance(child, LTChar):
                  cur_v = self._is_formula_char(child.fontname, child.get_text())
                  ind_v = False
                  for box in formula_boxes:  # standalone formula regions
                      b = box.block
                      if (
                          child.x1 > b.x_1
                          and child.x0 < b.x_2
                          and child.y1 > ltpage.height - b.y_2
                          and child.y0 < ltpage.height - b.y_1
                      ):
                          cur_v = True
                          ind_v = True
                          break
                  # Close the open formula when text follows, when a standalone
                  # formula starts, or when the formula wraps to a new line.
                  if vstk and (
                      not cur_v
                      or (ind_v and not xt_ind)
                      or child.x0 < vstk[-1].x1 - ltpage.width / 3
                  ):
                      sstk[-1] += f'$v{len(var)}$'
                      var.append(vstk)
                      varl.append(vlstk)
                      vstk = []
                      vlstk = []
                  if not vstk:  # plain text, or the first character of a formula
                      new_paragraph = True
                      if (
                          not ind_v
                          and xt is not None
                          and child.y1 > xt.y0 - child.size * 0.5
                          and child.y0 < xt.y1 + child.size
                      ):  # not a standalone formula and vertically close to xt
                          new_paragraph = False
                          if child.x0 > xt.x1 + child.size * 2:
                              # Large gap within the line: separate paragraph.
                              new_paragraph = True
                          elif child.x0 > xt.x1 + 1:
                              # Small gap within the line: a space.
                              sstk[-1] += ' '
                          elif child.x1 < xt.x0:
                              # Line break (modifier glyphs may also end up here).
                              if (
                                  child.x0 < lt.x0 - child.size * 2
                                  or child.x0 > lt.x0 + child.size * 1
                              ):
                                  # Starts too far from the left edge: new paragraph.
                                  new_paragraph = True
                              else:
                                  sstk[-1] += ' '
                                  # Remember that the source paragraph wrapped.
                                  pstk[-1][6] = True
                      if new_paragraph:
                          lt, rt = child, child
                          sstk.append("")
                          pstk.append(
                              [
                                  child.y0,
                                  child.x0,
                                  child.x0,
                                  child.x0,
                                  child.size,
                                  child.font,
                                  False,
                              ]
                          )
                  if cur_v:
                      vstk.append(child)
                  else:
                      sstk[-1] += child.get_text()
                      # The paragraph started with a formula and text follows:
                      # switch the paragraph font to the text font.
                      if self._is_formula_char(pstk[-1][5].fontname, ''):
                          pstk[-1][5] = child.font
                  xt = child
                  xt_ind = ind_v
                  # Widen the paragraph's horizontal extent.
                  if child.x0 < lt.x0:
                      pstk[-1][2] = child.x0
                      lt = child
                  if child.x1 > rt.x1:
                      pstk[-1][3] = child.x1
                      rt = child
              elif isinstance(child, LTLine):
                  if vstk and child.x1 - child.x0 < ltpage.width / 3:
                      vlstk.append(child)  # short line inside an open formula
                  else:
                      lstk.append(child)   # page-level line
              # Figures and every other item type are ignored.

          # A formula still open at the end of the page is closed here, so it is
          # kept whatever the type of the last item on the page is.
          if vstk:
              sstk[-1] += f'$v{len(var)}$'
              var.append(vstk)
              varl.append(vlstk)

          log.debug('\n==========[VSTACK]==========\n')
          vlen = []  # width of every formula
          for vid_, v in enumerate(var):
              width = max([vch.x1 for vch in v]) - v[0].x0
              log.debug(f'< {width:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[vid_])} > $v{vid_}$ = {"".join([ch.get_text() for ch in v])}')
              vlen.append(width)

          # ---- step 2: translate the paragraphs ----
          log.debug('\n==========[SSTACK]==========\n')
          news = self._translate_paragraphs(sstk)

          # ---- step 3: lay out the translated text and the formulas ----
          marker_re = re.compile(r'\$\s*v([\d\s]*)\$')  # matches a $vN$ marker
          out = []
          for pid, new in enumerate(news):
              y, x, left, right, size, font, lb = pstk[pid]
              tx = x         # x position where the buffered text starts
              cstk = ''      # text buffered for the current font
              fcur = None    # font id of the buffered text
              fcur_ = None   # font id required by the current character
              ptr = 0
              log.debug(f"< {y} {x} {left} {right} {size} {font.fontname} {lb} > {sstk[pid]} | {new}")
              while ptr < len(new):
                  marker = marker_re.match(new, ptr)
                  vid = None
                  if marker:
                      digits = "".join(marker.group(1).split())
                      if digits:
                          vid = int(digits)
                      else:
                          # "$v$" without an index is not a marker; treat it
                          # as ordinary text.
                          marker = None
                  if marker:  # formula
                      ptr = marker.end()
                      if vid >= len(vlen):
                          # The translator may invent an out-of-range marker.
                          continue
                      adv = vlen[vid]
                  else:  # text
                      ch = new[ptr]
                      if font.char_width(ord(ch)):
                          fcur_ = font.fontid
                      elif ch == ' ':
                          fcur_ = 'helv'  # half-width space
                      else:
                          fcur_ = 'china-ss'
                      adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                      ptr += 1
                  # Write the buffer out when the font changes, a formula is
                  # inserted or the right edge is reached.
                  if fcur_ != fcur or marker or x + adv > right:
                      if cstk:
                          out.append(f'/{fcur} {size} Tf 1 0 0 1 {tx} {y} Tm [<{self._hex_string(fcur, cstk)}>] TJ ')
                          cstk = ''
                  # Wrap only if the source paragraph itself was wrapped.
                  if lb and x + adv > right:
                      x = left
                      y -= size * 1.5
                  if marker:  # place the formula
                      head = var[vid][0]
                      fix = 0
                      if fcur is not None:  # vertical correction inside a paragraph
                          if re.match(r'.*\+(CMEX.*)', head.fontname) and head.cid in [80, 88, 112, 33, 82]:  # radicals, integrals, sums
                              fix = head.size * 0.85
                          if re.match(r'.*\+(CMSY.*)', head.fontname) and head.cid in [112]:  # radical
                              fix = head.size * 0.85
                          if re.match(r'.*\+(MSAM.*)', head.fontname) and head.cid in [97]:  # special superscript
                              fix = head.size * 0.85
                          if re.match(r'.*\+(CMR.*)', head.fontname) and head.cid in [94, 126]:  # special superscript
                              fix = head.size * 0.25
                          if re.match(r'.*\+(CM.*)7', head.fontname):  # fraction
                              fix = head.size * 0.55
                      for vch in var[vid]:  # formula glyphs
                          vc = chr(vch.cid)
                          out.append(f"/{vch.font.fontid} {vch.size} Tf 1 0 0 1 {x+vch.x0-head.x0} {fix+y+vch.y0-head.y0} Tm [<{self._hex_string(vch.font.fontid, vc)}>] TJ ")
                      for l in varl[vid]:  # formula lines
                          if l.linewidth < 5:  # hack
                              out.append(f"ET q 1 0 0 1 {l.pts[0][0]+x-head.x0} {l.pts[0][1]+fix+y-head.y0} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT ")
                  elif cstk:
                      cstk += ch
                  else:
                      tx = x
                      if x == left and ch == ' ':
                          # Drop the space left over from a line break.
                          adv = 0
                      else:
                          cstk += ch
                  fcur = fcur_
                  x += adv
              if cstk:  # end of paragraph: write what is left in the buffer
                  out.append(f'/{fcur} {size} Tf 1 0 0 1 {tx} {y} Tm [<{self._hex_string(fcur, cstk)}>] TJ ')

          for l in lstk:  # page-level lines
              if l.linewidth < 5:  # hack
                  out.append(f"ET q 1 0 0 1 {l.pts[0][0]} {l.pts[0][1]} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT ")

          ops = "".join(out)
          return f'BT {ops}ET '

      # Images are only recorded when an image writer is configured.
      # ``paint_path`` is deliberately not overridden: ``receive_layout`` relies
      # on the ``LTLine`` objects it produces.
      def render_image(self, name: str, stream: PDFStream) -> None:
          """Record the image only if an image writer was supplied."""
          if self.imagewriter is not None:
              PDFConverter.render_image(self, name, stream)
  class HTMLConverter(PDFConverter[AnyIO]):
      """Converter that writes each page as absolutely positioned HTML."""

      # Colours merged into ``rect_colors`` / ``text_colors`` when ``debug`` is set.
      RECT_COLORS = {
          "figure": "yellow",
          "textline": "magenta",
          "textbox": "cyan",
          "textgroup": "red",
          "curve": "black",
          "page": "gray",
      }
      TEXT_COLORS = {
          "textbox": "blue",
          "char": "black",
      }

      def __init__(
          self,
          rsrcmgr: PDFResourceManager,
          outfp: AnyIO,
          codec: str = "utf-8",
          pageno: int = 1,
          laparams: Optional[LAParams] = None,
          scale: float = 1,
          fontscale: float = 1.0,
          layoutmode: str = "normal",
          showpageno: bool = True,
          pagemargin: int = 50,
          imagewriter: Optional[ImageWriter] = None,
          debug: int = 0,
          rect_colors: Optional[Dict[str, str]] = None,
          text_colors: Optional[Dict[str, str]] = None,
      ) -> None:
          """Validate the stream/codec pairing, store the options, write the header.

          :raises PDFValueError: if a binary stream is given without a codec, or
              a text stream is given with one.
          """
          PDFConverter.__init__(
              self,
              rsrcmgr,
              outfp,
              codec=codec,
              pageno=pageno,
              laparams=laparams,
          )

          # write() assumes a codec for binary I/O, or no codec for text I/O.
          if self.outfp_binary and not self.codec:
              raise PDFValueError("Codec is required for a binary I/O output")
          if not self.outfp_binary and self.codec:
              raise PDFValueError("Codec must not be specified for a text I/O output")

          self.scale = scale
          self.fontscale = fontscale
          self.layoutmode = layoutmode
          self.showpageno = showpageno
          self.pagemargin = pagemargin
          self.imagewriter = imagewriter
          # Work on copies so that the ``debug`` update below never modifies
          # dictionaries owned by the caller.
          self.rect_colors = (
              {"curve": "black", "page": "gray"}
              if rect_colors is None
              else dict(rect_colors)
          )
          self.text_colors = (
              {"char": "black"} if text_colors is None else dict(text_colors)
          )
          if debug:
              self.rect_colors.update(self.RECT_COLORS)
              self.text_colors.update(self.TEXT_COLORS)
          # Vertical offset of the current page within the HTML document.
          self._yoffset: float = self.pagemargin
          # Font of the currently open <span>, and the fonts suspended by
          # enclosing <div> elements.
          self._font: Optional[Tuple[str, float]] = None
          self._fontstack: List[Optional[Tuple[str, float]]] = []
          self.write_header()

      def write(self, text: str) -> None:
          """Write ``text``, encoded with the codec when one is configured."""
          if self.codec:
              cast(BinaryIO, self.outfp).write(text.encode(self.codec))
          else:
              cast(TextIO, self.outfp).write(text)

      def write_header(self) -> None:
          """Write the opening HTML tags, including the content-type meta tag."""
          self.write("<html><head>\n")
          if self.codec:
              s = (
                  '<meta http-equiv="Content-Type" content="text/html; '
                  'charset=%s">\n' % self.codec
              )
          else:
              s = '<meta http-equiv="Content-Type" content="text/html">\n'
          self.write(s)
          self.write("</head><body>\n")

      def write_footer(self) -> None:
          """Write the list of page links and the closing HTML tags."""
          page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
          s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
              page_links,
          )
          self.write(s)
          self.write("</body></html>\n")

      def write_text(self, text: str) -> None:
          """Write ``text`` after passing it through ``enc``."""
          self.write(enc(text))

      def place_rect(
          self,
          color: str,
          borderwidth: int,
          x: float,
          y: float,
          w: float,
          h: float,
      ) -> None:
          """Write a bordered box, unless ``color`` has no entry in ``rect_colors``."""
          color2 = self.rect_colors.get(color)
          if color2 is None:
              return
          s = (
              '<span style="position:absolute; border: %s %dpx solid; '
              'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
              % (
                  color2,
                  borderwidth,
                  x * self.scale,
                  (self._yoffset - y) * self.scale,
                  w * self.scale,
                  h * self.scale,
              )
          )
          self.write(s)

      def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
          """Write a bordered box around the bounding box of ``item``."""
          self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

      def place_image(
          self,
          item: LTImage,
          borderwidth: int,
          x: float,
          y: float,
          w: float,
          h: float,
      ) -> None:
          """Export the image and write an <img> tag; no-op without an image writer."""
          if self.imagewriter is None:
              return
          name = self.imagewriter.export_image(item)
          s = (
              '<img src="%s" border="%d" style="position:absolute; '
              'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
              % (
                  enc(name),
                  borderwidth,
                  x * self.scale,
                  (self._yoffset - y) * self.scale,
                  w * self.scale,
                  h * self.scale,
              )
          )
          self.write(s)

      def place_text(
          self,
          color: str,
          text: str,
          x: float,
          y: float,
          size: float,
      ) -> None:
          """Write positioned text, unless ``color`` has no entry in ``text_colors``."""
          color2 = self.text_colors.get(color)
          if color2 is None:
              return
          s = (
              '<span style="position:absolute; color:%s; left:%dpx; '
              'top:%dpx; font-size:%dpx;">'
              % (
                  color2,
                  x * self.scale,
                  (self._yoffset - y) * self.scale,
                  size * self.scale * self.fontscale,
              )
          )
          self.write(s)
          self.write_text(text)
          self.write("</span>\n")

      def begin_div(
          self,
          color: str,
          borderwidth: int,
          x: float,
          y: float,
          w: float,
          h: float,
          writing_mode: str = "False",
      ) -> None:
          """Open a positioned <div> and suspend the current font."""
          self._fontstack.append(self._font)
          self._font = None
          s = (
              '<div style="position:absolute; border: %s %dpx solid; '
              "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
              'height:%dpx;">'
              % (
                  color,
                  borderwidth,
                  writing_mode,
                  x * self.scale,
                  (self._yoffset - y) * self.scale,
                  w * self.scale,
                  h * self.scale,
              )
          )
          self.write(s)

      def end_div(self, color: str) -> None:
          """Close any open font <span>, restore the suspended font, close the <div>."""
          if self._font is not None:
              self.write("</span>")
          self._font = self._fontstack.pop()
          self.write("</div>")

      def put_text(self, text: str, fontname: str, fontsize: float) -> None:
          """Write ``text``, opening a new font <span> whenever the font changes."""
          font = (fontname, fontsize)
          if font != self._font:
              if self._font is not None:
                  self.write("</span>")
              # Remove subset tag from fontname, see PDF Reference 5.5.3
              fontname_without_subset_tag = fontname.split("+")[-1]
              # The name comes from the document and is placed inside a quoted
              # attribute, so it is escaped like the image name in place_image.
              self.write(
                  '<span style="font-family: %s; font-size:%dpx">'
                  % (
                      enc(fontname_without_subset_tag),
                      fontsize * self.scale * self.fontscale,
                  ),
              )
              self._font = font
          self.write_text(text)

      def put_newline(self) -> None:
          """Write a line break."""
          self.write("<br>")

      def receive_layout(self, ltpage: LTPage) -> None:
          """Write the HTML for one page and advance the vertical offset."""

          def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
              # Draw the border of a text group and of the groups nested in it.
              if isinstance(item, LTTextGroup):
                  self.place_border("textgroup", 1, item)
                  for child in item:
                      show_group(child)

          def render(item: LTItem) -> None:
              # Write ``item`` and, recursively, everything it contains.
              child: LTItem
              if isinstance(item, LTPage):
                  self._yoffset += item.y1
                  self.place_border("page", 1, item)
                  if self.showpageno:
                      self.write(
                          '<div style="position:absolute; top:%dpx;">'
                          % ((self._yoffset - item.y1) * self.scale),
                      )
                      self.write(
                          f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                      )
                  for child in item:
                      render(child)
                  if item.groups is not None:
                      for group in item.groups:
                          show_group(group)
              elif isinstance(item, LTCurve):
                  self.place_border("curve", 1, item)
              elif isinstance(item, LTFigure):
                  self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                  for child in item:
                      render(child)
                  self.end_div("figure")
              elif isinstance(item, LTImage):
                  self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
              elif self.layoutmode == "exact":
                  # Exact mode: every text item is positioned individually.
                  if isinstance(item, LTTextLine):
                      self.place_border("textline", 1, item)
                      for child in item:
                          render(child)
                  elif isinstance(item, LTTextBox):
                      self.place_border("textbox", 1, item)
                      self.place_text(
                          "textbox",
                          str(item.index + 1),
                          item.x0,
                          item.y1,
                          20,
                      )
                      for child in item:
                          render(child)
                  elif isinstance(item, LTChar):
                      self.place_border("char", 1, item)
                      self.place_text(
                          "char",
                          item.get_text(),
                          item.x0,
                          item.y1,
                          item.size,
                      )
              elif isinstance(item, LTTextLine):
                  for child in item:
                      render(child)
                  if self.layoutmode != "loose":
                      self.put_newline()
              elif isinstance(item, LTTextBox):
                  self.begin_div(
                      "textbox",
                      1,
                      item.x0,
                      item.y1,
                      item.width,
                      item.height,
                      item.get_writing_mode(),
                  )
                  for child in item:
                      render(child)
                  self.end_div("textbox")
              elif isinstance(item, LTChar):
                  fontname = make_compat_str(item.fontname)
                  self.put_text(item.get_text(), fontname, item.size)
              elif isinstance(item, LTText):
                  self.write_text(item.get_text())

          render(ltpage)
          self._yoffset += self.pagemargin

      def close(self) -> None:
          """Write the footer that ends the document."""
          self.write_footer()
  851. class XMLConverter(PDFConverter[AnyIO]):
  852. CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
  853. def __init__(
  854. self,
  855. rsrcmgr: PDFResourceManager,
  856. outfp: AnyIO,
  857. codec: str = "utf-8",
  858. pageno: int = 1,
  859. laparams: Optional[LAParams] = None,
  860. imagewriter: Optional[ImageWriter] = None,
  861. stripcontrol: bool = False,
  862. ) -> None:
  863. PDFConverter.__init__(
  864. self,
  865. rsrcmgr,
  866. outfp,
  867. codec=codec,
  868. pageno=pageno,
  869. laparams=laparams,
  870. )
  871. # write() assumes a codec for binary I/O, or no codec for text I/O.
  872. if self.outfp_binary == (not self.codec):
  873. raise PDFValueError("Codec is required for a binary I/O output")
  874. self.imagewriter = imagewriter
  875. self.stripcontrol = stripcontrol
  876. self.write_header()
  877. def write(self, text: str) -> None:
  878. if self.codec:
  879. cast(BinaryIO, self.outfp).write(text.encode(self.codec))
  880. else:
  881. cast(TextIO, self.outfp).write(text)
  882. def write_header(self) -> None:
  883. if self.codec:
  884. self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
  885. else:
  886. self.write('<?xml version="1.0" ?>\n')
  887. self.write("<pages>\n")
  888. def write_footer(self) -> None:
  889. self.write("</pages>\n")
  890. def write_text(self, text: str) -> None:
  891. if self.stripcontrol:
  892. text = self.CONTROL.sub("", text)
  893. self.write(enc(text))
  894. def receive_layout(self, ltpage: LTPage) -> None:
  895. def show_group(item: LTItem) -> None:
  896. if isinstance(item, LTTextBox):
  897. self.write(
  898. '<textbox id="%d" bbox="%s" />\n'
  899. % (item.index, bbox2str(item.bbox)),
  900. )
  901. elif isinstance(item, LTTextGroup):
  902. self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
  903. for child in item:
  904. show_group(child)
  905. self.write("</textgroup>\n")
  906. def render(item: LTItem) -> None:
  907. child: LTItem
  908. if isinstance(item, LTPage):
  909. s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
  910. item.pageid,
  911. bbox2str(item.bbox),
  912. item.rotate,
  913. )
  914. self.write(s)
  915. for child in item:
  916. render(child)
  917. if item.groups is not None:
  918. self.write("<layout>\n")
  919. for group in item.groups:
  920. show_group(group)
  921. self.write("</layout>\n")
  922. self.write("</page>\n")
  923. elif isinstance(item, LTLine):
  924. s = '<line linewidth="%d" bbox="%s" />\n' % (
  925. item.linewidth,
  926. bbox2str(item.bbox),
  927. )
  928. self.write(s)
  929. elif isinstance(item, LTRect):
  930. s = '<rect linewidth="%d" bbox="%s" />\n' % (
  931. item.linewidth,
  932. bbox2str(item.bbox),
  933. )
  934. self.write(s)
  935. elif isinstance(item, LTCurve):
  936. s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
  937. item.linewidth,
  938. bbox2str(item.bbox),
  939. item.get_pts(),
  940. )
  941. self.write(s)
  942. elif isinstance(item, LTFigure):
  943. s = f'<figure name="{item.name}" bbox="{bbox2str(item.bbox)}">\n'
  944. self.write(s)
  945. for child in item:
  946. render(child)
  947. self.write("</figure>\n")
  948. elif isinstance(item, LTTextLine):
  949. self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
  950. for child in item:
  951. render(child)
  952. self.write("</textline>\n")
  953. elif isinstance(item, LTTextBox):
  954. wmode = ""
  955. if isinstance(item, LTTextBoxVertical):
  956. wmode = ' wmode="vertical"'
  957. s = '<textbox id="%d" bbox="%s"%s>\n' % (
  958. item.index,
  959. bbox2str(item.bbox),
  960. wmode,
  961. )
  962. self.write(s)
  963. for child in item:
  964. render(child)
  965. self.write("</textbox>\n")
  966. elif isinstance(item, LTChar):
  967. s = (
  968. '<text font="%s" bbox="%s" colourspace="%s" '
  969. 'ncolour="%s" size="%.3f">'
  970. % (
  971. enc(item.fontname),
  972. bbox2str(item.bbox),
  973. item.ncs.name,
  974. item.graphicstate.ncolor,
  975. item.size,
  976. )
  977. )
  978. self.write(s)
  979. self.write_text(item.get_text())
  980. self.write("</text>\n")
  981. elif isinstance(item, LTText):
  982. self.write("<text>%s</text>\n" % item.get_text())
  983. elif isinstance(item, LTImage):
  984. if self.imagewriter is not None:
  985. name = self.imagewriter.export_image(item)
  986. self.write(
  987. '<image src="%s" width="%d" height="%d" />\n'
  988. % (enc(name), item.width, item.height),
  989. )
  990. else:
  991. self.write(
  992. '<image width="%d" height="%d" />\n'
  993. % (item.width, item.height),
  994. )
  995. else:
  996. assert False, str(("Unhandled", item))
  997. render(ltpage)
  998. def close(self) -> None:
  999. self.write_footer()
  1000. class HOCRConverter(PDFConverter[AnyIO]):
  1001. """Extract an hOCR representation from explicit text information within a PDF."""
  1002. # Where text is being extracted from a variety of types of PDF within a
  1003. # business process, those PDFs where the text is only present in image
  1004. # form will need to be analysed using an OCR tool which will typically
  1005. # output hOCR. This converter extracts the explicit text information from
  1006. # those PDFs that do have it and uses it to genxerate a basic hOCR
  1007. # representation that is designed to be used in conjunction with the image
  1008. # of the PDF in the same way as genuine OCR output would be, but without the
  1009. # inevitable OCR errors.
  1010. # The converter does not handle images, diagrams or text colors.
  1011. # In the examples processed by the contributor it was necessary to set
  1012. # LAParams.all_texts to True.
  1013. CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
  1014. def __init__(
  1015. self,
  1016. rsrcmgr: PDFResourceManager,
  1017. outfp: AnyIO,
  1018. codec: str = "utf8",
  1019. pageno: int = 1,
  1020. laparams: Optional[LAParams] = None,
  1021. stripcontrol: bool = False,
  1022. ):
  1023. PDFConverter.__init__(
  1024. self,
  1025. rsrcmgr,
  1026. outfp,
  1027. codec=codec,
  1028. pageno=pageno,
  1029. laparams=laparams,
  1030. )
  1031. self.stripcontrol = stripcontrol
  1032. self.within_chars = False
  1033. self.write_header()
  1034. def bbox_repr(self, bbox: Rect) -> str:
  1035. (in_x0, in_y0, in_x1, in_y1) = bbox
  1036. # PDF y-coordinates are the other way round from hOCR coordinates
  1037. out_x0 = int(in_x0)
  1038. out_y0 = int(self.page_bbox[3] - in_y1)
  1039. out_x1 = int(in_x1)
  1040. out_y1 = int(self.page_bbox[3] - in_y0)
  1041. return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"
  1042. def write(self, text: str) -> None:
  1043. if self.codec:
  1044. encoded_text = text.encode(self.codec)
  1045. cast(BinaryIO, self.outfp).write(encoded_text)
  1046. else:
  1047. cast(TextIO, self.outfp).write(text)
  1048. def write_header(self) -> None:
  1049. if self.codec:
  1050. self.write(
  1051. "<html xmlns='http://www.w3.org/1999/xhtml' "
  1052. "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
  1053. )
  1054. else:
  1055. self.write(
  1056. "<html xmlns='http://www.w3.org/1999/xhtml' "
  1057. "xml:lang='en' lang='en'>\n",
  1058. )
  1059. self.write("<head>\n")
  1060. self.write("<title></title>\n")
  1061. self.write(
  1062. "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
  1063. )
  1064. self.write(
  1065. "<meta name='ocr-system' content='pdf2zh.six HOCR Converter' />\n",
  1066. )
  1067. self.write(
  1068. " <meta name='ocr-capabilities'"
  1069. " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
  1070. )
  1071. self.write("</head>\n")
  1072. self.write("<body>\n")
  1073. def write_footer(self) -> None:
  1074. self.write("<!-- comment in the following line to debug -->\n")
  1075. self.write(
  1076. "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
  1077. )
  1078. def write_text(self, text: str) -> None:
  1079. if self.stripcontrol:
  1080. text = self.CONTROL.sub("", text)
  1081. self.write(text)
  1082. def write_word(self) -> None:
  1083. if len(self.working_text) > 0:
  1084. bold_and_italic_styles = ""
  1085. if "Italic" in self.working_font:
  1086. bold_and_italic_styles = "font-style: italic; "
  1087. if "Bold" in self.working_font:
  1088. bold_and_italic_styles += "font-weight: bold; "
  1089. self.write(
  1090. "<span style='font:\"%s\"; font-size:%d; %s' "
  1091. "class='ocrx_word' title='%s; x_font %s; "
  1092. "x_fsize %d'>%s</span>"
  1093. % (
  1094. (
  1095. self.working_font,
  1096. self.working_size,
  1097. bold_and_italic_styles,
  1098. self.bbox_repr(self.working_bbox),
  1099. self.working_font,
  1100. self.working_size,
  1101. self.working_text.strip(),
  1102. )
  1103. ),
  1104. )
  1105. self.within_chars = False
  1106. def receive_layout(self, ltpage: LTPage) -> None:
  1107. def render(item: LTItem) -> None:
  1108. if self.within_chars and isinstance(item, LTAnno):
  1109. self.write_word()
  1110. if isinstance(item, LTPage):
  1111. self.page_bbox = item.bbox
  1112. self.write(
  1113. "<div class='ocr_page' id='%s' title='%s'>\n"
  1114. % (item.pageid, self.bbox_repr(item.bbox)),
  1115. )
  1116. for child in item:
  1117. render(child)
  1118. self.write("</div>\n")
  1119. elif isinstance(item, LTTextLine):
  1120. self.write(
  1121. "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
  1122. )
  1123. for child_line in item:
  1124. render(child_line)
  1125. self.write("</span>\n")
  1126. elif isinstance(item, LTTextBox):
  1127. self.write(
  1128. "<div class='ocr_block' id='%d' title='%s'>\n"
  1129. % (item.index, self.bbox_repr(item.bbox)),
  1130. )
  1131. for child in item:
  1132. render(child)
  1133. self.write("</div>\n")
  1134. elif isinstance(item, LTChar):
  1135. if not self.within_chars:
  1136. self.within_chars = True
  1137. self.working_text = item.get_text()
  1138. self.working_bbox = item.bbox
  1139. self.working_font = item.fontname
  1140. self.working_size = item.size
  1141. elif len(item.get_text().strip()) == 0:
  1142. self.write_word()
  1143. self.write(item.get_text())
  1144. else:
  1145. if (
  1146. self.working_bbox[1] != item.bbox[1]
  1147. or self.working_font != item.fontname
  1148. or self.working_size != item.size
  1149. ):
  1150. self.write_word()
  1151. self.working_bbox = item.bbox
  1152. self.working_font = item.fontname
  1153. self.working_size = item.size
  1154. self.working_text += item.get_text()
  1155. self.working_bbox = (
  1156. self.working_bbox[0],
  1157. self.working_bbox[1],
  1158. item.bbox[2],
  1159. self.working_bbox[3],
  1160. )
  1161. render(ltpage)
  1162. def close(self) -> None:
  1163. self.write_footer()