Byaidu 1 年之前
父节点
当前提交
ac2e14192c
共有 12 个文件被更改，包括 87 次插入和 84 次删除
  1. +1 -1
      pdf2zh/__init__.py
  2. +4 -4
      pdf2zh/cmapdb.py
  3. +7 -6
      pdf2zh/converter.py
  4. +2 -1
      pdf2zh/encodingdb.py
  5. +7 -7
      pdf2zh/lzw.py
  6. +7 -7
      pdf2zh/pdf2zh.py
  7. +16 -16
      pdf2zh/pdfdocument.py
  8. +2 -1
      pdf2zh/pdffont.py
  9. +12 -12
      pdf2zh/pdfinterp.py
  10. +2 -2
      pdf2zh/pdfpage.py
  11. +7 -7
      pdf2zh/pdfparser.py
  12. +20 -20
      pdf2zh/psparser.py

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.0.4"
+__version__ = "1.0.5"
 __author__ = "Byaidu"

+ 4 - 4
pdf2zh/cmapdb.py

@@ -93,7 +93,7 @@ class CMap(CMapBase):
         copy(self.code2cid, cmap.code2cid)
 
     def decode(self, code: bytes) -> Iterator[int]:
-        log.debug("decode: %r, %r", self, code)
+        # log.debug("decode: %r, %r", self, code)
         d = self.code2cid
         for i in iter(code):
             if i in d:
@@ -150,7 +150,7 @@ class UnicodeMap(CMapBase):
         return "<UnicodeMap: %s>" % self.attrs.get("CMapName")
 
     def get_unichr(self, cid: int) -> str:
-        log.debug("get_unichr: %r, %r", self, cid)
+        # log.debug("get_unichr: %r, %r", self, cid)
         return self.cid2unichr[cid]
 
     def dump(self, out: TextIO = sys.stdout) -> None:
@@ -161,7 +161,7 @@ class UnicodeMap(CMapBase):
 class IdentityUnicodeMap(UnicodeMap):
     def get_unichr(self, cid: int) -> str:
         """Interpret character id as unicode codepoint"""
-        log.debug("get_unichr: %r, %r", self, cid)
+        # log.debug("get_unichr: %r, %r", self, cid)
         return chr(cid)
 
 
@@ -233,7 +233,7 @@ class CMapDB:
     def _load_data(cls, name: str) -> Any:
         name = name.replace("\0", "")
         filename = "%s.pickle.gz" % name
-        log.debug("loading: %r", name)
+        # log.debug("loading: %r", name)
         cmap_paths = (
             os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
             os.path.join(os.path.dirname(__file__), "cmap"),

+ 7 - 6
pdf2zh/converter.py

@@ -272,7 +272,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
         return item.adv
 
     def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
-        log.debug("undefined: %r, %r", font, cid)
+        # log.debug("undefined: %r, %r", font, cid)
         return "(cid:%d)" % cid
 
     def receive_layout(self, ltpage: LTPage) -> None:
@@ -435,12 +435,12 @@ class TextConverter(PDFConverter[AnyIO]):
                 else:
                     # print(child)
                     pass
-            print('\n==========[VSTACK]==========\n')
+            log.debug('\n==========[VSTACK]==========\n')
             for id,v in enumerate(var):
                 l=v[-1].x1-v[0].x0
-                print(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
+                log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
                 vlen.append(l)
-            print('\n==========[SSTACK]==========\n')
+            log.debug('\n==========[SSTACK]==========\n')
             hash_key=cache.deterministic_hash("PDFMathTranslate")
             # if cache.is_cached(hash_key):
             #     print('Cache is found')
@@ -459,7 +459,8 @@ class TextConverter(PDFConverter[AnyIO]):
                 return new
             # tqdm with concurrent.futures.ThreadPoolExecutor()
             with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
-                news = list(tqdm.auto.tqdm(executor.map(worker, sstk), total=len(sstk), position=1))
+                # news = list(tqdm.auto.tqdm(executor.map(worker, sstk), total=len(sstk), position=1))
+                news = list(executor.map(worker, sstk))
             for id,new in enumerate(news):
                 x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6];cstk='';fcur=fcur_=None
                 tx=x
@@ -524,7 +525,7 @@ class TextConverter(PDFConverter[AnyIO]):
                             cstk+=ch
                     fcur=fcur_
                     x+=adv
-                print("<",' '.join([f'{j:.1f}' for j in pstk[id][:5]]),pstk[id][5].fontname,pstk[id][6],">",new)
+                log.debug(f"< {' '.join([f'{j:.1f}' for j in pstk[id][:5]])} {pstk[id][5].fontname} {pstk[id][6]} > {new}")
             for l in lstk:
                 ops+=f"ET q 1 0 0 1 {l.pts[0][0]} {l.pts[0][1]} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT "
                 pass

+ 2 - 1
pdf2zh/encodingdb.py

@@ -121,6 +121,7 @@ class EncodingDB:
                     try:
                         cid2unicode[cid] = name2unicode(cast(str, x.name))
                     except (KeyError, ValueError) as e:
-                        log.debug(str(e))
+                        # log.debug(str(e))
+                        pass
                     cid += 1
         return cid2unicode

+ 7 - 7
pdf2zh/lzw.py

@@ -90,13 +90,13 @@ class LZWDecoder:
                 break
             yield x
 
-            logger.debug(
-                "nbits=%d, code=%d, output=%r, table=%r",
-                self.nbits,
-                code,
-                x,
-                self.table[258:],
-            )
+            # logger.debug(
+            #     "nbits=%d, code=%d, output=%r, table=%r",
+            #     self.nbits,
+            #     code,
+            #     x,
+            #     self.table[258:],
+            # )
 
 
 def lzwdecode(data: bytes) -> bytes:

+ 7 - 7
pdf2zh/pdf2zh.py

@@ -110,13 +110,13 @@ def create_parser() -> argparse.ArgumentParser:
     #     action="version",
     #     version=f"pdf2zh.six v{pdf2zh.__version__}",
     # )
-    # parser.add_argument(
-    #     "--debug",
-    #     "-d",
-    #     default=False,
-    #     action="store_true",
-    #     help="Use debug logging level.",
-    # )
+    parser.add_argument(
+        "--debug",
+        "-d",
+        default=False,
+        action="store_true",
+        help="Use debug logging level.",
+    )
     # parser.add_argument(
     #     "--disable-caching",
     #     "-C",

+ 16 - 16
pdf2zh/pdfdocument.py

@@ -171,7 +171,7 @@ class PDFXRef(PDFBaseXRef):
                 if use_b != b"n":
                     continue
                 self.offsets[objid] = (None, int(pos_b), int(genno_b))
-        log.debug("xref objects: %r", self.offsets)
+        # log.debug("xref objects: %r", self.offsets)
         self.load_trailer(parser)
 
     def load_trailer(self, parser: PDFParser) -> None:
@@ -185,7 +185,7 @@ class PDFXRef(PDFBaseXRef):
                 raise PDFNoValidXRef("Unexpected EOF - file corrupted")
             (_, dic) = x[0]
         self.trailer.update(dict_value(dic))
-        log.debug("trailer=%r", self.trailer)
+        # log.debug("trailer=%r", self.trailer)
 
     def get_trailer(self) -> Dict[str, Any]:
         return self.trailer
@@ -213,7 +213,7 @@ class PDFXRefFallback(PDFXRef):
             if line_bytes.startswith(b"trailer"):
                 parser.seek(pos)
                 self.load_trailer(parser)
-                log.debug("trailer: %r", self.trailer)
+                # log.debug("trailer: %r", self.trailer)
                 break
             line = line_bytes.decode("latin-1")  # default pdf encoding
             m = self.PDFOBJ_CUE.match(line)
@@ -277,13 +277,13 @@ class PDFXRefStream(PDFBaseXRef):
         self.data = stream.get_data()
         self.entlen = self.fl1 + self.fl2 + self.fl3
         self.trailer = stream.attrs
-        log.debug(
-            "xref stream: objid=%s, fields=%d,%d,%d",
-            ", ".join(map(repr, self.ranges)),
-            self.fl1,
-            self.fl2,
-            self.fl3,
-        )
+        # log.debug(
+        #     "xref stream: objid=%s, fields=%d,%d,%d",
+        #     ", ".join(map(repr, self.ranges)),
+        #     self.fl1,
+        #     self.fl2,
+        #     self.fl3,
+        # )
 
     def get_trailer(self) -> Dict[str, Any]:
         return self.trailer
@@ -835,7 +835,7 @@ class PDFDocument:
         """
         if not self.xrefs:
             raise PDFException("PDFDocument is not initialized")
-        log.debug("getobj: objid=%r", objid)
+        # log.debug("getobj: objid=%r", objid)
         if objid in self._cached_objs:
             (obj, genno) = self._cached_objs[objid]
         else:
@@ -860,7 +860,7 @@ class PDFDocument:
                     continue
             else:
                 raise PDFObjectNotFound(objid)
-            log.debug("register: objid=%r: %r", objid, obj)
+            # log.debug("register: objid=%r: %r", objid, obj)
             if self.caching:
                 self._cached_objs[objid] = (obj, genno)
         return obj
@@ -953,10 +953,10 @@ class PDFDocument:
         prev = b""
         for line in parser.revreadlines():
             line = line.strip()
-            log.debug("find_xref: %r", line)
+            # log.debug("find_xref: %r", line)
 
             if line == b"startxref":
-                log.debug("xref found: pos=%r", prev)
+                # log.debug("xref found: pos=%r", prev)
 
                 if not prev.isdigit():
                     raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
@@ -987,7 +987,7 @@ class PDFDocument:
             (pos, token) = parser.nexttoken()
         except PSEOF:
             raise PDFNoValidXRef("Unexpected EOF")
-        log.debug("read_xref_from: start=%d, token=%r", start, token)
+        # log.debug("read_xref_from: start=%d, token=%r", start, token)
         if isinstance(token, int):
             # XRefStream: PDF-1.5
             parser.seek(pos)
@@ -1001,7 +1001,7 @@ class PDFDocument:
             xref.load(parser)
         xrefs.append(xref)
         trailer = xref.get_trailer()
-        log.debug("trailer: %r", trailer)
+        # log.debug("trailer: %r", trailer)
         if "XRefStm" in trailer:
             pos = int_value(trailer["XRefStm"])
             self.read_xref_from(parser, pos, xrefs)

+ 2 - 1
pdf2zh/pdffont.py

@@ -141,7 +141,8 @@ class Type1FontHeaderParser(PSStackParser[int]):
             try:
                 self._cid2unicode[cid] = name2unicode(cast(str, name))
             except KeyError as e:
-                log.debug(str(e))
+                # log.debug(str(e))
+                pass
         return self._cid2unicode
 
     def do_keyword(self, pos: int, token: PSKeyword) -> None:

+ 12 - 12
pdf2zh/pdfinterp.py

@@ -207,7 +207,7 @@ class PDFResourceManager:
         if objid and objid in self._cached_fonts:
             font = self._cached_fonts[objid]
         else:
-            log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
+            # log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
             if settings.STRICT:
                 if spec["Type"] is not LITERAL_FONT:
                     raise PDFFontError("Type is not /Font")
@@ -394,7 +394,7 @@ class PDFPageInterpreter:
                 return PREDEFINED_COLORSPACE.get(name)
 
         for k, v in dict_value(resources).items():
-            log.debug("Resource: %r: %r", k, v)
+            # log.debug("Resource: %r: %r", k, v)
             if k == "Font":
                 for fontid, spec in dict_value(v).items():
                     objid = None
@@ -929,7 +929,7 @@ class PDFPageInterpreter:
             if settings.STRICT:
                 raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
             return
-        log.debug("Processing xobj: %r", xobj)
+        # log.debug("Processing xobj: %r", xobj)
         subtype = xobj.get("Subtype")
         if subtype is LITERAL_FORM and "BBox" in xobj:
             interpreter = self.dup()
@@ -959,7 +959,7 @@ class PDFPageInterpreter:
             pass
 
     def process_page(self, page: PDFPage) -> None:
-        log.debug("Processing page: %r", page)
+        # log.debug("Processing page: %r", page)
         (x0, y0, x1, y1) = page.mediabox
         if page.rotate == 90:
             ctm = (0, -1, 1, 0, -y0, x1)
@@ -990,12 +990,12 @@ class PDFPageInterpreter:
 
         This method may be called recursively.
         """
-        log.debug(
-            "render_contents: resources=%r, streams=%r, ctm=%r",
-            resources,
-            streams,
-            ctm,
-        )
+        # log.debug(
+        #     "render_contents: resources=%r, streams=%r, ctm=%r",
+        #     resources,
+        #     streams,
+        #     ctm,
+        # )
         self.init_resources(resources)
         self.init_state(ctm)
         return self.execute(list_value(streams))
@@ -1023,14 +1023,14 @@ class PDFPageInterpreter:
                     nargs = func.__code__.co_argcount - 1
                     if nargs:
                         args = self.pop(nargs)
-                        log.debug("exec: %s %r", name, args)
+                        # log.debug("exec: %s %r", name, args)
                         if len(args) == nargs:
                             func(*args)
                             if not name in ['TJ','Tj','Tm','Td','Tf','BT','ET','l']:
                                 p=" ".join([str(x).replace("\'","") for x in args])
                                 ops+=f'{p} {name} '
                     else:
-                        log.debug("exec: %s", name)
+                        # log.debug("exec: %s", name)
                         func()
                         ops+=f'{name} '
                 elif settings.STRICT:

+ 2 - 2
pdf2zh/pdfpage.py

@@ -126,12 +126,12 @@ class PDFPage:
                 object_type = object_properties.get("type")
 
             if object_type is LITERAL_PAGES and "Kids" in object_properties:
-                log.debug("Pages: Kids=%r", object_properties["Kids"])
+                # log.debug("Pages: Kids=%r", object_properties["Kids"])
                 for child in list_value(object_properties["Kids"]):
                     yield from depth_first_search(child, object_properties, visited)
 
             elif object_type is LITERAL_PAGE:
-                log.debug("Page: %r", object_properties)
+                # log.debug("Page: %r", object_properties)
                 yield (object_id, object_properties)
 
         try:

+ 7 - 7
pdf2zh/pdfparser.py

@@ -113,13 +113,13 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
                     data += line
             self.seek(pos + objlen)
             # XXX limit objlen not to exceed object boundary
-            log.debug(
-                "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
-                pos,
-                objlen,
-                dic,
-                data[:10],
-            )
+            # log.debug(
+            #     "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
+            #     pos,
+            #     objlen,
+            #     dic,
+            #     data[:10],
+            # )
             assert self.doc is not None
             stream = PDFStream(dic, bytes(data), self.doc.decipher)
             self.push((pos, stream))

+ 20 - 20
pdf2zh/psparser.py

@@ -188,12 +188,12 @@ class PSBaseParser:
         if not pos:
             pos = self.bufpos + self.charpos
         self.fp.seek(pos)
-        log.debug("poll(%d): %r", pos, self.fp.read(n))
+        # log.debug("poll(%d): %r", pos, self.fp.read(n))
         self.fp.seek(pos0)
 
     def seek(self, pos: int) -> None:
         """Seeks the parser to the given position."""
-        log.debug("seek: %r", pos)
+        # log.debug("seek: %r", pos)
         self.fp.seek(pos)
         # reset the status for nextline()
         self.bufpos = pos
@@ -240,7 +240,7 @@ class PSBaseParser:
             else:
                 linebuf += self.buf[self.charpos :]
                 self.charpos = len(self.buf)
-        log.debug("nextline: %r, %r", linepos, linebuf)
+        # log.debug("nextline: %r, %r", linepos, linebuf)
 
         return (linepos, linebuf)
 
@@ -506,7 +506,7 @@ class PSBaseParser:
             self.fillbuf()
             self.charpos = self._parse1(self.buf, self.charpos)
         token = self._tokens.pop(0)
-        log.debug("nexttoken: %r", token)
+        # log.debug("nexttoken: %r", token)
         return token
 
 
@@ -550,23 +550,23 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
         return objs
 
     def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
-        try:
-            log.debug("add_results: %r", objs)
-        except Exception:
-            log.debug("add_results: (unprintable object)")
+        # try:
+        #     log.debug("add_results: %r", objs)
+        # except Exception:
+        #     log.debug("add_results: (unprintable object)")
         self.results.extend(objs)
 
     def start_type(self, pos: int, type: str) -> None:
         self.context.append((pos, self.curtype, self.curstack))
         (self.curtype, self.curstack) = (type, [])
-        log.debug("start_type: pos=%r, type=%r", pos, type)
+        # log.debug("start_type: pos=%r, type=%r", pos, type)
 
     def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
         if self.curtype != type:
             raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}")
         objs = [obj for (_, obj) in self.curstack]
         (pos, self.curtype, self.curstack) = self.context.pop()
-        log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
+        # log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
         return (pos, objs)
 
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
@@ -626,12 +626,12 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
                     if settings.STRICT:
                         raise
             elif isinstance(token, PSKeyword):
-                log.debug(
-                    "do_keyword: pos=%r, token=%r, stack=%r",
-                    pos,
-                    token,
-                    self.curstack,
-                )
+                # log.debug(
+                #     "do_keyword: pos=%r, token=%r, stack=%r",
+                #     pos,
+                #     token,
+                #     self.curstack,
+                # )
                 if token.name==b'endobj':
                     end=pos+7
                 self.do_keyword(pos, token)
@@ -649,8 +649,8 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
             else:
                 self.flush()
         obj = self.results.pop(0)
-        try:
-            log.debug("nextobject: %r", obj)
-        except Exception:
-            log.debug("nextobject: (unprintable object)")
+        # try:
+        #     log.debug("nextobject: %r", obj)
+        # except Exception:
+        #     log.debug("nextobject: (unprintable object)")
         return end,obj