1 سال پیش · 953764773e
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ pdf2zh example.pdf -p 1-3,5
 
				 注：从 `\ufb00` 开始是英文风格连字
			
 
				 
			
 
				 ```bash
			
 
				-pdf2zh BDA3.pdf -f ".*+(CM[^RT].*|MS.*|XY.*|MT.*|BL.*|.*0700|.*0500|.*Italic)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])"
			
 
				+pdf2zh BDA3.pdf -f ".*\+(CM[^RT].*|MS.*|XY.*|MT.*|BL.*|.*0700|.*0500|.*Italic)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])"
			
 
				 ```
			
 
				 
			
 
				 ## 致谢
			
--- a/pdf2zh/__init__.py
+++ b/pdf2zh/__init__.py
@@ -1,2 +1,2 @@
 
				-__version__ = "1.2.1"
			
 
				+__version__ = "1.2.2"
			
 
				 __author__ = "Byaidu"
			
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -380,7 +380,7 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                     if re.match(self.vfont,font):
			
 
				                         return True
			
 
				                 else:
			
 
				-                    if re.match(r'.*+(CM.*|MS.*|XY.*|MT.*|BL.*|.*0700|.*0500|.*Italic)',font):
			
 
				+                    if re.match(r'.*\+(CM.*|MS.*|XY.*|MT.*|BL.*|.*0700|.*0500|.*Italic)',font):
			
 
				                         return True
			
 
				                 if self.vchar and re.match(self.vchar,char):
			
 
				                     return True
			
@@ -393,9 +393,9 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                 if isinstance(child, LTChar):
			
 
				                     cur_v=False
			
 
				                     ind_v=False
			
 
				-                    if vflag(child.fontname,child.get_text()):
			
 
				+                    if vflag(child.fontname,child.get_text()): # 识别公式和字符
			
 
				                         cur_v=True
			
 
				-                    for box in self.layout[ltpage.pageid]: # 独立公式
			
 
				+                    for box in self.layout[ltpage.pageid]: # 识别独立公式
			
 
				                         b=box.block
			
 
				                         if child.x1>b.x_1 and child.x0<b.x_2 and child.y1>ltpage.height-b.y_2 and child.y0<ltpage.height-b.y_1:
			
 
				                             cur_v=True
			
@@ -406,7 +406,6 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                     if ptr==len(item)-1 or not cur_v or (ind_v and not xt_ind) or (vstk and child.x0<vstk[-1].x1-ltpage.width/3): # 公式结束或公式换行截断
			
 
				                         if vstk: # 公式出栈
			
 
				                             sstk[-1]+=f'$v{len(var)}$'
			
 
				-                            # print(f'$v{len(var)}$',end='')
			
 
				                             var.append(vstk)
			
 
				                             varl.append(vlstk)
			
 
				                             vstk=[]
			
@@ -416,55 +415,44 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                                 break
			
 
				                     if not vstk: # 非公式或是公式开头
			
 
				                         if not ind_v and xt and child.y1 > xt.y0 - child.size*0.5 and child.y0 < xt.y1 + child.size: # 非独立公式且位于同段落
			
 
				-                            if False and (child.size>xt.size*1.2 or child.size<xt.size*0.8): # 字体分离（处理角标有误，更新pstk会导致段落断开）
			
 
				+                            if child.x0 > xt.x1 + child.size*2: # 行内分离
			
 
				                                 lt,rt=child,child
			
 
				                                 sstk.append("")
			
 
				                                 pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False])
			
 
				-                                # print(f'\n\n[TEXT D] {(child.y0,child.x0,child.size)}')
			
 
				-                            elif child.x0 > xt.x1 + child.size*2: # 行内分离
			
 
				-                                lt,rt=child,child
			
 
				-                                sstk.append("")
			
 
				-                                pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False])
			
 
				-                                # print(f'\n\n[TEXT A] {(child.y0,child.x0,child.size)}')
			
 
				                             elif child.x0 > xt.x1 + 1: # 行内空格
			
 
				                                 sstk[-1]+=' '
			
 
				-                                # print(' ',end='')
			
 
				                             elif child.x1 < xt.x0: # 换行，这里需要考虑一下字母修饰符的情况
			
 
				                                 if child.x0 < lt.x0 - child.size*2 or child.x0 > lt.x0 + child.size*1: # 基于初始位置的行间分离
			
 
				                                     lt,rt=child,child
			
 
				                                     sstk.append("")
			
 
				                                     pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False])
			
 
				-                                    # print(f'\n\n[TEXT B] {(child.y0,child.x0,child.size)}')
			
 
				                                 else: # 换行空格
			
 
				                                     sstk[-1]+=' '
			
 
				                                     pstk[-1][6]=True # 标记原文段落存在换行
			
 
				-                                    # print(' ',end='')
			
 
				                         else: # 基于纵向距离的行间分离
			
 
				                             lt,rt=child,child
			
 
				                             sstk.append("")
			
 
				                             pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False])
			
 
				-                            # print(f'\n\n[TEXT C] {(child.y0,child.x0,child.size)}')
			
 
				                     if not cur_v: # 文字入栈
			
 
				                         sstk[-1]+=child.get_text()
			
 
				-                        # print(child.get_text(),end='')
			
 
				                         if vflag(pstk[-1][5].fontname,''): # 公式开头，后续接文字，需要校正字体
			
 
				                             pstk[-1][5]=child.font
			
 
				                     else: # 公式入栈
			
 
				                         vstk.append(child)
			
 
				-                        if re.match(r'.*\+(CMEX.*)',child.fontname) and child.cid in [40]: # 大括号
			
 
				-                            # ops+=f"ET q 1 0 0 1 0 {child.y0} cm [] 0 d 0 J 1 w 0 0 m {ltpage.width} 0 l S Q BT "
			
 
				-                            # ops+=f"ET q 1 0 0 1 0 {child.y0-child.size*3} cm [] 0 d 0 J 1 w 0 0 m {ltpage.width} 0 l S Q BT "
			
 
				-                            while ptr+1<len(item):
			
 
				-                                child_=item[ptr+1]
			
 
				-                                if isinstance(child_, LTChar): # 公式字符
			
 
				-                                    # print(child_.y0,child.y0-child.size*3,child_.y1,child.y0)
			
 
				-                                    if child_.y0>child.y0-child.size*3 and child_.y1<child.y0:
			
 
				-                                        vstk.append(child_)
			
 
				-                                    else:
			
 
				-                                        break
			
 
				-                                elif isinstance(child_, LTLine): # 公式线条
			
 
				-                                    vlstk.append(child_)
			
 
				-                                ptr+=1
			
 
				+                        # if re.match(r'.*\+(CMEX.*)',child.fontname) and child.cid in [40]: # 大括号
			
 
				+                        #     # ops+=f"ET q 1 0 0 1 0 {child.y0} cm [] 0 d 0 J 1 w 0 0 m {ltpage.width} 0 l S Q BT "
			
 
				+                        #     # ops+=f"ET q 1 0 0 1 0 {child.y0-child.size*3} cm [] 0 d 0 J 1 w 0 0 m {ltpage.width} 0 l S Q BT "
			
 
				+                        #     while ptr+1<len(item):
			
 
				+                        #         child_=item[ptr+1]
			
 
				+                        #         if isinstance(child_, LTChar): # 公式字符
			
 
				+                        #             # print(child_.y0,child.y0-child.size*3,child_.y1,child.y0)
			
 
				+                        #             if child_.y0>child.y0-child.size*3 and child_.y1<child.y0:
			
 
				+                        #                 vstk.append(child_)
			
 
				+                        #             else:
			
 
				+                        #                 break
			
 
				+                        #         elif isinstance(child_, LTLine): # 公式线条
			
 
				+                        #             vlstk.append(child_)
			
 
				+                        #         ptr+=1
			
 
				                     xt=child
			
 
				                     xt_ind=ind_v
			
 
				                     # 更新左右边界
			
@@ -493,8 +481,6 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                 vlen.append(l)
			
 
				             log.debug('\n==========[SSTACK]==========\n')
			
 
				             hash_key=cache.deterministic_hash("PDFMathTranslate")
			
 
				-            # if cache.is_cached(hash_key):
			
 
				-            #     print('Cache is found')
			
 
				             cache.create_cache(hash_key)
			
 
				             @retry
			
 
				             def worker(s): # 多线程翻译
			
@@ -512,9 +498,7 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                 except BaseException as e:
			
 
				                     log.exception(e,exc_info=False)
			
 
				                     raise e
			
 
				-            # tqdm with concurrent.futures.ThreadPoolExecutor()
			
 
				             with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread) as executor:
			
 
				-                # news = list(tqdm.auto.tqdm(executor.map(worker, sstk), total=len(sstk), position=1))
			
 
				                 news = list(executor.map(worker, sstk))
			
 
				             def raw_string(fcur,cstk): # 编码字符串
			
 
				                 if isinstance(self.fontmap[fcur],PDFCIDFont):
			
@@ -525,10 +509,8 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                 tx=x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6];cstk='';fcur=fcur_=None
			
 
				                 log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
			
 
				                 while True:
			
 
				-                    # print(new,ptr)
			
 
				                     if ptr==len(new): # 到达段落结尾
			
 
				                         if cstk:
			
 
				-                            # print(cstk,tx,x,rt,y)
			
 
				                             ops+=f'/{fcur} {size} Tf 1 0 0 1 {tx} {y} Tm [<{raw_string(fcur,cstk)}>] TJ '
			
 
				                         break
			
 
				                     vy_regex=re.match(r'\$\s*v([\d\s]*)\$',new[ptr:]) # 匹配 $vn$ 公式标记
			
@@ -541,7 +523,6 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                             continue # 翻译器可能会自动补个越界的公式标记
			
 
				                     else: # 加载文字
			
 
				                         ch=new[ptr]
			
 
				-                        # cid=self.china.decode(ch.encode())
			
 
				                         if font.char_width(ord(ch)):
			
 
				                             fcur_=font.fontid
			
 
				                         else:
			
@@ -553,7 +534,6 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                         ptr+=1
			
 
				                     if fcur_!=fcur or vy_regex or x+adv>rt: # 输出文字缓冲区：1.字体更新 2.插入公式 3.到达右边界
			
 
				                         if cstk:
			
 
				-                            # print(cstk,tx,x,rt,y)
			
 
				                             ops+=f'/{fcur} {size} Tf 1 0 0 1 {tx} {y} Tm [<{raw_string(fcur,cstk)}>] TJ '
			
 
				                             cstk=''
			
 
				                     if lb and x+adv>rt: # 到达右边界且原文段落存在换行
			
@@ -575,10 +555,9 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                         for vch in var[vid]: # 排版公式字符
			
 
				                             vc=chr(vch.cid)
			
 
				                             ops+=f"/{vch.font.fontid} {vch.size} Tf 1 0 0 1 {x+vch.x0-var[vid][0].x0} {fix+y+vch.y0-var[vid][0].y0} Tm [<{raw_string(vch.font.fontid,vc)}>] TJ "
			
 
				-                            # print(vc,vch,vch.x0,vch.x1,vch.y0,vch.y1)
			
 
				                         for l in varl[vid]: # 排版公式线条
			
 
				-                            ops+=f"ET q 1 0 0 1 {l.pts[0][0]+x-var[vid][0].x0} {l.pts[0][1]+fix+y-var[vid][0].y0} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT "
			
 
				-                            pass
			
 
				+                            if l.linewidth<5: # hack
			
 
				+                                ops+=f"ET q 1 0 0 1 {l.pts[0][0]+x-var[vid][0].x0} {l.pts[0][1]+fix+y-var[vid][0].y0} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT "
			
 
				                     else: # 插入文字缓冲区
			
 
				                         if not cstk:
			
 
				                             tx=x
			
@@ -595,11 +574,7 @@ class TextConverter(PDFConverter[AnyIO]):
 
				                     ops+=f"ET q 1 0 0 1 {l.pts[0][0]} {l.pts[0][1]} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT "
			
 
				             ops=f'BT {ops}ET '
			
 
				             return ops
			
 
				-
			
 
				-        # if self.showpageno:
			
 
				-        #     self.write_text("Page %s\n" % ltpage.pageid)
			
 
				         ops=render(ltpage)
			
 
				-        # self.write_text("\f")
			
 
				         return ops
			
 
				 
			
 
				     # Some dummy functions to save memory/CPU when all that is wanted
			
--- a/pdf2zh/pdf2zh.py
+++ b/pdf2zh/pdf2zh.py
@@ -61,12 +61,6 @@ def extract_text(
 
				             if outfile.endswith(override):
			
 
				                 output_type = alttype
			
 
				 
			
 
				-    # if outfile == "-":
			
 
				-    #     outfp: AnyIO = sys.stdout
			
 
				-    #     if sys.stdout.encoding is not None:
			
 
				-    #         codec = "utf-8"
			
 
				-    # else:
			
 
				-    #     outfp = open(outfile, "wb")
			
 
				     outfp: AnyIO = sys.stdout
			
 
				     pth = os.path.join(tempfile.gettempdir(), 'mfd-tf_efficientdet_d0.pth.tar')
			
 
				     if not os.path.exists(pth):
			
@@ -84,9 +78,7 @@ def extract_text(
 
				             page.insert_font('china-ss')
			
 
				             page.insert_font('helv')
			
 
				         doc_en.save('output-en.pdf')
			
 
				-        # doc_en.close()
			
 
				 
			
 
				-        # for fname in files:
			
 
				         with open('output-en.pdf', "rb") as fp:
			
 
				             pdf2zh.high_level.extract_text_to_fp(fp, **locals())
			
 
				 
			
@@ -116,7 +108,6 @@ def create_parser() -> argparse.ArgumentParser:
 
				         nargs="+",
			
 
				         help="One or more paths to PDF files.",
			
 
				     )
			
 
				-
			
 
				     parser.add_argument(
			
 
				         "--version",
			
 
				         "-v",
			
@@ -130,14 +121,6 @@ def create_parser() -> argparse.ArgumentParser:
 
				         action="store_true",
			
 
				         help="Use debug logging level.",
			
 
				     )
			
 
				-    # parser.add_argument(
			
 
				-    #     "--disable-caching",
			
 
				-    #     "-C",
			
 
				-    #     default=False,
			
 
				-    #     action="store_true",
			
 
				-    #     help="If caching or resources, such as fonts, should be disabled.",
			
 
				-    # )
			
 
				-
			
 
				     parse_params = parser.add_argument_group(
			
 
				         "Parser",
			
 
				         description="Used during PDF parsing",
			
@@ -148,21 +131,6 @@ def create_parser() -> argparse.ArgumentParser:
 
				         type=str,
			
 
				         help="The list of page numbers to parse.",
			
 
				     )
			
 
				-    # parse_params.add_argument(
			
 
				-    #     "--pagenos",
			
 
				-    #     "-p",
			
 
				-    #     type=str,
			
 
				-    #     help="A comma-separated list of page numbers to parse. "
			
 
				-    #     "Included for legacy applications, use --page-numbers "
			
 
				-    #     "for more idiomatic argument entry.",
			
 
				-    # )
			
 
				-    # parse_params.add_argument(
			
 
				-    #     "--maxpages",
			
 
				-    #     "-m",
			
 
				-    #     type=int,
			
 
				-    #     default=0,
			
 
				-    #     help="The maximum number of pages to parse.",
			
 
				-    # )
			
 
				     parse_params.add_argument(
			
 
				         "--password",
			
 
				         "-P",
			
@@ -191,153 +159,6 @@ def create_parser() -> argparse.ArgumentParser:
 
				         default=4,
			
 
				         help="The number of threads to execute translation.",
			
 
				     )
			
 
				-    # parse_params.add_argument(
			
 
				-    #     "--rotation",
			
 
				-    #     "-R",
			
 
				-    #     default=0,
			
 
				-    #     type=int,
			
 
				-    #     help="The number of degrees to rotate the PDF "
			
 
				-    #     "before other types of processing.",
			
 
				-    # )
			
 
				-
			
 
				-    # la_params = LAParams()  # will be used for defaults
			
 
				-    # la_param_group = parser.add_argument_group(
			
 
				-    #     "Layout analysis",
			
 
				-    #     description="Used during layout analysis.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--no-laparams",
			
 
				-    #     "-n",
			
 
				-    #     default=False,
			
 
				-    #     action="store_true",
			
 
				-    #     help="If layout analysis parameters should be ignored.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--detect-vertical",
			
 
				-    #     "-V",
			
 
				-    #     default=la_params.detect_vertical,
			
 
				-    #     action="store_true",
			
 
				-    #     help="If vertical text should be considered during layout analysis",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--line-overlap",
			
 
				-    #     type=float,
			
 
				-    #     default=la_params.line_overlap,
			
 
				-    #     help="If two characters have more overlap than this they "
			
 
				-    #     "are considered to be on the same line. The overlap is specified "
			
 
				-    #     "relative to the minimum height of both characters.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--char-margin",
			
 
				-    #     "-M",
			
 
				-    #     type=float,
			
 
				-    #     default=la_params.char_margin,
			
 
				-    #     help="If two characters are closer together than this margin they "
			
 
				-    #     "are considered to be part of the same line. The margin is "
			
 
				-    #     "specified relative to the width of the character.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--word-margin",
			
 
				-    #     "-W",
			
 
				-    #     type=float,
			
 
				-    #     default=la_params.word_margin,
			
 
				-    #     help="If two characters on the same line are further apart than this "
			
 
				-    #     "margin then they are considered to be two separate words, and "
			
 
				-    #     "an intermediate space will be added for readability. The margin "
			
 
				-    #     "is specified relative to the width of the character.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--line-margin",
			
 
				-    #     "-L",
			
 
				-    #     type=float,
			
 
				-    #     default=la_params.line_margin,
			
 
				-    #     help="If two lines are close together they are considered to "
			
 
				-    #     "be part of the same paragraph. The margin is specified "
			
 
				-    #     "relative to the height of a line.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--boxes-flow",
			
 
				-    #     "-F",
			
 
				-    #     type=float_or_disabled,
			
 
				-    #     default=la_params.boxes_flow,
			
 
				-    #     help="Specifies how much a horizontal and vertical position of a "
			
 
				-    #     "text matters when determining the order of lines. The value "
			
 
				-    #     "should be within the range of -1.0 (only horizontal position "
			
 
				-    #     "matters) to +1.0 (only vertical position matters). You can also "
			
 
				-    #     "pass `disabled` to disable advanced layout analysis, and "
			
 
				-    #     "instead return text based on the position of the bottom left "
			
 
				-    #     "corner of the text box.",
			
 
				-    # )
			
 
				-    # la_param_group.add_argument(
			
 
				-    #     "--all-texts",
			
 
				-    #     "-A",
			
 
				-    #     default=la_params.all_texts,
			
 
				-    #     action="store_true",
			
 
				-    #     help="If layout analysis should be performed on text in figures.",
			
 
				-    # )
			
 
				-
			
 
				-    # output_params = parser.add_argument_group(
			
 
				-    #     "Output",
			
 
				-    #     description="Used during output generation.",
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--outfile",
			
 
				-    #     "-o",
			
 
				-    #     type=str,
			
 
				-    #     default="-",
			
 
				-    #     help="Path to file where output is written. "
			
 
				-    #     'Or "-" (default) to write to stdout.',
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--output_type",
			
 
				-    #     "-t",
			
 
				-    #     type=str,
			
 
				-    #     default="text",
			
 
				-    #     help="Type of output to generate {text,html,xml,tag}.",
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--codec",
			
 
				-    #     "-c",
			
 
				-    #     type=str,
			
 
				-    #     default="utf-8",
			
 
				-    #     help="Text encoding to use in output file.",
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--output-dir",
			
 
				-    #     "-O",
			
 
				-    #     default=None,
			
 
				-    #     help="The output directory to put extracted images in. If not given, "
			
 
				-    #     "images are not extracted.",
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--layoutmode",
			
 
				-    #     "-Y",
			
 
				-    #     default="normal",
			
 
				-    #     type=str,
			
 
				-    #     help="Type of layout to use when generating html "
			
 
				-    #     "{normal,exact,loose}. If normal,each line is"
			
 
				-    #     " positioned separately in the html. If exact"
			
 
				-    #     ", each character is positioned separately in"
			
 
				-    #     " the html. If loose, same result as normal "
			
 
				-    #     "but with an additional newline after each "
			
 
				-    #     "text line. Only used when output_type is html.",
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--scale",
			
 
				-    #     "-s",
			
 
				-    #     type=float,
			
 
				-    #     default=1.0,
			
 
				-    #     help="The amount of zoom to use when generating html file. "
			
 
				-    #     "Only used when output_type is html.",
			
 
				-    # )
			
 
				-    # output_params.add_argument(
			
 
				-    #     "--strip-control",
			
 
				-    #     "-S",
			
 
				-    #     default=False,
			
 
				-    #     action="store_true",
			
 
				-    #     help="Remove control statement from text. "
			
 
				-    #     "Only used when output_type is xml.",
			
 
				-    # )
			
 
				 
			
 
				     return parser
			
 
				 
			
@@ -345,20 +166,6 @@ def create_parser() -> argparse.ArgumentParser:
 
				 def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
			
 
				     parsed_args = create_parser().parse_args(args=args)
			
 
				 
			
 
				-    # Propagate parsed layout parameters to LAParams object
			
 
				-    # if parsed_args.no_laparams:
			
 
				-    #     parsed_args.laparams = None
			
 
				-    # else:
			
 
				-    #     parsed_args.laparams = LAParams(
			
 
				-    #         line_overlap=parsed_args.line_overlap,
			
 
				-    #         char_margin=parsed_args.char_margin,
			
 
				-    #         line_margin=parsed_args.line_margin,
			
 
				-    #         word_margin=parsed_args.word_margin,
			
 
				-    #         boxes_flow=parsed_args.boxes_flow,
			
 
				-    #         detect_vertical=parsed_args.detect_vertical,
			
 
				-    #         all_texts=parsed_args.all_texts,
			
 
				-    #     )
			
 
				-
			
 
				     if parsed_args.pages:
			
 
				         pages = []
			
 
				         for p in parsed_args.pages.split(","):
			
@@ -369,14 +176,6 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
 
				                 pages.append(int(p) - 1)
			
 
				         parsed_args.pages = pages
			
 
				 
			
 
				-    # if parsed_args.pagenos:
			
 
				-    #     parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
			
 
				-
			
 
				-    # if parsed_args.output_type == "text" and parsed_args.outfile != "-":
			
 
				-    #     for override, alttype in OUTPUT_TYPES:
			
 
				-    #         if parsed_args.outfile.endswith(override):
			
 
				-    #             parsed_args.output_type = alttype
			
 
				-
			
 
				     return parsed_args