|
|
@@ -103,26 +103,26 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
help="One or more paths to PDF files.",
|
|
|
)
|
|
|
|
|
|
- parser.add_argument(
|
|
|
- "--version",
|
|
|
- "-v",
|
|
|
- action="version",
|
|
|
- version=f"pdf2zh.six v{pdf2zh.__version__}",
|
|
|
- )
|
|
|
- parser.add_argument(
|
|
|
- "--debug",
|
|
|
- "-d",
|
|
|
- default=False,
|
|
|
- action="store_true",
|
|
|
- help="Use debug logging level.",
|
|
|
- )
|
|
|
- parser.add_argument(
|
|
|
- "--disable-caching",
|
|
|
- "-C",
|
|
|
- default=False,
|
|
|
- action="store_true",
|
|
|
- help="If caching or resources, such as fonts, should be disabled.",
|
|
|
- )
|
|
|
+ # parser.add_argument(
|
|
|
+ # "--version",
|
|
|
+ # "-v",
|
|
|
+ # action="version",
|
|
|
+ # version=f"pdf2zh.six v{pdf2zh.__version__}",
|
|
|
+ # )
|
|
|
+ # parser.add_argument(
|
|
|
+ # "--debug",
|
|
|
+ # "-d",
|
|
|
+ # default=False,
|
|
|
+ # action="store_true",
|
|
|
+ # help="Use debug logging level.",
|
|
|
+ # )
|
|
|
+ # parser.add_argument(
|
|
|
+ # "--disable-caching",
|
|
|
+ # "-C",
|
|
|
+ # default=False,
|
|
|
+ # action="store_true",
|
|
|
+ # help="If caching or resources, such as fonts, should be disabled.",
|
|
|
+ # )
|
|
|
|
|
|
parse_params = parser.add_argument_group(
|
|
|
"Parser",
|
|
|
@@ -135,21 +135,21 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
nargs="+",
|
|
|
help="A space-seperated list of page numbers to parse.",
|
|
|
)
|
|
|
- parse_params.add_argument(
|
|
|
- "--pagenos",
|
|
|
- "-p",
|
|
|
- type=str,
|
|
|
- help="A comma-separated list of page numbers to parse. "
|
|
|
- "Included for legacy applications, use --page-numbers "
|
|
|
- "for more idiomatic argument entry.",
|
|
|
- )
|
|
|
- parse_params.add_argument(
|
|
|
- "--maxpages",
|
|
|
- "-m",
|
|
|
- type=int,
|
|
|
- default=0,
|
|
|
- help="The maximum number of pages to parse.",
|
|
|
- )
|
|
|
+ # parse_params.add_argument(
|
|
|
+ # "--pagenos",
|
|
|
+ # "-p",
|
|
|
+ # type=str,
|
|
|
+ # help="A comma-separated list of page numbers to parse. "
|
|
|
+ # "Included for legacy applications, use --page-numbers "
|
|
|
+ # "for more idiomatic argument entry.",
|
|
|
+ # )
|
|
|
+ # parse_params.add_argument(
|
|
|
+ # "--maxpages",
|
|
|
+ # "-m",
|
|
|
+ # type=int,
|
|
|
+ # default=0,
|
|
|
+ # help="The maximum number of pages to parse.",
|
|
|
+ # )
|
|
|
parse_params.add_argument(
|
|
|
"--password",
|
|
|
"-P",
|
|
|
@@ -157,153 +157,153 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
default="",
|
|
|
help="The password to use for decrypting PDF file.",
|
|
|
)
|
|
|
- parse_params.add_argument(
|
|
|
- "--rotation",
|
|
|
- "-R",
|
|
|
- default=0,
|
|
|
- type=int,
|
|
|
- help="The number of degrees to rotate the PDF "
|
|
|
- "before other types of processing.",
|
|
|
- )
|
|
|
-
|
|
|
- la_params = LAParams() # will be used for defaults
|
|
|
- la_param_group = parser.add_argument_group(
|
|
|
- "Layout analysis",
|
|
|
- description="Used during layout analysis.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--no-laparams",
|
|
|
- "-n",
|
|
|
- default=False,
|
|
|
- action="store_true",
|
|
|
- help="If layout analysis parameters should be ignored.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--detect-vertical",
|
|
|
- "-V",
|
|
|
- default=la_params.detect_vertical,
|
|
|
- action="store_true",
|
|
|
- help="If vertical text should be considered during layout analysis",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--line-overlap",
|
|
|
- type=float,
|
|
|
- default=la_params.line_overlap,
|
|
|
- help="If two characters have more overlap than this they "
|
|
|
- "are considered to be on the same line. The overlap is specified "
|
|
|
- "relative to the minimum height of both characters.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--char-margin",
|
|
|
- "-M",
|
|
|
- type=float,
|
|
|
- default=la_params.char_margin,
|
|
|
- help="If two characters are closer together than this margin they "
|
|
|
- "are considered to be part of the same line. The margin is "
|
|
|
- "specified relative to the width of the character.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--word-margin",
|
|
|
- "-W",
|
|
|
- type=float,
|
|
|
- default=la_params.word_margin,
|
|
|
- help="If two characters on the same line are further apart than this "
|
|
|
- "margin then they are considered to be two separate words, and "
|
|
|
- "an intermediate space will be added for readability. The margin "
|
|
|
- "is specified relative to the width of the character.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--line-margin",
|
|
|
- "-L",
|
|
|
- type=float,
|
|
|
- default=la_params.line_margin,
|
|
|
- help="If two lines are close together they are considered to "
|
|
|
- "be part of the same paragraph. The margin is specified "
|
|
|
- "relative to the height of a line.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--boxes-flow",
|
|
|
- "-F",
|
|
|
- type=float_or_disabled,
|
|
|
- default=la_params.boxes_flow,
|
|
|
- help="Specifies how much a horizontal and vertical position of a "
|
|
|
- "text matters when determining the order of lines. The value "
|
|
|
- "should be within the range of -1.0 (only horizontal position "
|
|
|
- "matters) to +1.0 (only vertical position matters). You can also "
|
|
|
- "pass `disabled` to disable advanced layout analysis, and "
|
|
|
- "instead return text based on the position of the bottom left "
|
|
|
- "corner of the text box.",
|
|
|
- )
|
|
|
- la_param_group.add_argument(
|
|
|
- "--all-texts",
|
|
|
- "-A",
|
|
|
- default=la_params.all_texts,
|
|
|
- action="store_true",
|
|
|
- help="If layout analysis should be performed on text in figures.",
|
|
|
- )
|
|
|
-
|
|
|
- output_params = parser.add_argument_group(
|
|
|
- "Output",
|
|
|
- description="Used during output generation.",
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--outfile",
|
|
|
- "-o",
|
|
|
- type=str,
|
|
|
- default="-",
|
|
|
- help="Path to file where output is written. "
|
|
|
- 'Or "-" (default) to write to stdout.',
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--output_type",
|
|
|
- "-t",
|
|
|
- type=str,
|
|
|
- default="text",
|
|
|
- help="Type of output to generate {text,html,xml,tag}.",
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--codec",
|
|
|
- "-c",
|
|
|
- type=str,
|
|
|
- default="utf-8",
|
|
|
- help="Text encoding to use in output file.",
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--output-dir",
|
|
|
- "-O",
|
|
|
- default=None,
|
|
|
- help="The output directory to put extracted images in. If not given, "
|
|
|
- "images are not extracted.",
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--layoutmode",
|
|
|
- "-Y",
|
|
|
- default="normal",
|
|
|
- type=str,
|
|
|
- help="Type of layout to use when generating html "
|
|
|
- "{normal,exact,loose}. If normal,each line is"
|
|
|
- " positioned separately in the html. If exact"
|
|
|
- ", each character is positioned separately in"
|
|
|
- " the html. If loose, same result as normal "
|
|
|
- "but with an additional newline after each "
|
|
|
- "text line. Only used when output_type is html.",
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--scale",
|
|
|
- "-s",
|
|
|
- type=float,
|
|
|
- default=1.0,
|
|
|
- help="The amount of zoom to use when generating html file. "
|
|
|
- "Only used when output_type is html.",
|
|
|
- )
|
|
|
- output_params.add_argument(
|
|
|
- "--strip-control",
|
|
|
- "-S",
|
|
|
- default=False,
|
|
|
- action="store_true",
|
|
|
- help="Remove control statement from text. "
|
|
|
- "Only used when output_type is xml.",
|
|
|
- )
|
|
|
+ # parse_params.add_argument(
|
|
|
+ # "--rotation",
|
|
|
+ # "-R",
|
|
|
+ # default=0,
|
|
|
+ # type=int,
|
|
|
+ # help="The number of degrees to rotate the PDF "
|
|
|
+ # "before other types of processing.",
|
|
|
+ # )
|
|
|
+
|
|
|
+ # la_params = LAParams() # will be used for defaults
|
|
|
+ # la_param_group = parser.add_argument_group(
|
|
|
+ # "Layout analysis",
|
|
|
+ # description="Used during layout analysis.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--no-laparams",
|
|
|
+ # "-n",
|
|
|
+ # default=False,
|
|
|
+ # action="store_true",
|
|
|
+ # help="If layout analysis parameters should be ignored.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--detect-vertical",
|
|
|
+ # "-V",
|
|
|
+ # default=la_params.detect_vertical,
|
|
|
+ # action="store_true",
|
|
|
+ # help="If vertical text should be considered during layout analysis",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--line-overlap",
|
|
|
+ # type=float,
|
|
|
+ # default=la_params.line_overlap,
|
|
|
+ # help="If two characters have more overlap than this they "
|
|
|
+ # "are considered to be on the same line. The overlap is specified "
|
|
|
+ # "relative to the minimum height of both characters.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--char-margin",
|
|
|
+ # "-M",
|
|
|
+ # type=float,
|
|
|
+ # default=la_params.char_margin,
|
|
|
+ # help="If two characters are closer together than this margin they "
|
|
|
+ # "are considered to be part of the same line. The margin is "
|
|
|
+ # "specified relative to the width of the character.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--word-margin",
|
|
|
+ # "-W",
|
|
|
+ # type=float,
|
|
|
+ # default=la_params.word_margin,
|
|
|
+ # help="If two characters on the same line are further apart than this "
|
|
|
+ # "margin then they are considered to be two separate words, and "
|
|
|
+ # "an intermediate space will be added for readability. The margin "
|
|
|
+ # "is specified relative to the width of the character.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--line-margin",
|
|
|
+ # "-L",
|
|
|
+ # type=float,
|
|
|
+ # default=la_params.line_margin,
|
|
|
+ # help="If two lines are close together they are considered to "
|
|
|
+ # "be part of the same paragraph. The margin is specified "
|
|
|
+ # "relative to the height of a line.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--boxes-flow",
|
|
|
+ # "-F",
|
|
|
+ # type=float_or_disabled,
|
|
|
+ # default=la_params.boxes_flow,
|
|
|
+ # help="Specifies how much a horizontal and vertical position of a "
|
|
|
+ # "text matters when determining the order of lines. The value "
|
|
|
+ # "should be within the range of -1.0 (only horizontal position "
|
|
|
+ # "matters) to +1.0 (only vertical position matters). You can also "
|
|
|
+ # "pass `disabled` to disable advanced layout analysis, and "
|
|
|
+ # "instead return text based on the position of the bottom left "
|
|
|
+ # "corner of the text box.",
|
|
|
+ # )
|
|
|
+ # la_param_group.add_argument(
|
|
|
+ # "--all-texts",
|
|
|
+ # "-A",
|
|
|
+ # default=la_params.all_texts,
|
|
|
+ # action="store_true",
|
|
|
+ # help="If layout analysis should be performed on text in figures.",
|
|
|
+ # )
|
|
|
+
|
|
|
+ # output_params = parser.add_argument_group(
|
|
|
+ # "Output",
|
|
|
+ # description="Used during output generation.",
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--outfile",
|
|
|
+ # "-o",
|
|
|
+ # type=str,
|
|
|
+ # default="-",
|
|
|
+ # help="Path to file where output is written. "
|
|
|
+ # 'Or "-" (default) to write to stdout.',
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--output_type",
|
|
|
+ # "-t",
|
|
|
+ # type=str,
|
|
|
+ # default="text",
|
|
|
+ # help="Type of output to generate {text,html,xml,tag}.",
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--codec",
|
|
|
+ # "-c",
|
|
|
+ # type=str,
|
|
|
+ # default="utf-8",
|
|
|
+ # help="Text encoding to use in output file.",
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--output-dir",
|
|
|
+ # "-O",
|
|
|
+ # default=None,
|
|
|
+ # help="The output directory to put extracted images in. If not given, "
|
|
|
+ # "images are not extracted.",
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--layoutmode",
|
|
|
+ # "-Y",
|
|
|
+ # default="normal",
|
|
|
+ # type=str,
|
|
|
+ # help="Type of layout to use when generating html "
|
|
|
+ # "{normal,exact,loose}. If normal,each line is"
|
|
|
+ # " positioned separately in the html. If exact"
|
|
|
+ # ", each character is positioned separately in"
|
|
|
+ # " the html. If loose, same result as normal "
|
|
|
+ # "but with an additional newline after each "
|
|
|
+ # "text line. Only used when output_type is html.",
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--scale",
|
|
|
+ # "-s",
|
|
|
+ # type=float,
|
|
|
+ # default=1.0,
|
|
|
+ # help="The amount of zoom to use when generating html file. "
|
|
|
+ # "Only used when output_type is html.",
|
|
|
+ # )
|
|
|
+ # output_params.add_argument(
|
|
|
+ # "--strip-control",
|
|
|
+ # "-S",
|
|
|
+ # default=False,
|
|
|
+ # action="store_true",
|
|
|
+ # help="Remove control statement from text. "
|
|
|
+ # "Only used when output_type is xml.",
|
|
|
+ # )
|
|
|
|
|
|
return parser
|
|
|
|
|
|
@@ -312,29 +312,29 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
|
|
parsed_args = create_parser().parse_args(args=args)
|
|
|
|
|
|
# Propagate parsed layout parameters to LAParams object
|
|
|
- if parsed_args.no_laparams:
|
|
|
- parsed_args.laparams = None
|
|
|
- else:
|
|
|
- parsed_args.laparams = LAParams(
|
|
|
- line_overlap=parsed_args.line_overlap,
|
|
|
- char_margin=parsed_args.char_margin,
|
|
|
- line_margin=parsed_args.line_margin,
|
|
|
- word_margin=parsed_args.word_margin,
|
|
|
- boxes_flow=parsed_args.boxes_flow,
|
|
|
- detect_vertical=parsed_args.detect_vertical,
|
|
|
- all_texts=parsed_args.all_texts,
|
|
|
- )
|
|
|
+ # if parsed_args.no_laparams:
|
|
|
+ # parsed_args.laparams = None
|
|
|
+ # else:
|
|
|
+ # parsed_args.laparams = LAParams(
|
|
|
+ # line_overlap=parsed_args.line_overlap,
|
|
|
+ # char_margin=parsed_args.char_margin,
|
|
|
+ # line_margin=parsed_args.line_margin,
|
|
|
+ # word_margin=parsed_args.word_margin,
|
|
|
+ # boxes_flow=parsed_args.boxes_flow,
|
|
|
+ # detect_vertical=parsed_args.detect_vertical,
|
|
|
+ # all_texts=parsed_args.all_texts,
|
|
|
+ # )
|
|
|
|
|
|
if parsed_args.page_numbers:
|
|
|
parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers}
|
|
|
|
|
|
- if parsed_args.pagenos:
|
|
|
- parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
|
|
|
+ # if parsed_args.pagenos:
|
|
|
+ # parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
|
|
|
|
|
|
- if parsed_args.output_type == "text" and parsed_args.outfile != "-":
|
|
|
- for override, alttype in OUTPUT_TYPES:
|
|
|
- if parsed_args.outfile.endswith(override):
|
|
|
- parsed_args.output_type = alttype
|
|
|
+ # if parsed_args.output_type == "text" and parsed_args.outfile != "-":
|
|
|
+ # for override, alttype in OUTPUT_TYPES:
|
|
|
+ # if parsed_args.outfile.endswith(override):
|
|
|
+ # parsed_args.output_type = alttype
|
|
|
|
|
|
return parsed_args
|
|
|
|