# trafilatura_html.py (842 B)

  1. from trafilatura import fetch_url, extract
  2. from pathlib import Path
  3. from mylib.base import save_to_file
  4. file = Path(r'K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha manniana essential oil\10.html')
  5. # downloaded = fetch_url()
  6. downloaded = file.read_text(encoding='utf-8')
  7. result = extract(downloaded, output_format="xml", with_metadata=True)
  8. print(f"result: {result}")
  9. # define output format :"csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
  10. output_format_list = ["csv", "html", "json", "markdown", "txt", "xml", "xmltei"]
  11. for output_format in output_format_list:
  12. result = extract(downloaded, output_format=output_format, with_metadata=True)
  13. save_path = save_to_file(result, 'ext' / file.with_suffix(f'.{output_format}'))
  14. print(f"save_path: {save_path}")
  15. # save_to_file(result, file.with_suffix('.xml'))