| 123456789101112131415 |
# Run trafilatura over one saved HTML page and write the extracted content
# once per supported output format.
from pathlib import Path

from trafilatura import fetch_url, extract

from mylib.base import save_to_file

file = Path(r'K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha manniana essential oil\10.html')

# The page is read from disk here; fetch_url(<url>) can supply the HTML of a
# live page instead.
downloaded = file.read_text(encoding='utf-8')

# Preview the XML extraction (with metadata) on stdout.
result = extract(downloaded, output_format="xml", with_metadata=True)
print(f"result: {result}")

# Output formats to produce, one file per entry.
output_format_list = ["csv", "html", "json", "markdown", "txt", "xml", "xmltei"]

# Write into an 'ext' directory next to the input. The previous expression
# `'ext' / file.with_suffix(...)` joined a string with an absolute path;
# pathlib discards every segment that precedes an absolute one, so 'ext' was
# dropped and the "html" output targeted the input file itself.
output_dir = file.parent / 'ext'
output_dir.mkdir(parents=True, exist_ok=True)

for output_format in output_format_list:
    result = extract(downloaded, output_format=output_format, with_metadata=True)
    if result is None:
        # NOTE(review): trafilatura documents None as the "nothing extracted"
        # result; skip rather than hand None to save_to_file.
        print(f"no content extracted for output_format: {output_format}")
        continue
    save_path = save_to_file(result, output_dir / f'{file.stem}.{output_format}')
    print(f"save_path: {save_path}")
|