import asyncio

# Explicit imports instead of `from crawl4ai import *`: only these three names
# are used in this file, and wildcard imports hide where names come from.
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlResult

from get_article_info import get_rearch_result_links
from mylib.base import OUTPUT_DIR, load_from_pickle, save_all_result, save_to_file
async def main():
    """Crawl the Wikipedia *Aciphylla* article and persist the crawl result.

    Uses the cache when a previous crawl of the same URL exists, then dumps
    the whole result under ``OUTPUT_DIR / "Aciphylla"``.
    """
    async with AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Aciphylla",
            cache_mode=CacheMode.ENABLED,
        )
        # Everything (pickle, markdown variants, ...) goes to output/Aciphylla/
        save_all_result(crawl_result, OUTPUT_DIR / "Aciphylla")
def result_dump(output_dir):
    """Re-export a pickled crawl result from *output_dir* as individual files.

    Loads ``output_dir / "result.pickle"`` (a ``CrawlResult`` saved by a
    previous crawl) and writes each markdown variant plus full JSON dumps
    next to it.

    Args:
        output_dir: Directory (``pathlib.Path``-like) holding ``result.pickle``;
            all exported files are written into it.
    """
    result: CrawlResult = load_from_pickle(output_dir / "result.pickle")

    # Hoist the repeated attribute chain; every variant below comes from it.
    md_v2 = result.markdown_v2
    save_to_file(md_v2.raw_markdown, output_dir / "markdown_v2_raw_markdown.md")
    save_to_file(md_v2.references_markdown, output_dir / "markdown_v2_references_markdown.md")
    save_to_file(md_v2.markdown_with_citations, output_dir / "markdown_v2_markdown_with_citations.md")
    save_to_file(md_v2.fit_markdown, output_dir / "markdown_v2_fit_markdown.md")
    # Fix: fit_html is HTML, not markdown — write it with an .html extension
    # (was "markdown_v2_fit_html.md").
    save_to_file(md_v2.fit_html, output_dir / "markdown_v2_fit_html.html")

    save_to_file(result.markdown, output_dir / "markdown.md")
    save_to_file(md_v2, output_dir / "markdown_v2.md")

    # Full structured dumps for programmatic consumption (pydantic model_dump_json).
    save_to_file(result.model_dump_json(), output_dir / "result.json")
    save_to_file(result.markdown.model_dump_json(), output_dir / "markdown.json")
-
- if __name__ == "__main__":
- # asyncio.run(main())
- result_dump(OUTPUT_DIR / "Aciphylla")