# article.py — crawl a Wikipedia article with crawl4ai and dump the result artifacts.
import asyncio

from crawl4ai import *

from get_article_info import get_rearch_result_links
from mylib.base import OUTPUT_DIR, load_from_pickle, save_all_result, save_to_file
  5. async def main():
  6. async with AsyncWebCrawler() as crawler:
  7. result = await crawler.arun(
  8. url="https://en.wikipedia.org/wiki/Aciphylla",
  9. cache_mode=CacheMode.ENABLED,
  10. )
  11. # output/Aciphylla/
  12. save_all_result(result, OUTPUT_DIR / "Aciphylla")
  13. def result_dump(output_dir):
  14. resutl:CrawlResult = load_from_pickle(output_dir /"result.pickle")
  15. save_to_file(resutl.markdown_v2.raw_markdown, output_dir / "markdown_v2_raw_markdown.md")
  16. save_to_file(resutl.markdown_v2.references_markdown, output_dir / "markdown_v2_references_markdown.md")
  17. save_to_file(resutl.markdown_v2.markdown_with_citations, output_dir / "markdown_v2_markdown_with_citations.md")
  18. save_to_file(resutl.markdown_v2.fit_markdown, output_dir / "markdown_v2_fit_markdown.md")
  19. save_to_file(resutl.markdown_v2.fit_html, output_dir / "markdown_v2_fit_html.md")
  20. save_to_file(resutl.markdown, output_dir / "markdown.md")
  21. save_to_file(resutl.markdown_v2, output_dir / "markdown_v2.md")
  22. # model_dump_json
  23. save_to_file(resutl.model_dump_json(), output_dir / "result.json")
  24. save_to_file(resutl.markdown.model_dump_json(), output_dir / "markdown.json")
  25. if __name__ == "__main__":
  26. # asyncio.run(main())
  27. result_dump(OUTPUT_DIR / "Aciphylla")