# crawl_t.py
  1. import asyncio
  2. import datetime
  3. import json
  4. import aiofiles
  5. import os
  6. import sys
  7. import time
  8. from camoufox import Camoufox
  9. from camoufox.server import launch_server
  10. from camoufox.async_api import AsyncCamoufox
  11. import asyncio
  12. import signal
  13. from worker.search_engine.camoufox_broswer import BrowserConfig
  14. from worker.search_engine.google_search import GoogleSearchHandler
  15. from mylib.logu import get_logger
  16. import asyncio
  17. import pickle
  18. from pathlib import Path
  19. import random
  20. from typing import List
  21. import httpx
  22. import ssl
  23. from sqlmodel import select, Session
  24. from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
  25. from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
  26. from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
  27. from utils.proxy_pool import get_random_proxy
  28. from mylib.drission_page import load_random_ua_chrome, load_chrome_from_ini,test_normal_chrome
# Module-level logger obtained from the project's get_logger helper under the name 'test'.
logger = get_logger('test')
# Module-level placeholder. task() assigns its own local `page` (it has no
# `global page` statement), so this value stays None in the code shown here.
page = None
  31. async def task():
  32. page = load_chrome_from_ini(
  33. proxy='http://localhost:1881',
  34. browser_path=r'G:\code\upwork\zhang_crawl_bio\download\GoogleChromePortable\GoogleChromePortable.exe',
  35. auto_port=False,)
  36. logger.info(f"{page.browser._chromium_options._address}")
  37. port = page.browser._chromium_options._address.split(':')[-1]
  38. logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
  39. logger.info(f"{page.browser._driver._websocket_url}")
  40. item_id = 1
  41. # url = 'https://greg.app/acalypha-marissima-overview/'
  42. url = 'https://fr.florame.com/en/essential-oils'
  43. # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
  44. # url = 'https://baidu.com'
  45. browser_config = BrowserConfig(
  46. headless=False,
  47. # verbose=False,
  48. # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
  49. # debugging_port=int(port),
  50. use_managed_browser=True,
  51. cdp_url=page.browser._driver._websocket_url
  52. # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
  53. )
  54. # async with AsyncWebCrawler(config=browser_config) as crawler:
  55. # crawler_config = CrawlerRunConfig(
  56. # cache_mode=CacheMode.BYPASS
  57. # )
  58. # result = await crawler.arun(url=url, config=crawler_config)
  59. # print(result.markdown)
  60. crawler = AsyncWebCrawler(config=browser_config)
  61. await crawler.start()
  62. crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
  63. result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
  64. logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)} ")
  65. print(result.markdown)
  66. input('press enter to continue')
  67. await crawler.close()
  68. # page.quit()
  69. def test():
  70. data = [{'search_result_model': SearchResultItem(id=106, content='由 MK Oumarou 著作 · 2018 · 被引用 14 次 — of the essential oils from the leaves of Acalypha ornata and Acalypha ciliata in southwest Nigeria.', keyword_id=11, page_id=13, title='Toxic effect of Chenopodium ambrosoides, Hyptis', url='https://www.dipterajournal.com/pdf/2018/vol5issue1/PartA/5-1-7-427.pdf', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\106.pdf', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 485748)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=107, content='Toxicity and mosquito larvicidal activities of the from the leaves of and ciliata in southwest Nigeria. Journal of\xa0...', keyword_id=11, page_id=13, title='references - ACALYPHA | Taxonomic Information System', url='https://www.acalypha.es/secc/biblio.asp?todas=si', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\107.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 487275)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=108, content=' is a species in the botanical family Euphorbiaceae. In Africa it is widely used as a medicinal plant. 
The stems are used as fibres for weaving\xa0...', keyword_id=11, page_id=13, title='Acalypha ornata', url='https://en.wikipedia.org/wiki/Acalypha_ornata', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\108.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 487275)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=109, content='Chemical composition, cytotoxicity and antioxidant activity of essential oils of Acalypha hispida flowers ... (Hochst). PA Onocha\xa0...', keyword_id=11, page_id=13, title='Ganiyat Kehinde Oloyede', url='https://scholar.google.com.pk/citations?user=mLjCRAMAAAAJ&hl=en', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\109.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 488285)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=110, content='Toxicity and mosquito larvicidal activities of the from the leaves of and ciliata in Southwest Nigeria. Journal of\xa0...', keyword_id=11, page_id=13, title='Sherifat Aboaba (0000-0003-1877-4306)', url='https://orcid.org/0000-0003-1877-4306', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\110.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 489280)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=111, content='由 AK Asekunowo 著作 · 2021 · 被引用 2 次 — ABSTRACT. 
Background: Medicinal plants such as plant has been employed in traditional medicine for the treatment of fungal skin.', keyword_id=11, page_id=13, title='Evaluation of Phytochemical Constituents and Antifungal ...', url='https://jbms.unilag.ng/index.php/jbms/article/download/20/20', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\111.pdf', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 489280)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=112, content='Toxicity and mosquito larvicidal activities of the from the leaves of. and ciliata in southwest Nigeria. J Vector\xa0...', keyword_id=11, page_id=13, title='Review Article Acalypha ciliata Forssk (Euphorbiaceae), used ...', url='https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf', html_path=None, keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 490279)), 'crawl_result': CrawlResult(url='https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf', html='', success=False, cleaned_html=None, media={}, links={}, downloaded_files=None, screenshot=None, pdf=None, markdown=None, markdown_v2=None, fit_markdown=None, fit_html=None, extracted_content=None, metadata=None, error_message='Unexpected error in _crawl_web at line 1355 in _crawl_web (crawl_env\\Lib\\site-packages\\crawl4ai\\async_crawler_strategy.py):\nError: Failed on navigating ACS-GOTO:\nPage.goto: net::ERR_ABORTED at https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf\nCall log:\n - navigating to "https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf", waiting until "domcontentloaded"\n\n\nCode context:\n1350 response = await page.goto(\n1351 url, wait_until=config.wait_until, timeout=config.page_timeout\n1352 )\n1353 redirected_url = page.url\n1354 except Error as e:\n1355 → raise RuntimeError(f"Failed on navigating ACS-GOTO:\\n{str(e)}")\n1356 
\n1357 await self.execute_hook(\n1358 "after_goto", page, context=context, url=url, response=response, config=config\n1359 )\n1360 ', session_id=None, response_headers=None, status_code=None, ssl_certificate=None, dispatch_result=None, redirected_url=None)}, {'search_result_model': SearchResultItem(id=113, content='由 ME Maffei 著作 · 2020 · 被引用 39 次 — The were found to be extracted from 13 plant parts and samples originated from 56 countries worldwide. Statistical analyses included the\xa0...', keyword_id=11, page_id=13, title='Plant Natural Sources of the Endocannabinoid (E)', url='https://pmc.ncbi.nlm.nih.gov/articles/PMC7554841/', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\113.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 490279)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=114, content='Acalypha ornata Hochst. ex A.Rich. var. bracteosa Müll.Arg. that adds a splash of color to gardens and landscapes.', keyword_id=11, page_id=13, title='Acalypha Ornata Hochst. Ex A.rich. Var. Bracteosa Müll.arg.', url='https://www.botanikks.com/plants/acalypha-ornata-hochst-ex-arich-var-bracteosa-mllarg/636729/1', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\114.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 669064)), 'crawl_result': None, 'message': 'already has html_path'}, {'search_result_model': SearchResultItem(id=115, content='由 WM Din 著作 · 2013 · 被引用 15 次 — This is the first report showing that the wound healing property of wilkesiana var. macafeana hort. 
is mediated by a geraniin containing extract.', keyword_id=11, page_id=13, title='Antioxidant and Cytoprotective Effects of an Ethanol Extract ...', url='https://journals.sagepub.com/doi/abs/10.1177/1934578X1300800325', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\115.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 669705)), 'crawl_result': None, 'message': 'already has html_path'}]
  71. for item in data:
  72. item['search_result_model'].html_path = None
  73. jsonstr = json.dumps(data, indent=4)
  74. print(jsonstr)
  75. def main():
  76. asyncio.run(task())
  77. # test()
# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()