| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- import asyncio
- import datetime
- import json
- import aiofiles
- import os
- import sys
- import time
- from camoufox import Camoufox
- from camoufox.server import launch_server
- from camoufox.async_api import AsyncCamoufox
- import asyncio
- import signal
- from worker.search_engine.camoufox_broswer import BrowserConfig
- from worker.search_engine.google_search import GoogleSearchHandler
- from mylib.logu import get_logger
- import asyncio
- import pickle
- from pathlib import Path
- import random
- from typing import List
- import httpx
- import ssl
- from sqlmodel import select, Session
- from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
- from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
- from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
- from utils.proxy_pool import get_random_proxy
- from mylib.drission_page import load_random_ua_chrome, load_chrome_from_ini,test_normal_chrome
# Module-level logger obtained from the project's get_logger helper under the name 'test'.
logger = get_logger('test')
# Module-level placeholder. task() binds its own local ``page`` (no ``global``
# statement), so nothing visible in this file ever rebinds this name.
page = None
async def task():
    """Crawl one hard-coded URL with crawl4ai over a CDP-attached Chrome.

    Steps:
      1. Launch Chrome via ``load_chrome_from_ini`` (fixed proxy and
         portable-Chrome path, ``auto_port=False``).
      2. Log the browser's debugging address, the payload of its ``/json``
         endpoint, and the driver's websocket URL.
      3. Build a crawl4ai ``BrowserConfig`` pointing ``cdp_url`` at that
         websocket URL, start an ``AsyncWebCrawler``, crawl the URL, log and
         print the result, then wait for the operator to press enter.

    The crawler is always closed, even when the crawl or the logging raises.
    The Chrome instance itself is intentionally left running (the original
    kept ``page.quit()`` disabled).

    Returns:
        None. Output goes to the logger and stdout.
    """
    # Local handle only: without a ``global`` statement the module-level
    # ``page`` is not rebound.
    page = load_chrome_from_ini(
        proxy='http://localhost:1881',
        browser_path=r'G:\code\upwork\zhang_crawl_bio\download\GoogleChromePortable\GoogleChromePortable.exe',
        auto_port=False,
    )

    # NOTE(review): these are private attributes of the page/driver objects;
    # presumably ``_address`` is "host:port" of the DevTools endpoint - confirm.
    address = page.browser._chromium_options._address
    logger.info(f"{address}")
    logger.info(f"{page.browser._driver.get(f'http://{address}/json').json()}")
    websocket_url = page.browser._driver._websocket_url
    logger.info(f"{websocket_url}")

    item_id = 1
    # Other targets used while experimenting:
    #   https://greg.app/acalypha-marissima-overview/
    #   https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf
    #   https://baidu.com
    url = 'https://fr.florame.com/en/essential-oils'

    browser_config = BrowserConfig(
        headless=False,
        use_managed_browser=True,
        # Reuse the browser launched above rather than letting crawl4ai start its own.
        cdp_url=websocket_url,
    )
    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()
    try:
        result: CrawlResult = await crawler.arun(url=url, config=crawl_config)
        logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)} ")
        print(result.markdown)
        # Blocking on purpose: keeps the browser window open for manual
        # inspection. Nothing else is scheduled on this event loop.
        input('press enter to continue')
    finally:
        # Previously skipped whenever anything above raised, leaking the
        # crawler session.
        await crawler.close()
    # page.quit() stays disabled so the Chrome instance survives the run.
def _json_default(obj):
    """Fallback encoder for values ``json.dumps`` cannot serialize natively.

    Pydantic-style models (anything exposing ``model_dump``) are dumped in JSON
    mode; bare ``datetime``/``date`` values become ISO-8601 strings.

    Raises:
        TypeError: for any other unsupported type, matching the contract
            ``json.dumps`` expects from a ``default`` hook.
    """
    dump = getattr(obj, "model_dump", None)
    if callable(dump):
        return dump(mode="json")
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def test():
    """Print a captured batch of crawl results as indented JSON.

    ``data`` is a hard-coded sample: a list of dicts holding a
    ``SearchResultItem`` under ``'search_result_model'``, an optional
    ``CrawlResult`` under ``'crawl_result'`` and, for most entries, a
    ``'message'`` string. Every item's ``html_path`` is reset to ``None``
    before dumping.

    The original called ``json.dumps(data, indent=4)`` directly, which raises
    ``TypeError`` because model instances are not JSON serializable; the dump
    now goes through ``_json_default``.
    """
    # NOTE(review): in the pasted source a few of these string literals were
    # split across physical lines (after "medicinal plant. ", "ABSTRACT. ",
    # "1356 " and "macafeana hort. "). They are joined here with no inserted
    # character - confirm against the originally captured output.
    data = [
        {'search_result_model': SearchResultItem(id=106, content='由 MK Oumarou 著作 · 2018 · 被引用 14 次 — of the essential oils from the leaves of Acalypha ornata and Acalypha ciliata in southwest Nigeria.', keyword_id=11, page_id=13, title='Toxic effect of Chenopodium ambrosoides, Hyptis', url='https://www.dipterajournal.com/pdf/2018/vol5issue1/PartA/5-1-7-427.pdf', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\106.pdf', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 485748)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=107, content='Toxicity and mosquito larvicidal activities of the from the leaves of and ciliata in southwest Nigeria. Journal of\xa0...', keyword_id=11, page_id=13, title='references - ACALYPHA | Taxonomic Information System', url='https://www.acalypha.es/secc/biblio.asp?todas=si', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\107.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 487275)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=108, content=' is a species in the botanical family Euphorbiaceae. In Africa it is widely used as a medicinal plant. The stems are used as fibres for weaving\xa0...', keyword_id=11, page_id=13, title='Acalypha ornata', url='https://en.wikipedia.org/wiki/Acalypha_ornata', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\108.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 487275)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=109, content='Chemical composition, cytotoxicity and antioxidant activity of essential oils of Acalypha hispida flowers ... (Hochst). PA Onocha\xa0...', keyword_id=11, page_id=13, title='Ganiyat Kehinde Oloyede', url='https://scholar.google.com.pk/citations?user=mLjCRAMAAAAJ&hl=en', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\109.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 488285)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=110, content='Toxicity and mosquito larvicidal activities of the from the leaves of and ciliata in Southwest Nigeria. Journal of\xa0...', keyword_id=11, page_id=13, title='Sherifat Aboaba (0000-0003-1877-4306)', url='https://orcid.org/0000-0003-1877-4306', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\110.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 489280)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=111, content='由 AK Asekunowo 著作 · 2021 · 被引用 2 次 — ABSTRACT. Background: Medicinal plants such as plant has been employed in traditional medicine for the treatment of fungal skin.', keyword_id=11, page_id=13, title='Evaluation of Phytochemical Constituents and Antifungal ...', url='https://jbms.unilag.ng/index.php/jbms/article/download/20/20', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\111.pdf', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 489280)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=112, content='Toxicity and mosquito larvicidal activities of the from the leaves of. and ciliata in southwest Nigeria. J Vector\xa0...', keyword_id=11, page_id=13, title='Review Article Acalypha ciliata Forssk (Euphorbiaceae), used ...', url='https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf', html_path=None, keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 490279)), 'crawl_result': CrawlResult(url='https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf', html='', success=False, cleaned_html=None, media={}, links={}, downloaded_files=None, screenshot=None, pdf=None, markdown=None, markdown_v2=None, fit_markdown=None, fit_html=None, extracted_content=None, metadata=None, error_message='Unexpected error in _crawl_web at line 1355 in _crawl_web (crawl_env\\Lib\\site-packages\\crawl4ai\\async_crawler_strategy.py):\nError: Failed on navigating ACS-GOTO:\nPage.goto: net::ERR_ABORTED at https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf\nCall log:\n - navigating to "https://www.sdiarticle4.com/prh/doc/Ms_AJRAVS_67037.pdf", waiting until "domcontentloaded"\n\n\nCode context:\n1350 response = await page.goto(\n1351 url, wait_until=config.wait_until, timeout=config.page_timeout\n1352 )\n1353 redirected_url = page.url\n1354 except Error as e:\n1355 → raise RuntimeError(f"Failed on navigating ACS-GOTO:\\n{str(e)}")\n1356 \n1357 await self.execute_hook(\n1358 "after_goto", page, context=context, url=url, response=response, config=config\n1359 )\n1360 ', session_id=None, response_headers=None, status_code=None, ssl_certificate=None, dispatch_result=None, redirected_url=None)},
        {'search_result_model': SearchResultItem(id=113, content='由 ME Maffei 著作 · 2020 · 被引用 39 次 — The were found to be extracted from 13 plant parts and samples originated from 56 countries worldwide. Statistical analyses included the\xa0...', keyword_id=11, page_id=13, title='Plant Natural Sources of the Endocannabinoid (E)', url='https://pmc.ncbi.nlm.nih.gov/articles/PMC7554841/', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\113.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 490279)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=114, content='Acalypha ornata Hochst. ex A.Rich. var. bracteosa Müll.Arg. that adds a splash of color to gardens and landscapes.', keyword_id=11, page_id=13, title='Acalypha Ornata Hochst. Ex A.rich. Var. Bracteosa Müll.arg.', url='https://www.botanikks.com/plants/acalypha-ornata-hochst-ex-arich-var-bracteosa-mllarg/636729/1', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\114.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 669064)), 'crawl_result': None, 'message': 'already has html_path'},
        {'search_result_model': SearchResultItem(id=115, content='由 WM Din 著作 · 2013 · 被引用 15 次 — This is the first report showing that the wound healing property of wilkesiana var. macafeana hort. is mediated by a geraniin containing extract.', keyword_id=11, page_id=13, title='Antioxidant and Cytoprotective Effects of an Ethanol Extract ...', url='https://journals.sagepub.com/doi/abs/10.1177/1934578X1300800325', html_path='G:\\code\\upwork\\zhang_crawl_bio\\output\\results\\Acalypha ornata essential oil\\crawled_urls\\115.html', keyword='Acalypha ornata essential oil', created_at=datetime.datetime(2025, 3, 2, 1, 42, 57, 669705)), 'crawl_result': None, 'message': 'already has html_path'},
    ]

    # Clear the stored path on every item before dumping.
    for item in data:
        item['search_result_model'].html_path = None

    # ``default`` handles the model instances the stock encoder rejects.
    jsonstr = json.dumps(data, indent=4, default=_json_default)
    print(jsonstr)
def main():
    """Script entry point: run the async ``task`` coroutine to completion."""
    crawl_demo = task()
    asyncio.run(crawl_demo)
    # test()


# Only start the demo when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
|