t_crawler.py

import asyncio

# crawl4ai provides the async crawler used below; the remaining names
# (Crawler, ChromeOptions, save_to_file, ASIN_HTML_DIR, s3, logger, CFG) are
# project-local helpers whose imports are not shown in this snippet.
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, CrawlResult


def download_img():
    from DrissionPage import SessionPage  # imported for experiments, currently unused
    c = Crawler(ChromeOptions())
    url = 'https://m.media-amazon.com/images/I/41hY78XIaiL._AC_US200_.jpg'
    save_path = r'G:\code\amazone\copywriting_production\output\page\temp'
    res = c.page.download(url, save_path)
    logger.info(f"{res}")


async def task():
    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
    c = Crawler(ChromeOptions())
    page = c.run_browser()
    logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
    tab = page.latest_tab
    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
    # tab.get(file_path)
    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))

    # Save the current tab (MHTML) and upload it to S3 via the project helper.
    data = tab.save()
    # logger.info(f"{type(data)} , {data[:50]}")
    save_to_file(
        data,
        's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
        Metadata={'mykey': 'myvalue', 'mykey2': 'myvalue2'},
    )
    return  # the blocks below are kept as scratch experiments and never run

    # Upload with source metadata attached:
    # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
    with open(file_path, 'rb') as data:
        s3.upload_fileobj(
            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
            ExtraArgs={
                'Metadata': {'mykey': 'myvalue'},
                'ContentType': 'text/html'
            })
    # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
    # logger.info(f"{CFG.s3_secret_key}")
    # c.get_asin_and_save_page(
    #     asin[0],
    #     'JP',
    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
    #     overwrite=True
    # )
    # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
    # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
    # await c.run('https://fr.florame.com/en/essential-oils')
    return  # second guard: the crawl4ai experiment below is also skipped

    # Attach crawl4ai to the browser DrissionPage already controls, via its CDP endpoint.
    port = page.browser._chromium_options._address.split(':')[-1]
    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
    logger.info(f"{page.browser._driver._websocket_url}")
    item_id = 1
    # url = 'https://greg.app/acalypha-marissima-overview/'
    url = 'https://fr.florame.com/en/essential-oils'
    # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
    # url = 'https://baidu.com'
    browser_config = BrowserConfig(
        headless=False,
        # verbose=False,
        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        # debugging_port=int(port),
        use_managed_browser=True,
        cdp_url=page.browser._driver._websocket_url,
        # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
    )
    # async with AsyncWebCrawler(config=browser_config) as crawler:
    #     crawler_config = CrawlerRunConfig(
    #         cache_mode=CacheMode.BYPASS
    #     )
    #     result = await crawler.arun(url=url, config=crawler_config)
    #     print(result.markdown)
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()
    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    result: CrawlResult = await crawler.arun(url=url, config=crawl_config)
    logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)}")
    print(result.markdown)
    input('press enter to continue')
    await crawler.close()
    # page.quit()


def main():
    asyncio.run(task())
    # test()


if __name__ == "__main__":
    main()
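
A minimal standalone sketch of the same pattern, stripped of the project-local helpers (Crawler, save_to_file, s3): attach crawl4ai to an already-running Chromium over its DevTools websocket and return the page as markdown. The function name crawl_markdown and the CDP_URL placeholder are illustrative additions, not part of the script above.

import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig

# Placeholder endpoint; in practice take "webSocketDebuggerUrl" from the running
# browser (e.g. GET http://127.0.0.1:<port>/json/version), which is what the
# script above reads through DrissionPage internals.
CDP_URL = 'ws://127.0.0.1:9222/devtools/browser/<browser-id>'


async def crawl_markdown(url: str) -> str:
    # use_managed_browser + cdp_url tells crawl4ai to reuse the existing browser
    # instead of launching its own instance.
    browser_config = BrowserConfig(
        headless=False,
        use_managed_browser=True,
        cdp_url=CDP_URL,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        result = await crawler.arun(url=url, config=run_config)
        return result.markdown

# Usage: print(asyncio.run(crawl_markdown('https://fr.florame.com/en/essential-oils')))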