import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, CrawlResult

# Project-local names used below (import paths assumed; adjust to this repo's layout):
# Crawler, ChromeOptions, save_to_file, ASIN_HTML_DIR, CFG, s3, logger


def download_img():
    from DrissionPage import SessionPage  # unused here; kept from the original

    c = Crawler(ChromeOptions())
    url = 'https://m.media-amazon.com/images/I/41hY78XIaiL._AC_US200_.jpg'
    save_path = r'G:\code\amazone\copywriting_production\output\page\temp'
    res = c.page.download(url, save_path)
    logger.info(f"{res}")


async def task():
    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
    c = Crawler(ChromeOptions())
    page = c.run_browser()
    logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
    tab = page.latest_tab
    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
    # tab.get(file_path)
    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))

    # Save the current tab and push it to S3 with custom metadata.
    data = tab.save()
    # logger.info(f"{type(data)} , {data[:50]}")
    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
                 Metadata={'mykey': 'myvalue', 'mykey2': 'myvalue2'})
    return  # everything below is debug scaffolding, deliberately left unreachable

    # Upload with attached metadata:
    # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
    with open(file_path, 'rb') as data:
        s3.upload_fileobj(
            data,
            "public",
            'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
            ExtraArgs={
                'Metadata': {'mykey': 'myvalue'},
                'ContentType': 'text/html'
            })
    # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
    # logger.info(f"{CFG.s3_secret_key}")
    # c.get_asin_and_save_page(
    #     asin[0],
    #     'JP',
    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
    #     overwrite=True
    # )
    # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
    # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
    # await c.run('https://fr.florame.com/en/essential-oils')
    return

    # Reuse the DrissionPage-managed Chromium from crawl4ai by pointing
    # BrowserConfig at the browser's CDP websocket endpoint.
    port = page.browser._chromium_options._address.split(':')[-1]
    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
    logger.info(f"{page.browser._driver._websocket_url}")
    item_id = 1
    # url = 'https://greg.app/acalypha-marissima-overview/'
    url = 'https://fr.florame.com/en/essential-oils'
    # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
    # url = 'https://baidu.com'
    browser_config = BrowserConfig(
        headless=False,
        # verbose=False,
        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        # debugging_port=int(port),
        use_managed_browser=True,
        cdp_url=page.browser._driver._websocket_url
        # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
    )
    # async with AsyncWebCrawler(config=browser_config) as crawler:
    #     crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    #     result = await crawler.arun(url=url, config=crawler_config)
    #     print(result.markdown)
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()
    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    result: CrawlResult = await crawler.arun(url=url, config=crawl_config)
    logger.info(f"{item_id} crawler.arun result: {result.model_dump_json(indent=2)}")
    print(result.markdown)
    input('press enter to continue')
    await crawler.close()
    # page.quit()


def main():
    asyncio.run(task())
    # test()


if __name__ == "__main__":
    main()
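
# save_to_file is a project-local helper whose implementation is not shown in
# this file. A minimal sketch under the assumption that s3:// URIs map directly
# to Bucket/Key and that extra keyword arguments (Metadata, ContentType, ...)
# pass straight through to boto3's put_object. The name save_to_file_sketch is
# illustrative, not the real helper.
def save_to_file_sketch(data, uri: str, **extra_args) -> None:
    """Write data to a local path, or to S3 when given an s3:// URI."""
    from pathlib import Path

    import boto3

    body = data.encode() if isinstance(data, str) else data
    if uri.startswith('s3://'):
        # 's3://public/amazone/.../B0CQ1SHD8V.html' -> Bucket='public', Key='amazone/...'
        bucket, _, key = uri[len('s3://'):].partition('/')
        boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=body, **extra_args)
    else:
        Path(uri).write_bytes(body)

# Usage mirroring the call in task():
# save_to_file_sketch(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
#                     Metadata={'mykey': 'myvalue', 'mykey2': 'myvalue2'})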