@@ -0,0 +1,133 @@
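+"""Crawl helpers: a DrissionPage-managed Chrome instance driven by crawl4ai over CDP, plus asinseed.com page fetching/saving for ASINs."""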
+import asyncio
+import datetime
+import json
+import os
+import sys
+import time
+import signal
+import pickle
+from pathlib import Path
+import random
+from typing import List
+import httpx
+import ssl
+from sqlmodel import select, Session
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
+from utils.logu import get_logger
+from config.settings import OUTPUT_DIR
+from utils.drission_page import load_chrome_from_ini, ChromeOptions
+from utils.file import save_to_file
+
+logger = get_logger('browser')
+
+ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
+ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
+
+class Crawler:
+    """Drive a DrissionPage-managed Chrome instance: crawl pages with
+    crawl4ai attached to the same browser over CDP, and fetch/save
+    asinseed.com pages for a given ASIN."""
+
+    def __init__(self, chrome_options: ChromeOptions):
+        self.chrome_options = chrome_options
+
+    async def run(self, url: str):
+        # Launch (or attach to) Chrome using the DrissionPage ini options.
+        page = load_chrome_from_ini(
+            self.chrome_options
+        )
+        # Point crawl4ai at the same browser via its CDP websocket.
+        craw_ai_browser_config = BrowserConfig(
+            headless=self.chrome_options.headless,
+            # verbose=False,
+            # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+            # debugging_port=int(port),
+            use_managed_browser=True,
+            cdp_url=page.browser._driver._websocket_url
+            # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
+        )
+        result = None  # keep a value to return even if the crawl fails
+        try:
+            async with AsyncWebCrawler(config=craw_ai_browser_config) as crawler:
+                crawler_config = CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS
+                )
+                result: CrawlResult = await crawler.arun(url=url, config=crawler_config)
+                logger.info(f"{result.markdown}")
+                logger.info(f"{result.model_dump_json()}")
+        except Exception as e:
+            logger.exception(f"{e}")
+        finally:
+            page.quit()
+        return result
+
+    def run_browser(self):
+        page = load_chrome_from_ini(
+            self.chrome_options
+        )
+        return page
+
+    def get_asin_url(self, asin: str, asin_area: str):
+        # https://www.asinseed.com/en/JP?q=B0CQ1SHD8V
+        return f"https://www.asinseed.com/en/{asin_area}?q={asin}"
+
+    def get_asin_page_data(self, asin: str, asin_area: str, mhtml_type: bool = True):
+        page = load_chrome_from_ini(
+            self.chrome_options
+        )
+        url = self.get_asin_url(asin, asin_area)
+        page.get(url)
+        if mhtml_type:
+            # Save the rendered page under ASIN_HTML_DIR as an MHTML snapshot.
+            return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
+        else:
+            return page.html
+
+    def get_asin_and_save_page(self, asin: str, asin_area: str = 'JP', mhtml_type: bool = True, save_path: str = None):
+        data = self.get_asin_page_data(asin, asin_area, mhtml_type)
+        save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
+        save_to_file(data, save_path)
+        return save_path
+
+async def task():
+    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
+    c = Crawler(ChromeOptions())
+    page = c.run_browser()
+    logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
+    # Debug: re-open a previously saved MHTML snapshot and dump its HTML.
+    tab = page.latest_tab
+    tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    save_to_file(page.html, str(OUTPUT_DIR / 'page/debug' / f'{asin[0]}-from-mhtml.html'))
+    # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
+    # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
+    # await c.run('https://fr.florame.com/en/essential-oils')
+    # NOTE: debug shortcut - the crawl4ai flow below is skipped while this early return is in place.
+    return
+    port = page.browser._chromium_options._address.split(':')[-1]
+    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
+    logger.info(f"{page.browser._driver._websocket_url}")
+    item_id = 1
+    # url = 'https://greg.app/acalypha-marissima-overview/'
+    url = 'https://fr.florame.com/en/essential-oils'
+    # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
+    # url = 'https://baidu.com'
+    browser_config = BrowserConfig(
+        headless=False,
+        # verbose=False,
+        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+        # debugging_port=int(port),
+        use_managed_browser=True,
+        cdp_url=page.browser._driver._websocket_url
+        # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
+    )
+    # async with AsyncWebCrawler(config=browser_config) as crawler:
+    #     crawler_config = CrawlerRunConfig(
+    #         cache_mode=CacheMode.BYPASS
+    #     )
+    #     result = await crawler.arun(url=url, config=crawler_config)
+    #     print(result.markdown)
+
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+    result: CrawlResult = await crawler.arun(url=url, config=crawl_config)
+    logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)}")
+    print(result.markdown)
+    input('press enter to continue')
+    await crawler.close()
+    # page.quit()
+
+def main():
+    asyncio.run(task())
+    # test()
+
+if __name__ == "__main__":
+    main()