|
|
@@ -15,10 +15,15 @@ import httpx
|
|
|
import ssl
|
|
|
from sqlmodel import select, Session
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
|
|
|
|
|
|
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
|
|
+from crawl4ai.content_filter_strategy import BM25ContentFilter
|
|
|
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
|
+
|
|
|
from utils.logu import get_logger
|
|
|
from config.settings import OUTPUT_DIR
|
|
|
from utils.drission_page import load_chrome_from_ini,ChromeOptions
|
|
|
-from utils.file import save_to_file,check_exists,s3
|
|
|
+from utils.file import save_to_file, check_exists, s3, read_file
|
|
|
from utils.config import CFG
|
|
|
|
|
|
logger = get_logger('browser')
|
|
|
@@ -28,19 +33,31 @@ ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
class Crawler():
|
|
|
def __init__(self, chrome_options:ChromeOptions, storage_config:dict=None):
|
|
|
self.chrome_options = chrome_options
|
|
|
+ self.page = None
|
|
|
+ self.browser_config = {
|
|
|
+ "headless": self.chrome_options.headless,
|
|
|
+ "use_managed_browser": True,
|
|
|
+ }
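+        # shared BrowserConfig kwargs: use_managed_browser plus a cdp_url (set at
+        # call time) attach crawl4ai to the Chrome that DrissionPage launched,
+        # instead of letting crawl4ai spawn its own browser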
|
|
|
+ # BrowserConfig(
|
|
|
+ # headless=chrome_options.headless,
|
|
|
+ # verbose=False,
|
|
|
+ # use_managed_browser=True,
|
|
|
+ # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
|
|
|
+ # )
|
|
|
+ def get(self, url:str):
|
|
|
+ if not self.page:
|
|
|
+ self.page = load_chrome_from_ini(
|
|
|
+ self.chrome_options
|
|
|
+ )
|
|
|
+ self.page.get(url)
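+    # usage sketch (the URL is illustrative):
+    #   c = Crawler(ChromeOptions())
+    #   c.get('https://example.com')                        # drive the managed Chrome via DrissionPage
+    #   rows = await c.crawl4ai_run('https://example.com')  # re-parse its live DOM with crawl4ai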
|
|
|
|
|
|
async def run(self, url:str):
|
|
|
page = load_chrome_from_ini(
|
|
|
self.chrome_options
|
|
|
)
|
|
|
craw_ai_browser_config = BrowserConfig(
|
|
|
- headless=self.chrome_options.headless,
|
|
|
- # verbose=False,
|
|
|
- # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
|
|
- # debugging_port=int(port),
|
|
|
- use_managed_browser=True,
|
|
|
+ **self.browser_config,
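+            # _websocket_url is DrissionPage's private CDP endpoint; being a
+            # private attribute, it may move between DrissionPage releases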
|
|
|
cdp_url=page.browser._driver._websocket_url
|
|
|
- # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
|
|
|
)
|
|
|
try:
|
|
|
async with AsyncWebCrawler(config=craw_ai_browser_config) as crawler:
|
|
|
@@ -83,75 +100,88 @@ class Crawler():
|
|
|
data = self.get_asin_page_data(asin, asin_area, mthml_type)
|
|
|
save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
|
|
|
return save_to_file(data, save_path)
|
|
|
+
|
|
|
+    async def crawl4ai_run(self, uri:str) -> list | None:
+        # make sure a managed page exists; load `uri` on first use
+        if not self.page:
+            self.get(uri)
+        browser_config = BrowserConfig(
+            **self.browser_config,
+            cdp_url=self.page.browser._driver._websocket_url
+        )
|
|
|
+ schema = {
|
|
|
+ "baseSelector": "table.table tbody tr", # 每行数据对应一个tr
|
|
|
+ "fields": [
|
|
|
+ {
|
|
|
+ "name": "traffic_keyword",
|
|
|
+ "selector": "td:first-child a", # 关键词文本
|
|
|
+ "type": "text"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "keyword_link",
|
|
|
+ "selector": "td:first-child a", # 关键词超链接
|
|
|
+ "type": "attribute",
|
|
|
+ "attribute": "href"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "monthly_searches",
|
|
|
+ "selector": "td:nth-child(2) span", # 搜索量数值
|
|
|
+ "type": "text",
|
|
|
+ "transform": lambda x: x.replace(",", "") if x else None # 移除逗号转数字
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "search_trend_link",
|
|
|
+ "selector": "td:nth-child(2) a", # 搜索量趋势链接(带图表)
|
|
|
+ "type": "attribute",
|
|
|
+ "attribute": "href"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "weight",
|
|
|
+ "selector": "td:nth-child(3) i.leaf", # 统计叶子图标数量
|
|
|
+ "type": "count" # 通过计数获取权重值
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "amazon_search_link",
|
|
|
+ "selector": "td:last-child a", # Amazon搜索链接
|
|
|
+ "type": "attribute",
|
|
|
+ "attribute": "href"
|
|
|
+ }
|
|
|
+ ]
|
|
|
+ }
|
|
|
|
|
|
+        # re-parse the HTML DrissionPage currently has loaded: crawl4ai's raw://
+        # scheme crawls an in-memory HTML string instead of fetching the URL
+        dummy_html = self.page.html
+        raw_url = f"raw://{dummy_html}"
|
|
|
+ async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
|
+ result:CrawlResult = await crawler.arun(
|
|
|
+ url=raw_url,
|
|
|
+ config=CrawlerRunConfig(
|
|
|
+ cache_mode=CacheMode.BYPASS,
|
|
|
+                    extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True)
|
|
|
+ )
|
|
|
+ )
|
|
|
+
|
|
|
+ if not result.success:
|
|
|
+ logger.error(f"Crawl failed: {result.error_message}")
|
|
|
+ return
|
|
|
+ data = json.loads(result.extracted_content)
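+        # post-processing sketch (field names as defined in `schema` above): the
+        # CSS extraction yields strings, so numeric cleanup happens here
+        for row in data:
+            searches = (row.get('monthly_searches') or '').replace(',', '').strip()
+            row['monthly_searches'] = int(searches) if searches.isdigit() else None
+            # rough leaf-icon count; assumes each icon carries the class "leaf"
+            row['weight'] = (row.get('weight') or '').count('leaf')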
|
|
|
+ logger.info(f"Extracted {len(data)} coin rows")
|
|
|
+ logger.debug(f"First item: {result.extracted_content}")
|
|
|
+ return data
|
|
|
async def task():
|
|
|
asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
|
|
|
c = Crawler(ChromeOptions())
|
|
|
- page = c.run_browser()
|
|
|
- logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
|
|
|
- tab = page.latest_tab
|
|
|
file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
|
|
|
# file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
|
|
|
# tab.get(file_path)
|
|
|
- # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
|
|
|
- # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
|
|
|
+ c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
|
|
|
+    res = await c.crawl4ai_run(file_path)
|
|
|
+ # logger.info(f"{res.model_dump()}")
|
|
|
+ # logger.info(f"{json.loads(res.extracted_content)}")
|
|
|
+ # save_to_file(res.model_dump(), OUTPUT_DIR/'page\debug\B0CQ1SHD8V.json')
|
|
|
+ return
|
|
|
+ page = c.run_browser()
|
|
|
+ tab = page.latest_tab
|
|
|
data = tab.save()
|
|
|
- # logger.info(f"{type(data)} , {data[:50]}")
|
|
|
+ logger.info(f"{type(data)} , {data[:50]}")
|
|
|
save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
|
|
|
- return
|
|
|
-    # upload with metadata attached: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
|
|
|
- with open(file_path, 'rb') as data:
|
|
|
- s3.upload_fileobj(
|
|
|
- data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
|
|
|
- ExtraArgs={
|
|
|
- 'Metadata': {'mykey': 'myvalue'},
|
|
|
- 'ContentType': 'text/html'
|
|
|
- })
|
|
|
- # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
|
- # logger.info(f"{CFG.s3_secret_key}")
|
|
|
- # c.get_asin_and_save_page(
|
|
|
- # asin[0],
|
|
|
- # 'JP',
|
|
|
- # save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
|
|
|
- # overwrite=True
|
|
|
- # )
|
|
|
- # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
|
|
|
- # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
|
- # await c.run('https://fr.florame.com/en/essential-oils')
|
|
|
- return
|
|
|
- port = page.browser._chromium_options._address.split(':')[-1]
|
|
|
- logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
|
|
|
- logger.info(f"{page.browser._driver._websocket_url}")
|
|
|
- item_id = 1
|
|
|
- # url = 'https://greg.app/acalypha-marissima-overview/'
|
|
|
- url = 'https://fr.florame.com/en/essential-oils'
|
|
|
- # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
|
|
|
- # url = 'https://baidu.com'
|
|
|
- browser_config = BrowserConfig(
|
|
|
- headless=False,
|
|
|
- # verbose=False,
|
|
|
- # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
|
|
- # debugging_port=int(port),
|
|
|
- use_managed_browser=True,
|
|
|
- cdp_url=page.browser._driver._websocket_url
|
|
|
- # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
|
|
|
- )
|
|
|
- # async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
|
- # crawler_config = CrawlerRunConfig(
|
|
|
- # cache_mode=CacheMode.BYPASS
|
|
|
- # )
|
|
|
- # result = await crawler.arun(url=url, config=crawler_config)
|
|
|
- # print(result.markdown)
|
|
|
-
|
|
|
- crawler = AsyncWebCrawler(config=browser_config)
|
|
|
- await crawler.start()
|
|
|
- crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
|
- result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
|
|
|
- logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)} ")
|
|
|
- print(result.markdown)
|
|
|
- input('press enter to continue')
|
|
|
- await crawler.close()
|
|
|
- # page.quit()
|
|
|
+
|
|
|
|
|
|
def main():
|
|
|
asyncio.run(task())
|