| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218 |
import asyncio
import base64
import datetime
import json
import os
import pickle
import random
import signal
import ssl
import sys
import time
from pathlib import Path, PurePath, PurePosixPath
from typing import List

import httpx
from crawl4ai import (AsyncWebCrawler, BrowserConfig, CacheMode, CrawlResult,
                      CrawlerRunConfig)
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.extraction_strategy import (ExtractionStrategy,
                                          JsonCssExtractionStrategy,
                                          JsonXPathExtractionStrategy)
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from sqlmodel import Session, select
from upath import UPath

from config.settings import CFG, OUTPUT_DIR, TEMP_PAGE_DIR
from src.browser.crawl_base import CrawlerBase
from utils.drission_page import ChromeOptions, load_chrome_from_ini
from utils.file import check_exists, read_file, s3, save_to_file
from utils.logu import get_logger
# Module-level logger shared by the crawler below.
logger = get_logger('browser')
# Local cache directory for fetched ASIN pages (HTML/MHTML); created eagerly
# at import time so save paths are always valid.
ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
class Crawler(CrawlerBase):
    """Scrape keyword/traffic data for an ASIN from asinseed.com.

    Fetches pages through a Chrome session (driven via ``CrawlerBase`` /
    DrissionPage) and extracts structured data from the saved HTML/MHTML
    with crawl4ai JSON extraction strategies.
    """

    # S3 key prefix under which fetched pages and assets are uploaded.
    s3_prefix = f"{CFG.s3_prefix}/output/asinseed/"

    def __init__(self, chrome_options: ChromeOptions):
        super().__init__(chrome_options)

    def get_asin_url(self, asin: str, asin_area: str) -> str:
        """Build the asinseed query URL for *asin* in marketplace *asin_area*.

        Example: https://www.asinseed.com/en/JP?q=B0CQ1SHD8V
        """
        return f"https://www.asinseed.com/en/{asin_area}?q={asin}"

    def get_asin_page_data(self, asin: str, asin_area: str, mthml_type: bool = True):
        """Open the ASIN page in Chrome and return its content.

        Returns the MHTML snapshot (``page.save()``) when ``mthml_type`` is
        true, otherwise the raw HTML string.
        """
        page = load_chrome_from_ini(self.chrome_options)
        page.get(self.get_asin_url(asin, asin_area))
        return page.save() if mthml_type else page.html

    def get_asin_and_save_page(self, asin: str, asin_area: str = 'JP',
                               mthml_type: bool = True, save_path: str = None,
                               overwrite: bool = False):
        """Fetch the ASIN page and persist it; return the saved path.

        Fix: the default ``save_path`` is now resolved *before* the
        existence check. Previously ``check_exists(None)`` was evaluated
        when no explicit path was given, so the skip-if-exists shortcut
        never applied to the default location.
        """
        save_path = save_path or str(
            ASIN_HTML_DIR / f'{asin}{".mhtml" if mthml_type else ".html"}')
        if not overwrite and check_exists(save_path):
            logger.info(f"exists {save_path} ")
            return save_path
        data = self.get_asin_page_data(asin, asin_area, mthml_type)
        return save_to_file(data, save_path)

    async def extra_result_table(self, html: str, input_schema: dict = None) -> list:
        """Extract the keyword/traffic table rows from a result page.

        Returns a list of row dicts, e.g.
        ``[{"traffic_keyword": ..., "keyword_link": ...}, ...]``, or ``None``
        when extraction failed.

        ``input_schema`` overrides the built-in CSS schema when given
        (default is ``None`` — was a mutable ``{}`` default).
        """
        schema = input_schema or {
            "baseSelector": "table.table tbody tr",  # one <tr> per data row
            "fields": [
                {
                    "name": "traffic_keyword",
                    "selector": "td:first-child a",  # keyword text
                    "type": "text"
                },
                {
                    "name": "keyword_link",
                    "selector": "td:first-child a",  # keyword hyperlink
                    "type": "attribute",
                    "attribute": "href"
                },
                {
                    "name": "monthly_searches",
                    "selector": "td:nth-child(2) span",  # search-volume figure
                    "type": "text",
                    # strip thousands separators so the value parses as a number
                    "transform": lambda x: x.replace(",", "") if x else None
                },
                {
                    "name": "search_trend_link",
                    "selector": "td:nth-child(2) a",  # trend link (with chart)
                    "type": "attribute",
                    "attribute": "href"
                },
                {
                    "name": "weight",
                    "selector": "td:nth-child(3) i.leaf",  # leaf icons
                    "type": "count"  # weight = number of leaf icons
                },
                {
                    "name": "amazon_search_link",
                    "selector": "td:last-child a",  # Amazon search link
                    "type": "attribute",
                    "attribute": "href"
                }
            ]
        }

        result = await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
        if not result.success:
            logger.error(f"Crawl failed: {result.error_message}")
            return
        data = json.loads(result.extracted_content)
        logger.info(f"Extracted {len(data)} keyword rows")
        logger.debug(f"First item: {result.extracted_content}")
        return data

    async def excra_product_info(self, html: str, input_schema: dict = None,
                                 strategy: ExtractionStrategy = JsonCssExtractionStrategy) -> dict:
        """Extract the product-info block (image, Amazon link, title, unique words).

        Returns a dict like ``{"product_info": {...}, "unique_words": [...]}``,
        or ``None`` when extraction failed.

        Fixes: ``strategy`` is now actually passed through (it used to be
        ignored), and its default matches the CSS selectors in the built-in
        schema (was ``JsonXPathExtractionStrategy``). ``input_schema``
        default is ``None`` instead of a shared mutable ``{}``.
        """
        schema = input_schema or {
            "name": "Product Details",
            "baseSelector": "div.js-sticky-block",
            "fields": [
                {
                    "name": "product_info",
                    "selector": "#div-asin-product-infor",
                    "type": "nested",
                    "fields": [
                        {
                            "name": "image_url",
                            "selector": "div.avatar-self-pic img",
                            "type": "attribute",
                            "attribute": "src"
                        },
                        {
                            "name": "goto_amazon",
                            "selector": "a.btn-asinseed-link",
                            "type": "attribute",
                            "attribute": "href"
                        },
                        {
                            "name": "main_text",
                            "selector": "div.media-body h4",
                            "type": "text",
                            "transform": ["strip"]
                        }
                    ]
                },
                {
                    "name": "unique_words",
                    # pinpoint the target <article> (the one preceded by an <h3>)
                    "selector": "h3:has(+ article) + article span.badge-asinseed-keywords-weight",
                    "type": "list",
                    "fields": [
                        {"name": "word", "type": "text"}
                    ],
                    # flatten the object list into a plain text list
                    "transform": ["extract_list"]
                }
            ]
        }
        result: CrawlResult = await self.excra_strategy_raw_html(html, schema, strategy)
        if not result.success:
            logger.error(f"Crawl failed: {result.error_message}")
            return
        data = json.loads(result.extracted_content)
        logger.info(f"Extracted {len(data)} product blocks")
        logger.debug(f"result.extracted_content: {result.extracted_content}")
        data = data[0] if data else {}
        # Flatten [{"word": w}, ...] into [w, ...] for downstream consumers.
        excract_unique_words = data.get('unique_words', [])
        if excract_unique_words:
            data['unique_words'] = [item['word'] for item in excract_unique_words]
        return data

    def get_mpath_html_content(self, mhtml_path: str):
        """Copy an MHTML file into the temp page dir, open it in the browser,
        and return ``(temp_path, rendered_html)``.

        The copy step allows *mhtml_path* to live on remote storage (s3) —
        ``read_file``/``save_to_file`` handle the transfer.
        """
        mhtml_data = read_file(mhtml_path)
        mhtml_path_name = PurePath(mhtml_path).name
        temp_mhtml_path = save_to_file(mhtml_data, str(TEMP_PAGE_DIR / mhtml_path_name))
        self.get(temp_mhtml_path)
        html_content = self.page.html
        return temp_mhtml_path, html_content

    async def extract_product_and_save_resource(self, html_content: str,
                                                upload_s3_dir: str = None):
        """Extract product info and mirror its main image to *upload_s3_dir*.

        Returns the extracted dict (with ``product_info.img_path`` added when
        an image was downloaded), or ``None`` when extraction failed.

        Fix: guards against a failed extraction / missing ``product_info``
        key, which previously raised ``TypeError``/``KeyError``.
        """
        data = await self.excra_product_info(html_content)
        if not data:
            return data
        product_info = data.get('product_info') or {}
        image_url = product_info.get('image_url')
        if image_url:
            img_name = UPath(image_url).name
            img_path = str(UPath(upload_s3_dir) / img_name)
            logger.info(f"upload_s3_dir {upload_s3_dir}")
            # download_img is blocking; run it off the event loop.
            status, save_img_path = await asyncio.to_thread(
                self.download_img,
                image_url,
                as_img_base64=False,
                img_path=img_path)
            product_info['img_path'] = save_img_path
        logger.info(f"{json.dumps(data, indent=4,ensure_ascii=False)}")
        return data
async def task():
    """Ad-hoc debug driver: load a locally saved MHTML snapshot into Chrome.

    NOTE(review): this is developer scaffolding with hard-coded Windows
    paths. Everything after the bare ``return`` below is unreachable dead
    code kept from earlier experiments, and the commented-out calls are
    alternative debug paths.
    """
    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
    c = Crawler(ChromeOptions())
    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
    # tab.get(file_path)
    c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
    # res = c.download_img(
    #     'https://www.asinseed.com/assets/svg/flat-icons/notice.svg?v=20181122',
    #     upload_s3_dir='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V/')
    # logger.info(f"{res}")
    # logger.info(f"{res.extracted_content}")

    # res = await c.cralw4ai_run(file_path)
    # logger.info(f"{res.model_dump()}")
    # save_to_file(res.model_dump(), OUTPUT_DIR/'page\debug\B0CQ1SHD8V.json')
    return
    # --- unreachable below this point (early return above) ---
    page = c.run_browser()
    tab = page.latest_tab
    data = tab.save()
    logger.info(f"{type(data)} , {data[:50]}")
    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
-
def main():
    """Synchronous entry point: run the async debug task to completion."""
    asyncio.run(task())


if __name__ == "__main__":
    main()
|