| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218 |
import asyncio
import base64
import datetime
import json
import os
import pickle
import random
import signal
import ssl
import sys
import time
from pathlib import Path, PurePath, PurePosixPath
from typing import List

import httpx
from crawl4ai import (AsyncWebCrawler, BrowserConfig, CacheMode, CrawlResult,
                      CrawlerRunConfig)
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.extraction_strategy import (ExtractionStrategy,
                                          JsonCssExtractionStrategy,
                                          JsonXPathExtractionStrategy)
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from sqlmodel import Session, select
from upath import UPath

from config.settings import CFG, OUTPUT_DIR, TEMP_PAGE_DIR
from src.browser.crawl_base import CrawlerBase
from utils.drission_page import ChromeOptions, load_chrome_from_ini
from utils.file import check_exists, read_file, s3, save_to_file
from utils.logu import get_logger
# Module-level logger shared by the crawler below.
logger = get_logger('browser')
# Local cache directory for fetched ASIN pages (HTML/MHTML); created eagerly
# at import time so save paths are always valid.
ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
class Crawler(CrawlerBase):
    """Scrape keyword/traffic data for an ASIN from asinseed.com.

    Fetches pages through a Chrome session (driven via ``CrawlerBase`` /
    DrissionPage) and extracts structured data from the saved HTML/MHTML
    with crawl4ai JSON extraction strategies.
    """

    # S3 key prefix under which fetched pages and assets are uploaded.
    s3_prefix = f"{CFG.s3_prefix}/output/asinseed/"

    def __init__(self, chrome_options: ChromeOptions):
        super().__init__(chrome_options)

    def get_asin_url(self, asin: str, asin_area: str) -> str:
        """Build the asinseed query URL for *asin* in marketplace *asin_area*.

        Example: https://www.asinseed.com/en/JP?q=B0CQ1SHD8V
        """
        return f"https://www.asinseed.com/en/{asin_area}?q={asin}"

    def get_asin_page_data(self, asin: str, asin_area: str, mthml_type: bool = True):
        """Open the ASIN page in Chrome and return its content.

        Returns the MHTML snapshot (``page.save()``) when ``mthml_type`` is
        true, otherwise the raw HTML string.
        """
        page = load_chrome_from_ini(self.chrome_options)
        page.get(self.get_asin_url(asin, asin_area))
        return page.save() if mthml_type else page.html

    def get_asin_and_save_page(self, asin: str, asin_area: str = 'JP',
                               mthml_type: bool = True, save_path: str = None,
                               overwrite: bool = False):
        """Fetch the ASIN page and persist it; return the saved path.

        Fix: the default ``save_path`` is now resolved *before* the
        existence check. Previously ``check_exists(None)`` was evaluated
        when no explicit path was given, so the skip-if-exists shortcut
        never applied to the default location.
        """
        save_path = save_path or str(
            ASIN_HTML_DIR / f'{asin}{".mhtml" if mthml_type else ".html"}')
        if not overwrite and check_exists(save_path):
            logger.info(f"exists {save_path} ")
            return save_path
        data = self.get_asin_page_data(asin, asin_area, mthml_type)
        return save_to_file(data, save_path)

    async def extra_result_table(self, html: str, input_schema: dict = None) -> list:
        """Extract the keyword/traffic table rows from a result page.

        Returns a list of row dicts, e.g.
        ``[{"traffic_keyword": ..., "keyword_link": ...}, ...]``, or ``None``
        when extraction failed.

        ``input_schema`` overrides the built-in CSS schema when given
        (default is ``None`` — was a mutable ``{}`` default).
        """
        schema = input_schema or {
            "baseSelector": "table.table tbody tr",  # one <tr> per data row
            "fields": [
                {
                    "name": "traffic_keyword",
                    "selector": "td:first-child a",  # keyword text
                    "type": "text"
                },
                {
                    "name": "keyword_link",
                    "selector": "td:first-child a",  # keyword hyperlink
                    "type": "attribute",
                    "attribute": "href"
                },
                {
                    "name": "monthly_searches",
                    "selector": "td:nth-child(2) span",  # search-volume figure
                    "type": "text",
                    # strip thousands separators so the value parses as a number
                    "transform": lambda x: x.replace(",", "") if x else None
                },
                {
                    "name": "search_trend_link",
                    "selector": "td:nth-child(2) a",  # trend link (with chart)
                    "type": "attribute",
                    "attribute": "href"
                },
                {
                    "name": "weight",
                    "selector": "td:nth-child(3) i.leaf",  # leaf icons
                    "type": "count"  # weight = number of leaf icons
                },
                {
                    "name": "amazon_search_link",
                    "selector": "td:last-child a",  # Amazon search link
                    "type": "attribute",
                    "attribute": "href"
                }
            ]
        }

        result = await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
        if not result.success:
            logger.error(f"Crawl failed: {result.error_message}")
            return
        data = json.loads(result.extracted_content)
        logger.info(f"Extracted {len(data)} keyword rows")
        logger.debug(f"First item: {result.extracted_content}")
        return data

    async def excra_product_info(self, html: str, input_schema: dict = None,
                                 strategy: ExtractionStrategy = JsonCssExtractionStrategy) -> dict:
        """Extract the product-info block (image, Amazon link, title, unique words).

        Returns a dict like ``{"product_info": {...}, "unique_words": [...]}``,
        or ``None`` when extraction failed.

        Fixes: ``strategy`` is now actually passed through (it used to be
        ignored), and its default matches the CSS selectors in the built-in
        schema (was ``JsonXPathExtractionStrategy``). ``input_schema``
        default is ``None`` instead of a shared mutable ``{}``.
        """
        schema = input_schema or {
            "name": "Product Details",
            "baseSelector": "div.js-sticky-block",
            "fields": [
                {
                    "name": "product_info",
                    "selector": "#div-asin-product-infor",
                    "type": "nested",
                    "fields": [
                        {
                            "name": "image_url",
                            "selector": "div.avatar-self-pic img",
                            "type": "attribute",
                            "attribute": "src"
                        },
                        {
                            "name": "goto_amazon",
                            "selector": "a.btn-asinseed-link",
                            "type": "attribute",
                            "attribute": "href"
                        },
                        {
                            "name": "main_text",
                            "selector": "div.media-body h4",
                            "type": "text",
                            "transform": ["strip"]
                        }
                    ]
                },
                {
                    "name": "unique_words",
                    # pinpoint the target <article> (the one preceded by an <h3>)
                    "selector": "h3:has(+ article) + article span.badge-asinseed-keywords-weight",
                    "type": "list",
                    "fields": [
                        {"name": "word", "type": "text"}
                    ],
                    # flatten the object list into a plain text list
                    "transform": ["extract_list"]
                }
            ]
        }
        result: CrawlResult = await self.excra_strategy_raw_html(html, schema, strategy)
        if not result.success:
            logger.error(f"Crawl failed: {result.error_message}")
            return
        data = json.loads(result.extracted_content)
        logger.info(f"Extracted {len(data)} product blocks")
        logger.debug(f"result.extracted_content: {result.extracted_content}")
        data = data[0] if data else {}
        # Flatten [{"word": w}, ...] into [w, ...] for downstream consumers.
        excract_unique_words = data.get('unique_words', [])
        if excract_unique_words:
            data['unique_words'] = [item['word'] for item in excract_unique_words]
        return data

    def get_mpath_html_content(self, mhtml_path: str):
        """Copy an MHTML file into the temp page dir, open it in the browser,
        and return ``(temp_path, rendered_html)``.

        The copy step allows *mhtml_path* to live on remote storage (s3) —
        ``read_file``/``save_to_file`` handle the transfer.
        """
        mhtml_data = read_file(mhtml_path)
        mhtml_path_name = PurePath(mhtml_path).name
        temp_mhtml_path = save_to_file(mhtml_data, str(TEMP_PAGE_DIR / mhtml_path_name))
        self.get(temp_mhtml_path)
        html_content = self.page.html
        return temp_mhtml_path, html_content

    async def extract_product_and_save_resource(self, html_content: str,
                                                upload_s3_dir: str = None):
        """Extract product info and mirror its main image to *upload_s3_dir*.

        Returns the extracted dict (with ``product_info.img_path`` added when
        an image was downloaded), or ``None`` when extraction failed.

        Fix: guards against a failed extraction / missing ``product_info``
        key, which previously raised ``TypeError``/``KeyError``.
        """
        data = await self.excra_product_info(html_content)
        if not data:
            return data
        product_info = data.get('product_info') or {}
        image_url = product_info.get('image_url')
        if image_url:
            img_name = UPath(image_url).name
            img_path = str(UPath(upload_s3_dir) / img_name)
            logger.info(f"upload_s3_dir {upload_s3_dir}")
            # download_img is blocking; run it off the event loop.
            status, save_img_path = await asyncio.to_thread(
                self.download_img,
                image_url,
                as_img_base64=False,
                img_path=img_path)
            product_info['img_path'] = save_img_path
        logger.info(f"{json.dumps(data, indent=4,ensure_ascii=False)}")
        return data
async def task():
    """Ad-hoc debug driver: load a locally saved MHTML snapshot into Chrome.

    NOTE(review): this is developer scaffolding with hard-coded Windows
    paths. Everything after the bare ``return`` below is unreachable dead
    code kept from earlier experiments, and the commented-out calls are
    alternative debug paths.
    """
    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
    c = Crawler(ChromeOptions())
    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
    # tab.get(file_path)
    c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
    # res = c.download_img(
    #     'https://www.asinseed.com/assets/svg/flat-icons/notice.svg?v=20181122',
    #     upload_s3_dir='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V/')
    # logger.info(f"{res}")
    # logger.info(f"{res.extracted_content}")

    # res = await c.cralw4ai_run(file_path)
    # logger.info(f"{res.model_dump()}")
    # save_to_file(res.model_dump(), OUTPUT_DIR/'page\debug\B0CQ1SHD8V.json')
    return
    # --- unreachable below this point (early return above) ---
    page = c.run_browser()
    tab = page.latest_tab
    data = tab.save()
    logger.info(f"{type(data)} , {data[:50]}")
    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
-
def main():
    """Synchronous entry point: run the async debug task to completion."""
    asyncio.run(task())


if __name__ == "__main__":
    main()
|