
Basic ASIN fetching

mrh committed 9 months ago
commit 8e47047418

+ 1 - 0
.env

@@ -0,0 +1 @@
+DB_URL = "postgresql+psycopg2://user:password@sv-v2.lan:5435/copywriting_production"

+ 11 - 0
.gitignore

@@ -0,0 +1,11 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+output/

+ 0 - 0
README.md


+ 34 - 0
config/dp_conf/9321.ini

@@ -0,0 +1,34 @@
+[paths]
+download_path = .
+tmp_path = 
+
+[chromium_options]
+address = 127.0.0.1:9321
+browser_path = C:\Program Files\Google\Chrome\Application\chrome.exe
+arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--mute-audio', '--lang=en-US', '--proxy-server=localhost:8851', '--user-data-dir=G:\\code\\amazone\\copywriting_production\\output\\user_data_dir']
+extensions = []
+prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}}
+flags = {}
+load_mode = normal
+user = Default
+auto_port = False
+system_user_path = False
+existing_only = False
+new_env = False
+
+[session_options]
+headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'}
+
+[timeouts]
+base = 10
+page_load = 30
+script = 30
+
+[proxies]
+http = localhost:8851
+https = localhost:8851
+
+[others]
+retry_times = 3
+retry_interval = 2
+
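This ini profile is what DrissionPage reads back through its ini_path option (see the loader in utils/drission_page.py further down). A minimal sketch of that round trip, assuming the file is reachable at config/dp_conf/9321.ini from the working directory:

```python
# Sketch: re-open the browser profile described by 9321.ini with DrissionPage.
from DrissionPage import ChromiumOptions, ChromiumPage

opts = ChromiumOptions(ini_path="config/dp_conf/9321.ini")  # picks up browser_path, proxy, arguments, ...
opts.set_address("127.0.0.1:9321")                          # attach to (or launch) Chrome on the debug port
page = ChromiumPage(opts)
print(page.title)
page.quit()
```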

+ 10 - 0
config/settings.py

@@ -0,0 +1,10 @@
+import os
+from pathlib import Path
+WORK_DIR = Path(__file__).resolve().parent.parent
+OUTPUT_DIR = WORK_DIR / "output"
+DB_URL = os.environ.get('DB_URL') or "postgresql+psycopg2://user:password@sv-v2.lan:5435/copywriting_production"
+LOG_LEVEL = 'DEBUG'
+LOG_DIR = WORK_DIR / 'output/logs'
+
+CONFIG_DIR = WORK_DIR / 'config'
+BROWSER_CONFIG_DIR = CONFIG_DIR / 'dp_conf'

+ 28 - 0
docs/data_explain.md

@@ -0,0 +1,28 @@
+Competitor keyword search
+
+The link is https://www.asinseed.com/cn/
+
+
+
+"Top search terms" dashboard
+The "Top search terms" dashboard lets you view the most popular search terms (ranked by search frequency) for a given time period. For each search term it also shows the 3 most-clicked products, the 3 most-clicked categories, and the 3 most-clicked brands.
+General definitions
+Hide/show details for other metrics
+Search Frequency Rank
+The rank order by search frequency. Search terms with the same search frequency within the selected time period are shown with the same rank.
+The total number of times the search term was searched in the past 24 hours, regardless of clicks or conversions. Note: this metric was recently updated and is now more accurate.
+Top clicked products
+The top 3 products that generated the most results for this search term.
+Of all results returned for this search term in the given time period, these are the top 3 products users clicked (i.e. whose product detail pages were visited). The top 3 products are listed in order of click count.
+Top clicked categories
+The top 3 categories that generated the most results for this search term.
+Of all results returned for this search term in the given time period, these are the top 3 categories whose products received clicks (product detail page visits) from this search term. The top 3 categories are listed in order of click count. Note that the top 3 categories do not necessarily correspond to the top 3 products; e.g. the #1 category may not match the #1 product or the #1 brand.
+Top clicked brands
+The top 3 brands that generated the most results for this search term.
+Of all results returned for this search term in the given time period, these are the top 3 brands whose products received clicks (product detail page visits) from this search term. The top 3 brands are listed in order of click count. Note that the top 3 brands do not necessarily correspond to the top 3 products; e.g. the #1 brand may not match the #1 product or the #1 category. A brand name may be shown as "-" if it is not registered to a specific brand through Amazon Brand Registry.
+Click share
+The clicks received by this ASIN divided by the total clicks received by all ASINs in the search results during the selected time period.
+Clicks are the total number of ASIN clicks generated from the search results page within the given time period; a click only counts when the search results page leads to the ASIN's detail page. Note: this metric was recently updated and is now more accurate. Click share now considers all ASINs that received at least one click.
+Conversion share
+The conversions for this ASIN divided by the total conversions for all ASINs in the search results during the selected time period.
+A conversion means a buyer successfully going from clicking an ASIN to purchasing it. This metric covers the ratio of purchases to clicks generated from the search results page. Note: this metric was recently updated and is now more accurate. Conversion share now only counts ASINs purchased after a click attributed to the search term.

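To make the two share metrics concrete, here is a small illustrative calculation in Python (the click and purchase counts are hypothetical, not values from the dashboard):

```python
# Hypothetical per-ASIN clicks and post-click purchases for one search term.
clicks = {"B0CQ1SHD8V": 120, "B0B658JC22": 60, "B0DQ84H883": 20}
purchases = {"B0CQ1SHD8V": 12, "B0B658JC22": 3, "B0DQ84H883": 1}

total_clicks = sum(clicks.values())        # 200
total_purchases = sum(purchases.values())  # 16

click_share = {a: c / total_clicks for a, c in clicks.items()}             # ASIN clicks / all clicks
conversion_share = {a: p / total_purchases for a, p in purchases.items()}  # ASIN purchases / all purchases

print(click_share["B0CQ1SHD8V"])       # 0.6
print(conversion_share["B0CQ1SHD8V"])  # 0.75
```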
+ 6 - 0
docs/dev.md

@@ -0,0 +1,6 @@
+```shell
+conda create -n copywriting python=3.12 -y
+conda activate copywriting
+uv pip install sqlmodel loguru pandas drissionpage crawl4ai redis celery flower
+$env:PYTHONPATH = "$env:PYTHONPATH;$PWD"
+```

+ 15 - 0
pyproject.toml

@@ -0,0 +1,15 @@
+[project]
+name = "copywriting-production"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "celery>=5.4.0",
+    "drissionpage>=4.1.0.17",
+    "flower>=2.0.1",
+    "loguru>=0.7.3",
+    "pandas>=2.2.3",
+    "redis>=5.2.1",
+    "sqlmodel>=0.0.24",
+]

+ 133 - 0
src/browser/crawl_asin.py

@@ -0,0 +1,133 @@
+import asyncio
+import datetime
+import json
+import os
+import sys
+import time
+import signal
+import pickle
+from pathlib import Path
+import random
+from typing import List
+import httpx
+import ssl
+from sqlmodel import select, Session
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
+from utils.logu import get_logger
+from config.settings import OUTPUT_DIR
+from utils.drission_page import load_chrome_from_ini,ChromeOptions
+from utils.file import save_to_file
+logger = get_logger('browser')
+ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
+ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
+
+class Crawler():
+    def __init__(self, chrome_options:ChromeOptions):
+        self.chrome_options = chrome_options
+    
+    async def run(self, url:str):
+        page = load_chrome_from_ini(
+            self.chrome_options
+        )
+        craw_ai_browser_config = BrowserConfig(
+            headless=self.chrome_options.headless,
+            # verbose=False,
+            # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+            # debugging_port=int(port),
+            use_managed_browser=True,
+            cdp_url=page.browser._driver._websocket_url
+            # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
+        )
+        result = None  # ensure result is defined if the crawl raises before assignment
+        try:
+            async with AsyncWebCrawler(config=craw_ai_browser_config) as crawler:
+                crawler_config = CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS
+                )
+                result:CrawlResult = await crawler.arun(url=url, config=crawler_config)
+            logger.info(f"{result.markdown}")
+            logger.info(f"{result.model_dump_json()}")
+        except Exception as e:
+            logger.exception(f"{e}")
+        finally:
+            page.quit()
+        return result
+    
+    def run_browser(self):
+        page = load_chrome_from_ini(
+            self.chrome_options
+        )
+        return page
+    
+    def get_asin_url(self, asin:str, asin_area:str):
+        # https://www.asinseed.com/en/JP?q=B0CQ1SHD8V
+        return f"https://www.asinseed.com/en/{asin_area}?q={asin}"
+
+    def get_asin_page_data(self, asin:str, asin_area:str, mhtml_type:bool=True):
+        page = load_chrome_from_ini(
+            self.chrome_options
+        )
+        url = self.get_asin_url(asin, asin_area)
+        page.get(url)
+        if mhtml_type:
+            return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
+        else:
+            return page.html
+    def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mhtml_type:bool=True, save_path:str=None):
+        data = self.get_asin_page_data(asin, asin_area, mhtml_type)
+        save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
+        save_to_file(data, save_path)
+        return save_path
+async def task():
+    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
+    c = Crawler(ChromeOptions())
+    page = c.run_browser()
+    logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
+    tab = page.latest_tab
+    tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    save_to_file(page.html, str(OUTPUT_DIR / 'page/debug' / f'{asin[0]}-from-mthml.html'))
+    # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
+    # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
+    # await c.run('https://fr.florame.com/en/essential-oils')
+    return
+    port = page.browser._chromium_options._address.split(':')[-1]
+    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
+    logger.info(f"{page.browser._driver._websocket_url}")
+    item_id = 1
+    # url = 'https://greg.app/acalypha-marissima-overview/'
+    url = 'https://fr.florame.com/en/essential-oils'
+    # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
+    # url = 'https://baidu.com'
+    browser_config = BrowserConfig(
+        headless=False,
+        # verbose=False,
+        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+        # debugging_port=int(port),
+        use_managed_browser=True,
+        cdp_url=page.browser._driver._websocket_url
+        # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
+    )
+    # async with AsyncWebCrawler(config=browser_config) as crawler:
+    #     crawler_config = CrawlerRunConfig(
+    #         cache_mode=CacheMode.BYPASS                
+    #     )
+    #     result = await crawler.arun(url=url, config=crawler_config)
+    #     print(result.markdown)
+
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+    result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
+    logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)} ")
+    print(result.markdown)
+    input('press enter to continue')
+    await crawler.close()
+    # page.quit()
+
+def main():
+    asyncio.run(task())
+    # test()
+
+if __name__ == "__main__":
+    main()
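A hedged usage sketch for the Crawler class above (assuming the repository root is on PYTHONPATH and src is importable as a package; the ASIN is one of the sample values used in task()):

```python
# Sketch: fetch one ASIN page through the DrissionPage-managed browser and save it locally.
from src.browser.crawl_asin import Crawler
from utils.drission_page import ChromeOptions

crawler = Crawler(ChromeOptions())  # defaults: config/dp_conf/9321.ini profile, proxy localhost:8851
saved = crawler.get_asin_and_save_page("B0CQ1SHD8V", asin_area="JP")
print(saved)                        # e.g. output/page/asin/B0CQ1SHD8V.html
```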

+ 10 - 0
src/sql/create_user.sql

@@ -0,0 +1,10 @@
+-- Create the user (only if it does not already exist)
+DO $$
+BEGIN
+  IF NOT EXISTS (SELECT FROM pg_roles WHERE rolname = 'user') THEN
+    CREATE USER "user" WITH PASSWORD 'user';
+  END IF;
+END $$;
+
+-- Grant the user CONNECT privilege on the copywriting_production database
+GRANT CONNECT ON DATABASE copywriting_production TO "user";

+ 17 - 0
src/sql/grant_permissions.sql

@@ -0,0 +1,17 @@
+-- Grant privileges on the public schema
+GRANT CREATE, USAGE ON SCHEMA public TO "user";
+
+-- Grant all privileges on existing tables, sequences, and functions
+GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO "user";
+GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO "user";
+GRANT ALL PRIVILEGES ON ALL FUNCTIONS IN SCHEMA public TO "user";
+
+-- Set default privileges (applies to objects created afterwards)
+ALTER DEFAULT PRIVILEGES IN SCHEMA public 
+GRANT ALL PRIVILEGES ON TABLES TO "user";
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA public 
+GRANT ALL PRIVILEGES ON SEQUENCES TO "user";
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA public 
+GRANT ALL PRIVILEGES ON FUNCTIONS TO "user";

+ 115 - 0
utils/drission_page.py

@@ -0,0 +1,115 @@
+import os
+import time
+from typing import Optional
+from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
+from pathlib import Path
+from config.settings import OUTPUT_DIR, WORK_DIR, BROWSER_CONFIG_DIR
+from utils.logu import logger
+from pydantic import BaseModel
+from fake_useragent import UserAgent
+
+BROWSER_PATH=r"C:\Program Files\Google\Chrome\Application\chrome.exe"
+
+def generate_chrome_ini(address="localhost:9321"):
+    port = address.split(':')[1]
+    chrome_options = ChromiumOptions().set_browser_path(BROWSER_PATH)
+    chrome_options.set_address(address)
+    chrome_options.set_user_data_path(str(OUTPUT_DIR / f'user_data_dir_{port}'))
+    # chrome_options.no_imgs(True).mute(True)
+    # chrome_options.incognito(True)
+    path = chrome_options.save(BROWSER_CONFIG_DIR / f'{port}.ini')
+    return path
+
+class ChromeOptions(BaseModel):
+    ini_path: Optional[str] = str(BROWSER_CONFIG_DIR / '9321.ini')
+    browser_path: Optional[str] = BROWSER_PATH
+    user_data_dir: Optional[str] = str(OUTPUT_DIR / 'user_data_dir')
+    address: Optional[str] = "localhost:9321"
+    headless: Optional[bool] = False
+    proxy: Optional[str] = "localhost:8851"
+    no_imgs: Optional[bool] = False
+    auto_port: Optional[bool] = False
+    save: Optional[bool] = False
+
+def load_chrome_from_ini(options:ChromeOptions):
+    chrome_options = ChromiumOptions(ini_path=options.ini_path)
+    if options.browser_path:
+        chrome_options.set_browser_path(options.browser_path)
+    if options.proxy:
+        chrome_options.set_proxy(options.proxy)
+    # Fall back to the HTTP_PROXY environment variable when no explicit proxy is given
+    elif 'HTTP_PROXY' in os.environ:
+        chrome_options.set_proxy(os.environ['HTTP_PROXY'])
+    if options.user_data_dir:
+        chrome_options.set_user_data_path(options.user_data_dir)
+    chrome_options.auto_port(options.auto_port)
+    chrome_options.no_imgs(options.no_imgs)
+    chrome_options.headless(options.headless)
+    chrome_options.set_address(options.address)
+    if options.save:
+        chrome_options.save(options.ini_path)
+    logger.info(f"proxy {options.proxy}")
+    page = ChromiumPage(chrome_options)
+    return page
+
+def fake_ua():
+    # Create a UserAgent object
+    ua = UserAgent()
+
+    # Generate User-Agent strings for the supported browsers
+    chrome_ua = ua.chrome    # Chrome
+    firefox_ua = ua.firefox  # Firefox
+    safari_ua = ua.safari    # Safari
+    edge_ua = ua.edge        # Chromium Edge
+
+    # Print the generated User-Agent strings
+    print("Chrome User-Agent:", chrome_ua)
+    print("Firefox User-Agent:", firefox_ua)
+    print("Safari User-Agent:", safari_ua)
+    print("Edge User-Agent:", edge_ua)
+    return chrome_ua
+
+def load_random_ua_chrome(headless=False):
+    chrome_options = ChromiumOptions()
+    chrome_options.auto_port(True)
+    chrome_options.no_imgs(False)
+    chrome_options.set_user_agent(fake_ua())
+    chrome_options.arguments.append("--lang=en")
+    chrome_options.headless(headless)
+    page = ChromiumPage(chrome_options)
+    # page.set.auto_handle_alert(True)
+    return page
+
+def test_random_ua_chrome():
+    page = load_random_ua_chrome()
+    tab = page.latest_tab
+    keyword = "Acalypha rivularis essential oil"
+    url = f"https://www.google.com/search?q={keyword}"
+    # url = f"https://www.google.com/"
+    # url = "https://bot.sannysoft.com/"
+    tab.get(url)
+    print(tab.url)
+    if page.browser._chromium_options.is_headless:
+        tab.get_screenshot('./1.png')
+    # page.quit()
+
+def test_normal_chrome():
+    # generate_chrome_ini()
+    page = load_chrome_from_ini(ChromeOptions(proxy='http://localhost:1881'))
+    tab = page.latest_tab
+    keyword = "Acalypha rivularis essential oil"
+    url = f"https://www.google.com/search?q={keyword}"
+    url = "https://bot.sannysoft.com/"
+    # reCAPTCHA challenge detection
+    # url = "https://patrickhlauke.github.io/recaptcha/"
+    tab.get(url)
+    tab.scroll.to_bottom()
+    # tab.get_screenshot('./1.png')
+    # page.quit()
+
+def main():
+    test_random_ua_chrome()
+    # test_normal_chrome()
+    
+if __name__ == "__main__":
+    main()
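A small sketch of creating a second browser profile with generate_chrome_ini above and then loading it (the port 9322 is an arbitrary example, not one used in this commit):

```python
# Sketch: generate an ini for a second debug port, then drive a browser from it.
from utils.drission_page import ChromeOptions, generate_chrome_ini, load_chrome_from_ini

ini_path = generate_chrome_ini(address="localhost:9322")   # writes config/dp_conf/9322.ini
page = load_chrome_from_ini(ChromeOptions(ini_path=str(ini_path), address="localhost:9322"))
tab = page.latest_tab
tab.get("https://www.asinseed.com/en/JP?q=B0CQ1SHD8V")
print(tab.title)
page.quit()
```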

+ 15 - 0
utils/file.py

@@ -0,0 +1,15 @@
+import json
+from pathlib import Path
+
+
+def save_to_file(content, filename: Path):
+    if not isinstance(content, str):
+        # Serialize as JSON when possible, otherwise fall back to str()
+        try:
+            content = json.dumps(content, indent=4, ensure_ascii=False)
+        except Exception:
+            content = str(content)
+
+    with open(filename, "w", encoding="utf-8") as file:
+        file.write(content)
+    return filename

+ 52 - 0
utils/logu.py

@@ -0,0 +1,52 @@
+import os
+import sys
+import loguru
+
+from config.settings import LOG_LEVEL, LOG_DIR
+
+# See python_xx/site-packages/loguru/_handler.py  _serialize_record for the available record fields
+FORMAT = '<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{file}:{line}</cyan> :<cyan>{function}</cyan> - {message}'
+loguru.logger.remove()
+
+if not os.path.exists(LOG_DIR):
+    os.makedirs(LOG_DIR)
+
+loggers = {}
+
+def get_logger(name, console=True, console_level="INFO", file=True, file_level="DEBUG"):
+    '''
+    Usage:
+        # Create a plain logger with console output enabled; it is saved to {LOG_DIR}/default.log by default
+        logger = get_logger("default", console=True)
+
+        # Create a named logger with its own log file, saved to {LOG_DIR}/斌的世界/gift.log
+        user_log = get_logger("斌的世界/gift")
+
+        # Smoke test
+        user_log.info("Console and file sinks have been attached to the logger object")
+        logger.info("This is an info message")
+    '''
+    global loggers
+    if name in loggers:
+        return loggers[name]  # Return the cached logger if it already exists
+    # Each name gets its own log file
+    log_file = f"{name}.log"
+    # Add sinks that only pass records bound to this logger name
+    if file:
+        loguru.logger.add(LOG_DIR / log_file, level=file_level, format=FORMAT, filter=lambda record: record["extra"].get("name") == name)
+    if console:
+        loguru.logger.add(sys.stderr, format=FORMAT, level=console_level, filter=lambda record: record["extra"].get("name") == name)
+    user_logger = loguru.logger.bind(name=name)
+    loggers[name] = user_logger
+    return user_logger
+
+logger = get_logger('main')
+
+if __name__ == '__main__':
+    user_log = get_logger("斌的世界/gift")
+    user_log.info("Console and file sinks have been attached to the logger object")
+    logger.info("This is an info message")

+ 19 - 0
utils/sql_engine.py

@@ -0,0 +1,19 @@
+from config.settings import OUTPUT_DIR, WORK_DIR, DB_URL
+from datetime import datetime
+from sqlmodel import Field, SQLModel, create_engine, Session, select
+from sqlalchemy import text  # needed for the raw DROP TABLE statement below
+
+
+engine = create_engine(DB_URL, echo=False)
+
+
+def create_db_and_tables():
+    SQLModel.metadata.create_all(engine)
+
+def drop_table(model: SQLModel):
+    """删除SearchResult表以便重新创建"""
+    with engine.connect() as conn:
+        # Use text() to create an executable SQL statement
+        conn.execute(text(f"DROP TABLE IF EXISTS {model.__tablename__}"))
+        conn.commit()
+    SQLModel.metadata.create_all(engine)
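A hedged sketch of how these helpers would be used; the Asin model below is hypothetical and only for illustration, it is not defined anywhere in this commit:

```python
# Sketch: register a table on SQLModel.metadata, create it, and insert one row.
from typing import Optional
from sqlmodel import Field, SQLModel, Session
from utils.sql_engine import engine, create_db_and_tables

class Asin(SQLModel, table=True):  # hypothetical example model
    id: Optional[int] = Field(default=None, primary_key=True)
    asin: str = Field(index=True)
    area: str = "JP"

create_db_and_tables()             # creates every table registered on SQLModel.metadata
with Session(engine) as session:
    session.add(Asin(asin="B0CQ1SHD8V"))
    session.commit()
```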