Bläddra i källkod

新增多浏览器采集竞品;优化产品导入方式,支持批量竞品解析

mrh 11 månader sedan
förälder
incheckning
daeeeef694

+ 10 - 0
ai/base_agent.py

@@ -0,0 +1,10 @@
+from llama_index.llms.litellm import LiteLLM
+from src.manager.template_manager import TemplateManager, TemplateService, TemplateType
+
class BaseAgent:
    """Common base class for LLM-driven agents.

    Holds the two collaborators every agent needs: the LLM client used to
    run completions and the template manager used to build prompts.
    Concrete agents (e.g. MarketingAgent) subclass this.
    """

    def __init__(self, llm: LiteLLM, template_manager: TemplateManager):
        """
        Args:
            llm: LiteLLM client used for completion calls.
            template_manager: provides prompt templates for this agent.
        """
        self.llm = llm
        self.template_manager = template_manager

    def get_mainkeys_tailkeys(self, template_str: str):
        """Extract main/tail keywords driven by *template_str*.

        Placeholder hook: subclasses are expected to override this.
        The base implementation does nothing and returns None.
        """
        pass

+ 23 - 22
ai/marketting_agent.py

@@ -21,22 +21,17 @@ from llama_index.core.llms.llm import LLM
 from src.models.product_model import Product
 from src.manager.template_manager import TemplateManager, TemplateService, TemplateType
 from src.models.ai_execution_record import MarketingInfo, LLMConfig, SuperPromptMixin, AgentConfig, AgentContent, AICompetitorAnalyzeMainKeywords, AICompetitorAnalyzeMainKeywordsResult, MarketingContentGeneration
+from ai.base_agent import BaseAgent
 from utils.file import save_to_file, read_file
 from config.settings import MONGO_URL, MONGO_DB_NAME,LITELLM_API_BASE, LITELLM_API_KEY,OPENAI_API_KEY,OPENAI_API_BASE
 from utils.logu import get_logger
 logger = get_logger("ai")
 
-class BaseAgent:
-    def __init__(self, llm:LiteLLM, template_manager:TemplateManager):
-        self.llm = llm
-        self.template_manager = template_manager
-    
-    def get_mainkeys_tailkeys(self, template_str: str):
-        pass
+
     
 class MarketingAgent(BaseAgent):
-    async def get_mainkeys_tailkeys_prompt(self, product_name, prompt: str='',output_type='markdown', verbose=False):
-        base_prompt = "{{product_info}}\n{{competitor_info}}\n"
+    async def get_mainkeys_tailkeys_prompt(self, product_name, prompt: str='',output_type='markdown', verbose=False) -> PromptTemplate:
+        base_prompt = "```\n{{product_info}}\n{{competitor_info}}\n```\n"
         prompt_mainkyes = prompt or '''\
 你是日本站的亚马逊运营,请你根据产品信息为用户选出主要关键词和长尾关键词。
 
@@ -173,9 +168,11 @@ class MarketingAgent(BaseAgent):
             logger.info(f"llm_name: {model.model_name} , {markdown[:100]}")
             unsorted_content += f"# {display_name}\n{markdown}\n\n"  # 使用基础名称
         
-        content = sorted_content + unsorted_content
+        prompt_template = await self.get_mainkeys_tailkeys_prompt(product_name, output_type='markdown')
+        content = sorted_content + unsorted_content + f'# 提示词 \n{prompt_template.format()}\n\n'
+
         return save_to_file(content, output_path)
-async def llm_task():
+async def llm_task(product_name):
     m = TemplateManager(MONGO_URL, MONGO_DB_NAME)
     await m.initialize()
     model = 'openai/groq/llama-3.1-8b-instant'
@@ -186,17 +183,19 @@ async def llm_task():
     # model = 'openai/deepseek-chat'
     model = 'openai/deepseek-reasoner'
     # model = 'openai/doubao-pro-32k-241215'
-    llm_models = [
+    llm_list = [
+        # 'openai/doubao-pro-32k-241215',
+        LiteLLM(model='openai/doubao-pro-32k-241215', api_key=OPENAI_API_KEY, api_base=OPENAI_API_BASE),
+        # 'openai/deepseek-reasoner',
+        LiteLLM(model='openai/deepseek-reasoner', api_key=OPENAI_API_KEY, api_base=OPENAI_API_BASE),
         # 'openai/deepseek-v3',
-        'openai/QwQ-32B',
-        'openai/deepseek-reasoner',
-        'openai/doubao-pro-32k-241215',
+        LiteLLM(model='openai/deepseek-v3', api_key=OPENAI_API_KEY, api_base=OPENAI_API_BASE),
+        # 'openai/QwQ-32B',
     ]
     task_list = []
-    for model in llm_models:
-        llm = LiteLLM(model=model, api_key=OPENAI_API_KEY, api_base=OPENAI_API_BASE)
+    for llm in llm_list:
         agent = MarketingAgent(llm=llm, template_manager=m)
-        agent_model = agent.gen_mainkeys_tailkeys(product_name='大尺寸厚款卸妆棉240片', verbose=True, overwrite=True)
+        agent_model = agent.gen_mainkeys_tailkeys(product_name=product_name, verbose=True, overwrite=True)
         task_list.append(agent_model)
         # logger.info(f"{agent_model.competitor.items()}")
     await asyncio.gather(*task_list)
@@ -204,24 +203,26 @@ async def llm_task():
     # agent = MarketingAgent(llm=llm, template_manager=m)
     # agent_model = await agent.gen_mainkeys_tailkeys(product_name='大尺寸厚款卸妆棉240片', verbose=True, overwrite=True)
     # logger.info(f"{agent_model.competitor.items()}")
-async def gen_marketing_file():
+async def gen_marketing_file(product_name):
     m = TemplateManager(MONGO_URL, MONGO_DB_NAME)
     await m.initialize()
     model = 'openai/deepseek-reasoner'
     llm = LiteLLM(model=model, api_key=OPENAI_API_KEY, api_base=OPENAI_API_BASE)
-    product_name = '大尺寸厚款卸妆棉240片'
+    # product_name = '大尺寸厚款卸妆棉240片'
     agent = MarketingAgent(llm=llm, template_manager=m)
     output_path = r'G:\code\amazone\copywriting_production\output\temp' + f"\\{product_name}-营销文案.md"
     llm_models = [
 'openai/doubao-pro-32k-241215',
 'openai/deepseek-reasoner',
 'openai/deepseek-v3', 
-'openai/QwQ-32B',
+# 'openai/QwQ-32B',
  ]
     await agent.gen_marketing_file(product_name=product_name, output_path=output_path, llm_models=llm_models)
     logger.info(f"{output_path}")
 def main():
-    asyncio.run(gen_marketing_file())
+    product_name = '养花专用园艺迷你3件套'
+    # product_name = '园艺镊子套装2件套'
+    asyncio.run(gen_marketing_file(product_name=product_name))
 
 if __name__ == "__main__":
     main()

+ 3 - 0
ai/product_agent.py

@@ -0,0 +1,3 @@
+from ai.base_agent import BaseAgent
+from llama_index.core.prompts import PromptTemplate
+from llama_index.llms.litellm import LiteLLM

+ 4 - 2
config/celery.py

@@ -4,6 +4,9 @@ from config import celery_config
 from celery import signals
 import os
 from utils.logu import logger
+from utils.drission_page import ChromeOptions
+from utils.drission_page import load_chrome_from_ini,ChromeOptions
+
 
 @signals.worker_init.connect
 def on_worker_init(sender=None, **kwargs):
@@ -12,14 +15,13 @@ def on_worker_init(sender=None, **kwargs):
     cfg_info = CFG.model_dump(exclude={'s3_secret_key'})
     # s3_secret_key not show in log
     logger.info(f"Worker启动配置检查:\n{cfg_info}")
-    
     # 状态检查示例:确保输出目录存在
     output_dir = f"{CFG.s3_prefix}/output"
     if not os.path.exists(output_dir):
         os.makedirs(output_dir, exist_ok=True)
         logger.warning(f"创建缺失的output目录: {output_dir}")
-    
     logger.info(f"Worker初始化完成,当前配置版本: {CFG.version}")
+    logger.info(f"浏览器配置: {CFG.chrome_config_ini}")
 
 @signals.worker_shutdown.connect
 def on_worker_shutdown(sender=None, **kwargs):

+ 34 - 0
config/dp_conf/9322.ini

@@ -0,0 +1,34 @@
+[paths]
+download_path = .
+tmp_path = 
+
+[chromium_options]
+address = 127.0.0.1:9322
+browser_path = C:\Program Files\Google\Chrome\Application\chrome.exe
+arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--mute-audio', '--lang=en-US', '--proxy-server=localhost:8851', '--user-data-dir=G:\\code\\amazone\\copywriting_production\\output\\user_data_dir2']
+extensions = []
+prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}}
+flags = {}
+load_mode = normal
+user = Default
+auto_port = False
+system_user_path = False
+existing_only = False
+new_env = False
+
+[session_options]
+headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'}
+
+[timeouts]
+base = 10
+page_load = 30
+script = 30
+
+[proxies]
+http = localhost:8851
+https = localhost:8851
+
+[others]
+retry_times = 3
+retry_interval = 2
+

+ 4 - 0
config/dp_conf/9322_worker.yaml

@@ -0,0 +1,4 @@
+chrome_config_ini: G:\code\amazone\copywriting_production\config\dp_conf\9322.ini
+s3_access_key: bh9LbfsPHRJgQ44wXIlv
+s3_endpoint: http://vs1.lan:9002
+storage: s3

+ 34 - 0
config/dp_conf/9323.ini

@@ -0,0 +1,34 @@
+[paths]
+download_path = .
+tmp_path = 
+
+[chromium_options]
+address = 127.0.0.1:9323
+browser_path = C:\Program Files\Google\Chrome\Application\chrome.exe
+arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--mute-audio', '--lang=en-US', '--proxy-server=localhost:8851', '--user-data-dir=G:\\code\\amazone\\copywriting_production\\output\\user_data_dir3']
+extensions = []
+prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}}
+flags = {}
+load_mode = normal
+user = Default
+auto_port = False
+system_user_path = False
+existing_only = False
+new_env = False
+
+[session_options]
+headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'}
+
+[timeouts]
+base = 10
+page_load = 30
+script = 30
+
+[proxies]
+http = localhost:8851
+https = localhost:8851
+
+[others]
+retry_times = 3
+retry_interval = 2
+

+ 4 - 0
config/dp_conf/9323_worker.yaml

@@ -0,0 +1,4 @@
+chrome_config_ini: G:\code\amazone\copywriting_production\config\dp_conf\9323.ini
+s3_access_key: bh9LbfsPHRJgQ44wXIlv
+s3_endpoint: http://vs1.lan:9002
+storage: s3

+ 1 - 1
config/settings.py

@@ -29,7 +29,7 @@ class Config(BaseModel):
     s3_secret_key: Optional[str] = os.environ.get("S3_SECRET_KEY", 'N744RZ60T1b4zlcWG2MROCzjEE2mPTdNQCc7Pk3M')
     s3_endpoint: Optional[str] = os.environ.get("S3_ENDPOINT", 'http://s3.vs1.lan')
     s3_prefix: Optional[str] = 's3://public/amazone/copywriting_production'
-    chrome_config_ini: Optional[str] = r'G:\code\amazone\copywriting_production\config\dp_conf\9321.ini'
+    chrome_config_ini: Optional[str] = r'G:\code\amazone\copywriting_production\config\dp_conf\9322.ini'
     redis_url: Optional[str] = os.environ.get("REDIS_URL", 'redis://localhost:6379/0')
     version: Optional[str] = "0.0.1-alpha"
     def save(self, config_path: Path = None):

+ 54 - 0
docs/gpt/excel_product_sheet.md

@@ -1,3 +1,57 @@
+导入产品数据 product_name
+src\manager\manager_task.py
+```python
+async def gardening_tools_to_mongo():
+    manager = ManagerTask()
+    await manager.db_mongo.initialize()
+    product_name = "养花专用园艺迷你3件套"
+    exist_product = await Product.find_one(Product.basic_info.name == product_name)
+    if exist_product:
+        logger.info(f"产品 {product_name} 已存在")
+        product = exist_product
+    else:
+        product = Product(basic_info=ProductBaseInfo(
+            name=product_name,
+            content='塑料柄耙子1个,塑料柄圆头铲1个,塑料柄尖头铲1个',
+            material='不锈钢头,塑料制的柄',
+            color='银色',
+            main_usage='多肉等盆栽植物用的工具',
+            selling_point = [
+                '1.不锈钢头材质,做工精密不易生锈',
+                '2.手柄缠头铆钉加固,不易脱落',
+                '3.塑料磨砂防滑手柄,弧形贴合手掌,握感舒适',
+                '4.用途,圆头铲用于填土,施肥,移植。尖头铲用于深挖,切割。耙子用于松土,除草,种植适用于多肉植物和小盆栽',
+            ]
+        ))
+    
+    asinseed_list = ['B08YRPFCWH', 'B0D816V61B', 'B0CVX4V2NK', 'B086LDLCW9', 'B0CRTX616P']
+
+```
+参考上述的文件,帮我导入另一个产品数据:
+```
+产品基本信息(中文)				
+产品名称	脚趾保护套			
+包装内容	20枚			
+材质	银纤维			
+颜色	灰色			
+尺寸	4.5*2cm			
+包裹尺寸	13*9*1.5cm			
+重量	15g			
+主要用途	保护脚趾			
+主要卖点	抗菌抑菌			
+	抗臭			
+	吸湿透气			
+	柔软舒适 耐久性好			
+竞品			主关键词	
+B0CP94YMWM			指サック 親指	
+B0BPN5F3Q6			足指ソックス	
+B0CYB1DJM7			足カバー	
+B0CYBRD55M			親指サック	
+B09L7W8M3X			保護 足指ソックス	
+B0CB5RJCYV			足指サック	
+
+```
+
 @/src/excel_tools/excel_writer.py 
 上述文件是完成了一个 excel 文件的其中一个工作表“竞品关键词调研1”。不过,其实文件还有很多工作表 "产品信息2" "产品信息" 。因此我希望新建另一个文件,用于生成各种各样的工作表,当前已经生成了 “竞品关键词调研1” 表,但不要写入文件,而是从模板文件 "G:\code\amazone\copywriting_production\output\resource\文案制作-template.xlsx" 中读取 "产品信息2" "产品信息"  全部信息包括样式。用于新的 excel 文件生成。注意模板文件是只读的。并且不能跨文件复制工作表,因此我觉得最好的做法就是通过 python 复制模板文件,然后打开复制后的文件进行编辑写入。
 

+ 7 - 2
src/excel_tools/file_manager.py

@@ -70,10 +70,15 @@ class ExcelFileManager:
         return self.s3_storage_manager.load_s3_complete_extract_data()
 
 async def main():
-    self = ExcelFileManager(r"G:\code\amazone\copywriting_production\output\resource\extra-data-大尺寸厚款卸妆棉240片.xlsx")
+    product_name = '园艺镊子套装2件套'
+    # product_name = '养花专用园艺迷你3件套'
+    self = ExcelFileManager(
+        r"G:\code\amazone\copywriting_production\output\resource\extra-data-"+ f"{product_name}.xlsx",
+        r"G:\code\amazone\copywriting_production\output\resource\镊子套装文案制作模版.xlsx")
+    self.TEMPLATE_PATH
     db_mongo = BaseMongoManager()  # 自动初始化
     await db_mongo.initialize()
-    product = await Product.find_one(Product.basic_info["name"] == "大尺寸厚款卸妆棉240片")
+    product = await Product.find_one(Product.basic_info["name"] == product_name)
 
     logger.info(f"{product}")
     extract_data_lsit = product.competitor_crawl_data

+ 35 - 11
src/manager/manager_task.py

@@ -311,23 +311,47 @@ async def main():
     # await manager.async_analyze_and_save(product, dry_run=False, over_write=True)
     # await manager.submit_search_mainkeyword(product)
     return
-async def run_asinseed_task():
-    manager = ManagerTask()
-    asinseed_list = ['B0BTHX39VZ', 'B081SQRGZP', 'B003UOO8PG', 'B01DNS2FP8', 'B07YQ3BH96']
-    return
async def run_asinseed_task(manager: ManagerTask, asinseed_list, over_write=False):
    """Crawl and extract asinseed data for each ASIN, sequentially.

    For every ASIN this first submits the crawl task and waits for it
    (honouring *over_write* to force re-crawling already-stored results),
    then submits the extract task and waits for it, before moving on to
    the next ASIN.

    Args:
        manager: task manager owning the submit-and-wait helpers.
        asinseed_list: iterable of ASIN strings to process.
        over_write: when True, re-crawl ASINs that already have results.
            Note: only the crawl step honours this flag; the extract step
            always runs.
    """
    for asin in asinseed_list:
        logger.info(f"{asin}")
        manager.submit_asinseed_task_and_wait(asin, overwrite=over_write)
        manager.submit_extract_task_and_wait(asin)
 
-async def asinseed_to_mongo():
+# docs\gpt\excel_product_sheet.md
+async def gardening_tools_to_mongo():
     manager = ManagerTask()
     await manager.db_mongo.initialize()
-    # product = await Product.find_one(Product.basic_info.name == "电线保护套")
-    product = Product(basic_info=ProductBaseInfo(name="大尺寸厚款卸妆棉240片"), )
-    asinseed_list = ['B0BTHX39VZ', 'B081SQRGZP', 'B003UOO8PG', 'B01DNS2FP8', 'B07YQ3BH96']
+    product_name = "脚趾保护套"
+    exist_product = await Product.find_one(Product.basic_info.name == product_name)
+    if exist_product:
+        logger.info(f"产品 {product_name} 已存在")
+        product = exist_product
+    else:
+        product = Product(basic_info=ProductBaseInfo(
+            name=product_name,
+            content='20枚',
+            material='银纤维',
+            color='灰色',
+            size='4.5*2cm',
+            package_size='13*9*1.5cm',
+            weight='15g',
+            main_usage='保护脚趾',
+            selling_point = [
+                '1.抗菌抑菌',
+                '2.抗臭',
+                '3.吸湿透气',
+                '4.柔软舒适 耐久性好',
+            ]
+        ))
+    
+    asinseed_list = ['B0CP94YMWM', 'B0BPN5F3Q6', 'B0CYB1DJM7', 'B0CYBRD55M', 'B09L7W8M3X', 'B0CB5RJCYV']
+    # asinseed_list = ['B0CVX4V2NK', 'B086LDLCW9']
+
+    await run_asinseed_task(manager, asinseed_list)
     for asin in asinseed_list:
         logger.info(f"{asin}")
         asin_model = manager.db.get_asin_seed(asin)
@@ -337,8 +361,8 @@ async def asinseed_to_mongo():
             asin_area=asin_model.asin_area,
             mhtml_path=asin_model.mhtml_path,
             extra_result=json.loads(crawl_data)
-            )
+        )
     await product.save()
 
 if __name__ == "__main__":
-    asyncio.run(asinseed_to_mongo())
+    asyncio.run(gardening_tools_to_mongo())

+ 7 - 0
src/readme.md

@@ -3,8 +3,15 @@
 ssh mrh@sv-v2
 cd ~/program/redis
 dc up -d
+# mahui4228@gmail.com
 $env:CONFIG_PATH="G:\code\amazone\copywriting_production\config\dp_conf\9321_worker.yaml";celery -A src.tasks.crawl_asin_save_task worker --loglevel=info --hostname=9321@%h
 
+# mahui6188@gmail.com U*
+$env:CONFIG_PATH="G:\code\amazone\copywriting_production\config\dp_conf\9322_worker.yaml";celery -A src.tasks.crawl_asin_save_task worker --loglevel=info --hostname=9322@%h
+
+$env:CONFIG_PATH="G:\code\amazone\copywriting_production\config\dp_conf\9323_worker.yaml";celery -A src.tasks.crawl_asin_save_task worker --loglevel=info --hostname=9323@%h
+
+
 celery -A config.celery flower 
 
 python -m src.manager.cli_tasks --help

+ 8 - 1
src/tasks/crawl_asin_save_task.py

@@ -6,9 +6,16 @@ from config.settings import CFG
 from utils.drission_page import ChromeOptions
 from utils.file import check_exists, save_to_file,s3_uri_to_http_url
 from utils.logu import get_logger
+from utils.drission_page import load_chrome_from_ini,ChromeOptions
 
-logger = get_logger('worker')
+logger = get_logger('worker')   
 import asyncio
+logger.info(f"Worker初始化完成,当前配置版本: {CFG.version}")
+logger.info(f"浏览器配置: {CFG.chrome_config_ini}")
+chrome_options = ChromeOptions(ini_path=CFG.chrome_config_ini)
+page = load_chrome_from_ini(chrome_options)
+logger.info(f"_chromium_options.address: {page._browser._chromium_options.address}")
+page.get('chrome://version')
 
 
 @app.task(bind=True,

+ 18 - 0
tests/mytest/t_page.py

@@ -0,0 +1,18 @@
+from celery import Celery
+from config.settings import CFG
+from config import celery_config
+from celery import signals
+import os
+from utils.logu import logger
+from utils.drission_page import ChromeOptions
+from utils.drission_page import load_chrome_from_ini,ChromeOptions
def main():
    """Smoke-test: launch Chrome from the configured ini and open chrome://version."""
    logger.info(f"Worker初始化完成,当前配置版本: {CFG.version}")
    logger.info(f"浏览器配置: {CFG.chrome_config_ini}")
    opts = ChromeOptions(ini_path=CFG.chrome_config_ini)
    browser_page = load_chrome_from_ini(opts)
    # NOTE(review): reaches into private attributes to report the debug
    # address — confirm DrissionPage exposes no public accessor for this.
    logger.info(f"_chromium_options.address: {browser_page._browser._chromium_options.address}")
    browser_page.get('chrome://version')


if __name__ == "__main__":
    main()

+ 12 - 8
utils/drission_page.py

@@ -21,11 +21,11 @@ def genarate_chrome_ini(address="localhost:9321"):
 
 class ChromeOptions(BaseModel):
     ini_path: Optional[str] = BROWSER_CONFIG_DIR / '9321.ini'
-    browser_path: Optional[str] = BROWSER_PATH
-    user_data_dir: Optional[str] = str(OUTPUT_DIR / 'user_data_dir')
-    address: Optional[str] = "localhost:9321"
+    browser_path: Optional[str] = None
+    user_data_dir: Optional[str] = None
+    address: Optional[str] = None
     headless: Optional[bool] = False
-    proxy: Optional[str] = "localhost:8851"
+    proxy: Optional[str] = None
     no_imgs: Optional[bool] = False
     auto_port: Optional[bool] = False
     save: Optional[bool] = False
@@ -41,10 +41,14 @@ def load_chrome_from_ini(options:ChromeOptions):
     # 如果存在代理环境变量
     elif 'HTTP_PROXY' in os.environ:
         chrome_options.set_proxy(os.environ['HTTP_PROXY'])
-    chrome_options.auto_port(options.auto_port)
-    chrome_options.no_imgs(options.no_imgs)
-    chrome_options.headless(options.headless)
-    chrome_options.set_address(options.address)
+    if options.auto_port:
+        chrome_options.auto_port(options.auto_port)
+    if options.no_imgs:
+        chrome_options.no_imgs(options.no_imgs)
+    if options.headless:
+        chrome_options.headless(options.headless)
+    if options.address:
+        chrome_options.set_address(options.address)
     if options.save:
         chrome_options.save(options.ini_path)
     logger.info(f"proxy {options.proxy}")