|
|
@@ -10,45 +10,18 @@ import random
|
|
|
from pprint import pprint
|
|
|
from datetime import datetime
|
|
|
from src.flow_task.crawl_asin import crawl_asin_flow, CrawlAsinInput, AsinAreaEnum
|
|
|
-from src.browser.browser_config import create_direct_browser_config
|
|
|
+from src.browser.browser_config import create_direct_browser_config,create_direct_browser_config_with_stats
|
|
|
from src.browser.crawl_base import AsinCrawlerBase
|
|
|
-
|
|
|
-def t_init_browser():
|
|
|
- """主函数"""
|
|
|
- # 获取所有浏览器配置
|
|
|
- browser_configs = create_direct_browser_config()
|
|
|
-
|
|
|
- print(f"获取到 {len(browser_configs)} 个浏览器配置")
|
|
|
-
|
|
|
- # 遍历所有浏览器配置,为每个配置创建AsinCrawlerBase实例并调用get_home_page
|
|
|
- for port, browser_config in browser_configs.items():
|
|
|
- print(f"正在使用浏览器配置: 端口 {port}, 账号 {browser_config.account}")
|
|
|
-
|
|
|
- try:
|
|
|
- # 使用AsinCrawlerBase的create_browser类方法创建实例
|
|
|
- crawler = AsinCrawlerBase.create_browser(
|
|
|
- address=browser_config.browser_address,
|
|
|
- user_data_dir=str(browser_config.browser_userdata_dir)
|
|
|
- )
|
|
|
-
|
|
|
- # 调用get_home_page方法
|
|
|
- print(f"正在为账号 {browser_config.account} 获取首页...")
|
|
|
- crawler.get_home_page()
|
|
|
- print(f"成功为账号 {browser_config.account} 获取首页")
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- print(f"为账号 {browser_config.account} 获取首页时出错: {e}")
|
|
|
-
|
|
|
- print("-" * 50)
|
|
|
-
|
|
|
+from src.flow_task.db.product_import_db import product_import_manager
|
|
|
|
|
|
competitor_list = ["B09MQMTBJW","B000THQ4ZO","B0D6RVGL2M","B004OCLMTI","B0D7TKHSP4","B000THROUS","B08HK93VBD","B0C6LXPSVX","B0C8MRSD6P","B08LD1MZX4","B0CLCJXXWF"]
|
|
|
|
|
|
-def t_random_crawl_asin():
|
|
|
+def t_crawl_asin_flow():
|
|
|
"""随机选择一个ASIN和浏览器配置,调用crawl_asin_flow"""
|
|
|
# 获取所有浏览器配置
|
|
|
- browser_configs = create_direct_browser_config()
|
|
|
-
|
|
|
+ # browser_configs = create_direct_browser_config()
|
|
|
+ # {9321: AccountInBrowser(browser_address='127.0.0.1:9321', browser_userdata_dir=WindowsPath('G:/code/amazone/copywriting_production/output/browser_data/user_data_dir_9321'), chrome_config_ini=None, proxy=None, active=True, host_name='pc', account='mahui4228@gmail.com', password='password123', max_access_limit=10, remaining_access_limit=10)
|
|
|
+ browser_configs = create_direct_browser_config_with_stats()
|
|
|
if not browser_configs:
|
|
|
print("没有可用的浏览器配置")
|
|
|
return
|
|
|
@@ -57,40 +30,251 @@ def t_random_crawl_asin():
|
|
|
print("没有可用的ASIN列表")
|
|
|
return
|
|
|
|
|
|
+ pprint(browser_configs)
|
|
|
+
|
|
|
# 随机选择一个ASIN和浏览器配置
|
|
|
- random_asin = competitor_list[0]
|
|
|
- random_port = random.choice(list(browser_configs.keys()))
|
|
|
- random_browser_config = browser_configs[9323]
|
|
|
+ asin = competitor_list[2]
|
|
|
+ browser_config = browser_configs[9323]
|
|
|
|
|
|
- print(f"随机选择的ASIN: {random_asin}")
|
|
|
- print(f"随机选择的浏览器配置: 端口 {random_port}, 账号 {random_browser_config.account}")
|
|
|
+ print(f"随机选择的ASIN: {asin}")
|
|
|
+ print(f"随机选择的浏览器配置: 端口 {browser_config},")
|
|
|
|
|
|
try:
|
|
|
# 创建CrawlAsinInput对象
|
|
|
flow_input = CrawlAsinInput(
|
|
|
- asin=random_asin,
|
|
|
+ asin=asin,
|
|
|
asin_area=AsinAreaEnum.JP, # 默认使用日本地区
|
|
|
mthml_type=True, # 保存为MHTML格式
|
|
|
- overwrite=False, # 不覆盖已存在文件
|
|
|
- browser=random_browser_config
|
|
|
+ # refresh_cache=True, # 不覆盖已存在文件
|
|
|
+ refresh_cache=False,
|
|
|
+ browser=browser_config
|
|
|
)
|
|
|
|
|
|
print(f"开始执行ASIN爬取流程...")
|
|
|
|
|
|
# 调用crawl_asin_flow
|
|
|
- result = crawl_asin_flow(flow_input)
|
|
|
+ state = crawl_asin_flow(flow_input, return_state=True)
|
|
|
+ pprint(state)
|
|
|
|
|
|
+ pprint(state.result())
|
|
|
print(f"爬取流程执行成功")
|
|
|
- pprint(result)
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"执行ASIN爬取流程时出错: {e}")
|
|
|
|
|
|
print("-" * 50)
|
|
|
|
|
|
def _group_competitors_by_product(rows):
    """Collapse per-competitor rows into one entry per product.

    Each input row is a dict describing one (product, competitor) pair.
    Rows sharing the same ``id`` and ``product_name`` are merged. Products
    and competitor ASINs keep their first-seen order; empty and duplicate
    competitor values are dropped.

    Args:
        rows: iterable of dicts; the keys read are ``id``, ``product_name``,
            ``filename`` and ``competitor`` (all optional).

    Returns:
        A list of dicts with keys ``id``, ``product_name``, ``filename`` and
        ``competitor_list``. Missing scalar fields fall back to the same
        placeholder strings the listing has always printed.
    """
    grouped = {}
    for row in rows:
        key = (row.get('id'), row.get('product_name'))
        entry = grouped.get(key)
        if entry is None:
            entry = grouped[key] = {
                'id': row.get('id', '未知id'),
                'product_name': row.get('product_name', '未知产品'),
                'filename': row.get('filename', '未知文件'),
                'competitor_list': [],
            }
        competitor = row.get('competitor')
        if competitor and competitor not in entry['competitor_list']:
            entry['competitor_list'].append(competitor)
    return list(grouped.values())


def t_print_monthly_competitor_lists():
    """Print the competitor ASIN list of every product imported this month.

    Rows come from
    ``product_import_manager.get_monthly_product_imports_by_asin_exists(to_dict=True)``.
    According to the sample output recorded by the original author, each row
    is a dict such as::

        {'asin_exists': False, 'competitor': 'B0020FO356',
         'created_at': datetime(...), 'id': 1, 'product_name': '...'}

    i.e. one row per (product, competitor) pair, so rows are grouped per
    product before printing.
    NOTE(review): the row shape is taken from that recorded sample only;
    confirm against the manager implementation.

    Returns:
        None. All output goes to stdout; any exception is reported on stdout
        instead of being raised, since this is a manual check script.
    """
    print("获取本月所有产品的competitor_list...")
    print("=" * 60)

    try:
        monthly_rows = product_import_manager.get_monthly_product_imports_by_asin_exists(to_dict=True)
        if not monthly_rows:
            print("本月没有找到任何产品数据")
            return

        products = _group_competitors_by_product(monthly_rows)
        print(f"本月共找到 {len(products)} 个产品:")
        print("-" * 60)

        for i, product in enumerate(products, 1):
            print(f"产品 {i}: {product['product_name']}")
            print(f"文件名: {product['filename']}")
            print(f"ID: {product['id']}")

            competitors = product['competitor_list']
            if competitors:
                print(f"竞品ASIN列表: {competitors}")
            else:
                print("竞品ASIN列表: 无")

            print("-" * 60)

    except Exception as e:
        # Top-level boundary of a manual script: report and carry on.
        print(f"获取本月产品数据时出错: {e}")
|
|
|
+
|
|
|
def _available_browsers(browser_configs):
    """Return the (browser_id, config) pairs that still have access quota.

    A config counts as available only when its ``remaining_access_limit``
    attribute exists and is a positive number; a missing attribute or a
    ``None`` value is treated as no quota left.
    """
    return [
        (browser_id, config)
        for browser_id, config in browser_configs.items()
        if (getattr(config, 'remaining_access_limit', 0) or 0) > 0
    ]


def _assign_products(products, start, browsers, budget):
    """Pair each browser with the next product that has a competitor ASIN.

    Args:
        products: list of product dicts (read via ``.get('competitor')``).
        start: index of the first product not yet consumed.
        browsers: list of (browser_id, config) pairs for this round.
        budget: maximum number of assignments allowed in this round.

    Returns:
        ``(assignments, skipped, next_index)`` where ``assignments`` is a list
        of ``(browser_id, config, product)`` tuples, ``skipped`` lists the
        products passed over because they have no competitor, and
        ``next_index`` is where the next round should resume.
    """
    assignments = []
    skipped = []
    index = start
    total = len(products)
    for browser_id, config in browsers:
        if len(assignments) >= budget:
            break
        # A product without a competitor must not cost this browser its slot.
        while index < total and not products[index].get('competitor'):
            skipped.append(products[index])
            index += 1
        if index >= total:
            break
        assignments.append((browser_id, config, products[index]))
        index += 1
    return assignments, skipped, index


def t_crawl_multiple_competitors(max_products=20):
    """Crawl this month's competitor ASINs, spreading them over all browsers.

    Workflow:
      1. Load this month's rows from ``product_import_manager`` and dump them.
      2. Ask for interactive confirmation; anything other than ``y`` cancels.
      3. In rounds, give every browser that still has remaining access quota
         one product, run ``crawl_asin_flow`` for it, then print the results
         and the refreshed per-browser quota.

    Args:
        max_products: upper bound on successfully started crawls for one
            call. Defaults to 20, the limit that was previously hard-coded.
            The bound is enforced inside a round as well, so it can no longer
            be overshot when several browsers are available.

    Returns:
        None. Progress and errors are printed to stdout; exceptions are
        reported rather than raised because this is a manual script.
    """
    print("获取本月所有产品的competitor并执行爬取...")
    print("=" * 60)

    try:
        monthly_products = product_import_manager.get_monthly_product_imports_by_asin_exists(to_dict=True)

        if not monthly_products:
            print("本月没有找到任何产品数据")
            return

        print(f"本月共找到 {len(monthly_products)} 个产品数据")
        pprint(monthly_products)
        answer = input("是否继续?")
        print(f"input: {answer}")
        if answer != 'y':
            print("取消执行")
            return

        browser_configs = create_direct_browser_config_with_stats()
        if not browser_configs:
            print("没有可用的浏览器配置")
            return

        print(f"初始可用浏览器配置: {list(browser_configs.keys())}")

        # Walk the list with an index: the caller's data is never mutated
        # and no copy is needed.
        next_index = 0
        processed_count = 0

        while next_index < len(monthly_products) and processed_count < max_products:
            available_browsers = _available_browsers(browser_configs)
            if not available_browsers:
                print("\n没有可用的浏览器配置(所有浏览器剩余访问次数为0)")
                break

            print(f"\n当前可用浏览器: {[browser_id for browser_id, _ in available_browsers]}")

            assignments, skipped, next_index = _assign_products(
                monthly_products,
                next_index,
                available_browsers,
                max_products - processed_count,
            )

            for product in skipped:
                print(f"\n跳过没有competitor数据的产品: {product.get('product_name', '未知产品')}")

            # (browser_id, asin, state, product) for every crawl started this round.
            round_tasks = []

            for browser_id, browser_config, product in assignments:
                competitor = product.get('competitor')

                print(f"\n处理产品 {processed_count + 1}:")
                print(f"产品名称: {product.get('product_name', '未知产品')}")
                print(f"Competitor ASIN: {competitor}")
                print(f"文件名: {product.get('filename', '未知文件')}")
                print(f"ID: {product.get('id', '未知id')}")
                print("-" * 40)

                print(f"使用浏览器 {browser_id} (剩余访问次数: {browser_config.remaining_access_limit}) 爬取 ASIN: {competitor}")

                try:
                    flow_input = CrawlAsinInput(
                        asin=competitor,
                        asin_area=AsinAreaEnum.JP,  # Japan marketplace
                        mthml_type=True,  # save the page as MHTML
                        # NOTE(review): the previous comment here said "do not
                        # overwrite existing files", which looks carried over
                        # from an older overwrite=False argument; confirm that
                        # refresh_cache=True is the intended setting.
                        refresh_cache=True,
                        browser=browser_config,
                    )

                    print("开始执行ASIN爬取流程...")

                    # return_state=True yields a state object whose result()
                    # is read below (presumably a Prefect flow state - TODO confirm).
                    state = crawl_asin_flow(flow_input, return_state=True)
                    round_tasks.append((browser_id, competitor, state, product))
                    processed_count += 1
                    print(f"浏览器 {browser_id} 爬取 ASIN {competitor} 任务已提交")

                except Exception as e:
                    # One failing ASIN must not abort the rest of the batch.
                    print(f"浏览器 {browser_id} 爬取 ASIN {competitor} 时出错: {e}")

                print("=" * 60)

            if round_tasks:
                print("\n" + "=" * 60)
                print("等待本轮任务完成...")
                print("=" * 60)

                for browser_id, asin, state, product_info in round_tasks:
                    print(f"\n获取浏览器 {browser_id} 爬取 ASIN {asin} 的结果:")
                    print(f"产品信息: {product_info.get('product_name', '未知产品')}")
                    print(f"文件名: {product_info.get('filename', '未知文件')}")
                    print("-" * 40)

                    try:
                        result = state.result()
                        print(f"浏览器 {browser_id} 爬取 ASIN {asin} 成功完成")
                        print(f"结果类型: {type(result)}")
                        if hasattr(result, '__dict__'):
                            print(f"结果属性: {vars(result)}")
                        else:
                            print(f"结果内容: {result}")

                    except Exception as e:
                        print(f"浏览器 {browser_id} 爬取 ASIN {asin} 获取结果时出错: {e}")

            # Refresh the quota once per round; the same snapshot is printed
            # here and used to pick browsers for the next round.
            print("\n=== 本轮浏览器状态更新 ===")
            browser_configs = create_direct_browser_config_with_stats()
            for browser_id, browser_config in browser_configs.items():
                if hasattr(browser_config, 'remaining_access_limit'):
                    print(f"浏览器 {browser_id} 剩余访问次数: {browser_config.remaining_access_limit}")
            print("=" * 60)

        print("\n" + "=" * 60)
        print(f"所有爬取任务完成,共处理 {processed_count} 个产品")

    except Exception as e:
        # Top-level boundary of a manual script: report and carry on.
        print(f"执行批量爬取时出错: {e}")
|
|
|
+
|
|
|
def main():
    """Script entry point: run the batch competitor crawl.

    The commented-out calls are the other manual checks that have been run
    from here; only the batch crawl is active.
    """
    # t_init_browser()
    # t_crawl_asin_flow()
    # t_print_monthly_competitor_lists()
    t_crawl_multiple_competitors()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main()
|