@@ -1,494 +0,0 @@
-from datetime import datetime, timedelta
-from enum import StrEnum
-from pathlib import Path
-from typing import Optional, Any, Union, List
-from pydantic import BaseModel, Field
-import re
-from prefect import flow, task
-from prefect.states import Failed, Running, Completed
-from prefect.cache_policies import INPUTS
-from prefect.futures import wait
-from src.browser.crawl_asin import Crawler
-from utils.drission_page import ChromeOptions
-from config.settings import CFG, read_config, get_config_path, TEMP_PAGE_DIR, OPENAI_API_KEY, OPENAI_API_BASE
-from utils.logu import get_logger
-from utils.file import save_to_file, check_exists, extract_excel_text_from_url
-from utils.file import s3
-from utils.url_utils import extract_urls_from_text, extract_filename_from_url
-from llama_index.llms.litellm import LiteLLM
-from llama_index.core.program import LLMTextCompletionProgram
-from llama_index.core.output_parsers import PydanticOutputParser
-from llama_index.core.output_parsers.pydantic import extract_json_str
-from src.flow_task.db.product_import_db import product_import_manager
-from src.flow_task.db.models.product_models import ProductImport, ProductForExtraction
-from src.manager.core.db import DbManager, AsinSeed
-from markitdown import MarkItDown
-import tempfile
-import os
-
-logger = get_logger('flow_task')
-
-
-class AsinAreaEnum(StrEnum):
-    """ASIN marketplace region."""
-    US = "US"
-    JP = "JP"
-    UK = "UK"
-    DE = "DE"
-    FR = "FR"
-    CA = "CA"
-    AU = "AU"
-
-
-class BaseCommon(BaseModel):
-    """Common base model for flow inputs."""
-    browser_address: Optional[str] = Field(default='127.0.0.1:16800', description="Browser remote-debugging address")
-    browser_userdata_dir: Optional[str | Path] = Field(default=None, description="Browser user-data directory")
-    chrome_config_ini: Optional[str] = Field(default=None, description="Path to the Chrome config INI file")
-
-
-class CrawlAsinInput(BaseCommon):
-    """Input model for crawling an ASIN page."""
-    asin: str = Field(description="ASIN code")
-    asin_area: AsinAreaEnum = Field(default=AsinAreaEnum.JP, description="ASIN marketplace region")
-    mthml_type: bool = Field(default=True, description="Whether to save the page as MHTML")
-    save_path: Optional[str] = Field(default=None, description="Save path")
-    overwrite: bool = Field(default=False, description="Whether to overwrite an existing file")
-
-
-class BaseCrawlFlow:
-    """Base crawl flow."""
-    FLOW_NAME = "Base crawl flow"
-
-    def __init__(self, flow_input: CrawlAsinInput):
-        self.flow_input = flow_input
-        self.chrome_options = self._init_chrome_options()
-        self.crawler = Crawler(chrome_options=self.chrome_options)
-
-    def _init_chrome_options(self) -> ChromeOptions:
-        """Initialize Chrome options."""
-        chrome_config_ini = self.flow_input.chrome_config_ini or CFG.chrome_config_ini
-        return ChromeOptions(ini_path=chrome_config_ini)
-
-    def _get_save_path(self) -> str:
-        """Build the save path."""
-        if self.flow_input.save_path:
-            return self.flow_input.save_path
-
-        # Default save path
-        extension = ".mhtml" if self.flow_input.mthml_type else ".html"
-        return f"{Crawler.s3_prefix}{self.flow_input.asin}/{self.flow_input.asin}{extension}"
-
-    def _get_and_save_page_data(self):
-        """Fetch the page data and save it."""
-        return _get_and_save_page_data_task(
-            crawler=self.crawler,
-            asin=self.flow_input.asin,
-            asin_area=self.flow_input.asin_area,
-            mthml_type=self.flow_input.mthml_type,
-            save_path=self._get_save_path(),
-            overwrite=self.flow_input.overwrite
-        )
-
-    def run(self):
-        """Run the flow."""
-        logger.info(f"Starting flow: {self.FLOW_NAME}")
-        logger.info(f"ASIN: {self.flow_input.asin}")
-        logger.info(f"Region: {self.flow_input.asin_area}")
-        logger.info(f"MHTML format: {self.flow_input.mthml_type}")
-        logger.info(f"Overwrite mode: {self.flow_input.overwrite}")
-
-        try:
-            # Fetch and save the page data via the task
-            final_path = self._get_and_save_page_data()
-
-            logger.info(f"Flow completed successfully, saved to: {final_path}")
-            return {
-                'status': 'success',
-                'path': final_path,
-                'asin': self.flow_input.asin,
-                'asin_area': self.flow_input.asin_area
-            }
-
-        except Exception as e:
-            logger.error(f"Flow failed: {e}")
-            raise
-
-
-class CrawlAsinFlow(BaseCrawlFlow):
-    """ASIN page crawl flow."""
-    FLOW_NAME = "ASIN page crawl flow"
-
-
-@flow(
-    name=CrawlAsinFlow.FLOW_NAME,
-    persist_result=True,
-    result_serializer="json",
-)
-def crawl_asin_flow(flow_input: CrawlAsinInput):
-    """Prefect flow for crawling an ASIN page."""
-    logger.info(f"Launching ASIN crawl flow: {flow_input.asin}")
-
-    flow_runner = CrawlAsinFlow(flow_input)
-    result = flow_runner.run()
-
-    return result
-
-
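-# A minimal invocation sketch (the ASIN below is a made-up placeholder):
-#   crawl_asin_flow(CrawlAsinInput(asin="B0XXXXXXXX", asin_area=AsinAreaEnum.JP))
-
-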
-@task(name="保存整个html页面",
|
|
|
- persist_result=True,
|
|
|
- cache_expiration=timedelta(days=31),
|
|
|
- cache_policy=INPUTS- 'crawler' - 'overwrite'
|
|
|
-)
|
|
|
-def task_save_page(crawler: Crawler, asin: str, asin_area: AsinAreaEnum,
|
|
|
- mthml_type: bool, overwrite: bool):
|
|
|
- """获取页面数据并保存到本地temp目录的task方法"""
|
|
|
- logger.info(f"开始获取页面数据: {asin}")
|
|
|
-
|
|
|
- # 获取页面数据
|
|
|
- data = crawler.get_asin_page_data(
|
|
|
- asin=asin,
|
|
|
- asin_area=asin_area,
|
|
|
- mthml_type=mthml_type
|
|
|
- )
|
|
|
-
|
|
|
- # 生成本地temp保存路径
|
|
|
- local_dir = TEMP_PAGE_DIR / "asin" / asin
|
|
|
- local_dir.mkdir(parents=True, exist_ok=True)
|
|
|
-
|
|
|
- extension = ".mhtml" if mthml_type else ".html"
|
|
|
- local_path = local_dir / f"{asin}{extension}"
|
|
|
-
|
|
|
- # 检查文件是否已存在
|
|
|
- if not overwrite and local_path.exists():
|
|
|
- logger.info(f"文件已存在,跳过保存: {local_path}")
|
|
|
- return str(local_path)
|
|
|
-
|
|
|
- try:
|
|
|
- logger.info(f"保存到本地temp目录: {local_path}")
|
|
|
- local_final_path = save_to_file(data, local_path)
|
|
|
- logger.info(f"成功保存到本地temp目录: {local_final_path}")
|
|
|
- return str(local_final_path)
|
|
|
- except Exception as local_error:
|
|
|
- logger.error(f"本地temp目录保存失败: {local_error}")
|
|
|
- raise Exception(f"本地temp目录保存失败: {local_error}")
|
|
|
-
|
|
|
-
|
|
|
-@task(name="保存到数据库和对象存储",
|
|
|
- persist_result=True,
|
|
|
- cache_expiration=timedelta(days=31),
|
|
|
-)
|
|
|
-def task_save_to_db(local_file_path: str, asin: str, mthml_type: bool, asin_area: str = 'JP'):
|
|
|
- """将temp目录文件上传到S3的task方法,先检查数据库是否存在记录"""
|
|
|
- logger.info(f"开始处理文件: {local_file_path}")
|
|
|
-
|
|
|
- # 初始化数据库管理器
|
|
|
- db_manager = DbManager()
|
|
|
-
|
|
|
- # 检查数据库中是否已存在该ASIN的记录
|
|
|
- existing_record = db_manager.get_asin_seed(asin)
|
|
|
-
|
|
|
- # 如果存在记录且mhtml_path不为空,直接返回该路径
|
|
|
- if existing_record and existing_record.mhtml_path:
|
|
|
- logger.info(f"数据库中已存在ASIN {asin} 的记录,路径: {existing_record.mhtml_path}")
|
|
|
- return existing_record.mhtml_path
|
|
|
-
|
|
|
- # 生成S3保存路径
|
|
|
- s3_path = f"{Crawler.s3_prefix}{asin}/{asin}{'.mhtml' if mthml_type else '.html'}"
|
|
|
-
|
|
|
- try:
|
|
|
- # 读取本地文件
|
|
|
- with open(local_file_path, 'rb') as f:
|
|
|
- data = f.read()
|
|
|
-
|
|
|
- logger.info(f"上传到S3: {s3_path}")
|
|
|
- final_path = save_to_file(data, s3_path)
|
|
|
- logger.info(f"成功上传到S3: {final_path}")
|
|
|
-
|
|
|
- # 保存到数据库
|
|
|
- if existing_record:
|
|
|
- # 更新现有记录
|
|
|
- existing_record.mhtml_path = final_path
|
|
|
- existing_record.asin_area = asin_area
|
|
|
- db_manager.save_asin_seed(existing_record)
|
|
|
- logger.info(f"更新数据库记录: ASIN {asin}")
|
|
|
- else:
|
|
|
- # 创建新记录
|
|
|
- new_record = AsinSeed(asin=asin, asin_area=asin_area, mhtml_path=final_path)
|
|
|
- db_manager.save_asin_seed(new_record)
|
|
|
- logger.info(f"创建数据库记录: ASIN {asin}")
|
|
|
-
|
|
|
- return final_path
|
|
|
- except Exception as s3_error:
|
|
|
- logger.error(f"S3上传失败: {s3_error}")
|
|
|
- raise Exception(f"S3上传失败: {s3_error}")
|
|
|
-
|
|
|
-
|
|
|
-@task
-def _get_and_save_page_data_task(crawler: Crawler, asin: str, asin_area: AsinAreaEnum,
-                                 mthml_type: bool, overwrite: bool, save_path: Optional[str] = None):
-    """Standalone task that fetches and saves the page data (kept for backward compatibility)."""
-    logger.info(f"Fetching and saving page data: {asin}")
-
-    # Step 1: fetch the data and save it to the local temp directory
-    try:
-        local_file_path = task_save_page(
-            crawler=crawler,
-            asin=asin,
-            asin_area=asin_area,
-            mthml_type=mthml_type,
-            overwrite=overwrite
-        )
-        logger.info(f"Saved to local temp directory: {local_file_path}")
-    except Exception as local_error:
-        logger.error(f"Failed to save to local temp directory: {local_error}")
-        raise Exception(f"Failed to save to local temp directory: {local_error}") from local_error
-
-    # Step 2: upload the temp file to S3 and persist the record to the database
-    try:
-        s3_path = task_save_to_db(
-            local_file_path=local_file_path,
-            asin=asin,
-            mthml_type=mthml_type,
-            asin_area=asin_area
-        )
-        logger.info(f"Uploaded to S3 and saved to database: {s3_path}")
-        return s3_path
-    except Exception as s3_error:
-        logger.error(f"S3 upload failed, but the local file was saved: {s3_error}")
-        raise Exception(f"S3 upload failed, but the local file was saved: {s3_error}") from s3_error
-
-
-@task
-def crawl_asin_task(flow_input: CrawlAsinInput):
-    """ASIN page crawl task."""
-    return crawl_asin_flow(flow_input)
-
-
-@task(name="解析URL表格为Markdown",
|
|
|
- persist_result=True,
|
|
|
- cache_expiration=timedelta(days=31),
|
|
|
- cache_policy=INPUTS
|
|
|
-)
|
|
|
-def parse_url_to_markdown_task(url: str):
|
|
|
- """将URL表格文件转换为Markdown格式的task方法
|
|
|
- 支持Excel文件和其他文件格式,Excel文件使用pandas读取所有工作表信息
|
|
|
-
|
|
|
- Args:
|
|
|
- url (str): 表格文件的URL或本地路径
|
|
|
-
|
|
|
- Returns:
|
|
|
- str: 解析后的Markdown格式内容
|
|
|
- """
|
|
|
- logger.info(f"开始解析URL表格文件: {url}")
|
|
|
-
|
|
|
- try:
|
|
|
- # 检查文件类型,如果是Excel文件则使用pandas方法
|
|
|
- if url.lower().endswith(('.xlsx', '.xls')):
|
|
|
- logger.info(f"检测到Excel文件,使用pandas方法读取: {url}")
|
|
|
-
|
|
|
- # 使用pandas方法读取Excel文件
|
|
|
- all_cells_text_dict = extract_excel_text_from_url(url)
|
|
|
-
|
|
|
- if not all_cells_text_dict:
|
|
|
- logger.warning(f"Excel文件读取失败或为空: {url}")
|
|
|
- return ""
|
|
|
-
|
|
|
- # 将Excel内容转换为Markdown格式
|
|
|
- markdown_content = ""
|
|
|
- for sheet_name, sheet_content in all_cells_text_dict.items():
|
|
|
- markdown_content += f"## 工作表: {sheet_name}\n\n"
|
|
|
- markdown_content += "```\n"
|
|
|
- markdown_content += sheet_content
|
|
|
- markdown_content += "\n```\n\n"
|
|
|
-
|
|
|
- logger.info(f"成功解析Excel文件,共读取 {len(all_cells_text_dict)} 个工作表: {url}")
|
|
|
- return markdown_content
|
|
|
-
|
|
|
- else:
|
|
|
- # 非Excel文件使用原来的markitdown方法
|
|
|
- logger.info(f"检测到非Excel文件,使用markitdown方法读取: {url}")
|
|
|
-
|
|
|
- # 创建MarkItDown实例
|
|
|
- md = MarkItDown(enable_plugins=False)
|
|
|
-
|
|
|
- # 转换文档
|
|
|
- result = md.convert(url)
|
|
|
-
|
|
|
- # 获取Markdown格式内容
|
|
|
- markdown_content = result.text_content
|
|
|
-
|
|
|
- logger.info(f"成功解析URL表格文件: {url}")
|
|
|
- return markdown_content
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"解析URL表格文件时发生错误: {e}")
|
|
|
- raise Exception(f"解析URL表格文件失败: {e}")
|
|
|
-
|
|
|
-
|
|
|
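-# Illustrative shape of the Markdown emitted for an Excel input (the cell text
-# itself comes from extract_excel_text_from_url):
-#   ## Worksheet: Sheet1
-#   ```
-#   <raw cell text>
-#   ```
-
-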
-class DebugPydanticOutputParser(PydanticOutputParser):
-    """Debug subclass of PydanticOutputParser that logs the raw LLM output."""
-
-    def parse(self, text: str) -> Any:
-        """Parse, validate, and correct errors programmatically."""
-        logger.info("=== LLM output ===")
-        logger.info(text)
-        logger.info("=== end of LLM output ===")
-
-        # Strip markdown code-fence formatting
-        cleaned_text = text
-        if "```json" in text:
-            # Remove the ```json fence markers
-            cleaned_text = text.split("```json")[1].split("```")[0]
-        elif "```" in text:
-            # Remove generic ``` fence markers
-            cleaned_text = text.split("```")[1].split("```")[0]
-
-        # Unescape common escape sequences
-        cleaned_text = cleaned_text.replace("\\n", "\n").replace("\\\"", "\"")
-
-        json_str = extract_json_str(cleaned_text)
-        return self._output_cls.model_validate_json(json_str)
-
-
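-# Illustrative LLM output this parser tolerates (hypothetical payload; the real
-# fields come from ProductForExtraction):
-#   ```json
-#   {"product_name": "..."}
-#   ```
-# The fences are stripped before extract_json_str() pulls out the JSON object.
-
-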
-def extract_product_from_text(text: str, uri: str = "", filename: str = "") -> ProductImport:
-    """Extract product information from text using LLMTextCompletionProgram."""
-    llm = LiteLLM(model='openai/GLM-4-Flash', api_key=OPENAI_API_KEY, api_base=OPENAI_API_BASE)
-
-    # Use the custom DebugPydanticOutputParser
-    output_parser = DebugPydanticOutputParser(output_cls=ProductForExtraction)
-
-    program = LLMTextCompletionProgram.from_defaults(
-        prompt_template_str=f"Please extract the product information from the following text:\n\nurl: {uri} \n\n{{text}}",
-        llm=llm,
-        verbose=True,
-        output_parser=output_parser
-    )
-
-    extracted_product = program(text=text)
-
-    # Create the ProductImport instance via the class method
-    return ProductImport.from_product_extraction(
-        extracted_product=extracted_product,
-        markdown_content=text,
-        uri=uri,
-        filename=filename
-    )
-
-
-@task(name="Excel处理",
|
|
|
- persist_result=True,
|
|
|
- cache_expiration=timedelta(days=31),
|
|
|
- cache_policy=INPUTS
|
|
|
-)
|
|
|
-def get_or_create_product_import_by_url(file_url: str):
|
|
|
- """根据文件URL获取数据库中的ProductImport记录,如果不存在则解析Excel并保存到数据库
|
|
|
-
|
|
|
- Args:
|
|
|
- file_url (str): 文件的URL或本地路径
|
|
|
-
|
|
|
- Returns:
|
|
|
- ProductImport: 数据库中的ProductImport记录
|
|
|
- """
|
|
|
- # 从URL中提取文件名
|
|
|
- file_name = extract_filename_from_url(file_url)
|
|
|
-
|
|
|
- logger.info(f"开始处理文件: {file_name} (URL: {file_url})")
|
|
|
-
|
|
|
- # 首先检查数据库中是否已存在该文件名的记录
|
|
|
- existing_record = product_import_manager.get_product_import_by_filename(file_name)
|
|
|
-
|
|
|
- if existing_record:
|
|
|
- logger.info(f"数据库中已存在文件 {file_name} 的记录,直接返回")
|
|
|
- return existing_record
|
|
|
-
|
|
|
- logger.info(f"数据库中不存在文件 {file_name} 的记录,开始解析Excel并保存到数据库")
|
|
|
-
|
|
|
- try:
|
|
|
- # 解析Excel文件为Markdown格式
|
|
|
- markdown_content = parse_url_to_markdown_task(file_url)
|
|
|
-
|
|
|
- if not markdown_content:
|
|
|
- logger.warning(f"Excel文件解析失败或为空: {file_url}")
|
|
|
- raise Exception(f"Excel文件解析失败或为空: {file_url}")
|
|
|
-
|
|
|
- # 使用LLM从Markdown内容中提取产品信息
|
|
|
- product_import = extract_product_from_text(
|
|
|
- text=markdown_content,
|
|
|
- uri=file_url,
|
|
|
- filename=file_name
|
|
|
- )
|
|
|
-
|
|
|
- # 保存到数据库
|
|
|
- saved_record = product_import_manager.save_product_import(product_import)
|
|
|
-
|
|
|
- logger.info(f"成功解析Excel并保存到数据库: {file_name}")
|
|
|
- return saved_record
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"处理文件 {file_name} 时发生错误: {e}")
|
|
|
- raise Exception(f"处理文件失败: {e}")
|
|
|
-
|
|
|
-
|
|
|
-class ProductImportInput(BaseModel):
-    """Product import input model."""
-    file_url: Union[str, List[str]] = Field(description="File URL or local path; a single string or a list")
-
-
-@flow(
-    name="Product import flow",
-    persist_result=True,
-    result_serializer="json",
-)
-def product_import_flow(flow_input: ProductImportInput):
-    """Prefect flow for product import; accepts a string or a list and parses the URLs concurrently."""
-    # Normalize the input into a list of URLs
-    if isinstance(flow_input.file_url, str):
-        logger.info(f"Input is a string, extracting URLs: {flow_input.file_url}")
-        # Try to extract URLs from the string
-        urls = extract_urls_from_text(flow_input.file_url)
-        if not urls:
-            # No URLs extracted; treat the whole string as a single URL
-            urls = [flow_input.file_url]
-        logger.info(f"Extracted {len(urls)} URL(s): {urls}")
-    else:
-        # Already a list; use it as-is
-        urls = flow_input.file_url
-        logger.info(f"Input is a list with {len(urls)} URL(s): {urls}")
-
-    # Submit all URLs for concurrent parsing
-    all_futures = []
-    for url in urls:
-        future = get_or_create_product_import_by_url.with_options(
-            task_run_name=f"Processing URL: {url}",
-        ).submit(url)
-        all_futures.append(future)
-
-    # Wait for all tasks to finish
-    logger.info(f"Waiting for {len(all_futures)} task(s) to finish...")
-    results = [future.result() for future in wait(all_futures).done]
-
-    logger.info(f"All tasks finished; {len(results)} file(s) processed successfully")
-
-    return {
-        'status': 'success',
-        'product_imports': results,
-        'file_urls': urls,
-        'total_count': len(results)
-    }
-
-
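-# A minimal invocation sketch (the URL below is a made-up placeholder):
-#   product_import_flow(ProductImportInput(file_url=["https://example.com/products.xlsx"]))
-
-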
-@task
-def product_import_task(flow_input: ProductImportInput):
-    """Product import task."""
-    return product_import_flow(flow_input)
-