Răsfoiți Sursa

路径还原默认为 asin/asin.mhtml,因为 asin 目录要存放图片、json等资源文件

mrh 4 luni în urmă
părinte
comite
15d6806fc5
4 fișiere modificate, cu 73 adăugiri și 2 ștergeri
  1. 3 2
      src/flow_task/crawl_asin.py
  2. 28 0
      src/manager/core/db.py
  3. 29 0
      tests/mytest/t_delete_asin_seed.py
  4. 13 0
      utils/file.py

+ 3 - 2
src/flow_task/crawl_asin.py

@@ -108,7 +108,8 @@ class CrawlAsinFlow(BaseCrawlFlow):
         # )
         self.crawler.page.get('https://docs.llamaindex.ai/en/stable/examples/output_parsing/llm_program/#define-a-custom-output-parser')
         # 生成本地temp保存路径
-        local_dir = TEMP_PAGE_DIR / "asinseed"
+        local_dir = TEMP_PAGE_DIR / "asinseed"/asin
+
         local_dir.mkdir(parents=True, exist_ok=True)
         
         extension = ".mhtml" if mthml_type else ".html"
@@ -141,7 +142,7 @@ class CrawlAsinFlow(BaseCrawlFlow):
         
         
         # 生成S3保存路径
-        s3_path = f"{Crawler.s3_prefix}/{asin}{'.mhtml' if mthml_type else '.html'}"
+        s3_path = f"{Crawler.s3_prefix}/{asin}/{asin}{'.mhtml' if mthml_type else '.html'}"
             
         try:
             # 读取本地文件

+ 28 - 0
src/manager/core/db.py

@@ -5,6 +5,7 @@ from config.settings import DB_URL
 from utils.sql_engine import create_db_and_tables,drop_table,engine
 from src.models.asin_model import AsinSeed
 from src.models.product_model import Product
+from utils.file import delete_s3_file
 
 class DbManager:
     def __init__(self, engine: str=None):
@@ -44,6 +45,33 @@ class DbManager:
             else:
                 return list_model
 
+    def delete_asin_seed_by_id(self, asin_id: int) -> bool:
+        """Delete the asin_seed row with the given id, also deleting the S3 files at its stored paths (if set); return False when no row matches, True once the row is deleted."""
+        with Session(self.engine) as session:
+            # Fetch the record to delete first
+            statement = select(AsinSeed).where(AsinSeed.id == asin_id)
+            result = session.exec(statement)
+            asin_seed = result.first()
+            
+            if not asin_seed:
+                return False
+            
+            # Delete the S3 files (if paths are set); successes are collected in deleted_s3_files, which is not read again in this method
+            deleted_s3_files = []
+            if asin_seed.extra_result_path:
+                if delete_s3_file(asin_seed.extra_result_path):
+                    deleted_s3_files.append(asin_seed.extra_result_path)
+            
+            if asin_seed.mhtml_path:
+                if delete_s3_file(asin_seed.mhtml_path):
+                    deleted_s3_files.append(asin_seed.mhtml_path)
+            
+            # Delete the database record; this runs even if an S3 deletion above returned False
+            session.delete(asin_seed)
+            session.commit()
+            
+            return True
+
 class ProductManager:
     def __init__(self, engine: str=None):
         self.engine = engine or create_engine(DB_URL)

+ 29 - 0
tests/mytest/t_delete_asin_seed.py

@@ -0,0 +1,29 @@
+from src.manager.core.db import DbManager
+from utils.logu import get_logger
+
+logger = get_logger('delete_asin_seed')
+
+def main():
+    # Hard-coded list of asin_seed ids to delete
+    asin_ids_to_delete = [81, 80, 79, 78, 77, 76, 75, 74, 73]
+    
+    # Initialise the database manager
+    db_manager = DbManager()
+    
+    logger.info(f"开始删除asin_seed记录,id列表: {asin_ids_to_delete}")
+    
+    # Delete each id in turn; a failure on one id is logged and does not stop the loop
+    for asin_id in asin_ids_to_delete:
+        try:
+            result = db_manager.delete_asin_seed_by_id(asin_id)
+            if result:
+                logger.info(f"成功删除asin_seed记录,id: {asin_id}")
+            else:
+                logger.warning(f"未找到asin_seed记录,id: {asin_id}")
+        except Exception as e:
+            logger.error(f"删除asin_seed记录失败,id: {asin_id},错误: {e}")
+    
+    logger.info("删除操作完成")
+
+if __name__ == "__main__":
+    main()

+ 13 - 0
utils/file.py

@@ -245,6 +245,19 @@ def check_exists(file_uri:str):
             return False
         raise e
 
+def delete_s3_file(s3_uri:str):
+    """Delete the object at an s3:// URI; return True on success, False if the URI is empty / not s3:// or a handled error is raised (the error is logged)."""
+    if not s3_uri or not s3_uri.startswith('s3://'):  # reject empty values and non-S3 URIs without calling S3
+        return False
+    
+    try:
+        bucket_name, object_name, _ = get_s3_uri_info(s3_uri)  # third element of the returned tuple is unused here
+        s3.delete_object(Bucket=bucket_name, Key=object_name)  # NOTE(review): s3 is defined outside this hunk, presumably a boto3 client — confirm
+        return True
+    except (FileNotFoundError,OSError,ClientError) as e:
+        logging.error(f"删除S3文件失败: {s3_uri}, 错误: {e}")
+        return False
+
 def read_excel_from_url(url):
     """
     使用 pandas 从 URL 读取 Excel 文件