浏览代码

完成批量任务调取并存入数据库;新增表格写入

mrh 8 月之前
父节点
当前提交
a051c0ed6e

+ 20 - 0
docs/gpt/excel_writer_usage.py

@@ -0,0 +1,20 @@
+from src.excel_tools.excel_writer import ExcelWriter
+
+# Extraction JSON produced by the extract task for one ASIN.
+input_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0B658JC22_extract.json'
+# The workbook is created from scratch by ExcelWriter; no template file is needed.
+output_path = r'G:\code\amazone\copywriting_production\output\multi_data.xlsx'
+
+# Build the writer, append one JSON dataset, then style and save.
+# Call add_json_data() once per JSON file to place datasets side by side,
+# separated by an empty spacer column.
+writer = ExcelWriter(output_path)
+writer.add_json_data(input_path, asin="B0B658JC22")
+writer.apply_styles()
+writer.save()
+
+print("Excel文件生成成功!")

+ 75 - 0
docs/gpt/to_excel.md

@@ -0,0 +1,75 @@
+```json
+{
+    "result_table": [
+        {
+            "traffic_keyword": "コードカバー",
+            "keyword_link": "https://www.asinseed.com/en/JP?q=%E3%82%B3%E3%83%BC%E3%83%89%E3%82%AB%E3%83%90%E3%83%BC",
+            "monthly_searches": "9,332",
+            "amazon_search_link": "https://www.amazon.co.jp/s?k=%E3%82%B3%E3%83%BC%E3%83%89%E3%82%AB%E3%83%90%E3%83%BC"
+        },
+        {
+            "traffic_keyword": "コードカバー ペット",
+            "keyword_link": "https://www.asinseed.com/en/JP?q=%E3%82%B3%E3%83%BC%E3%83%89%E3%82%AB%E3%83%90%E3%83%BC%20%E3%83%9A%E3%83%83%E3%83%88",
+            "monthly_searches": "3,195",
+            "amazon_search_link": "https://www.amazon.co.jp/s?k=%E3%82%B3%E3%83%BC%E3%83%89%E3%82%AB%E3%83%90%E3%83%BC%20%E3%83%9A%E3%83%83%E3%83%88"
+        },
+        // ...
+        {
+            "monthly_searches": ""
+        }
+    ],
+    "product_info": {
+        "image_url": "https://m.media-amazon.com/images/I/41Q7bZ1H94L._AC_US200_.jpg",
+        "goto_amazon": "https://www.amazon.co.jp/dp/B0B658JC22",
+        "main_text": "MEL Chemistry大径 肉厚 ペットコード ペット 犬 猫 キャット ドッグ 噛みつき 防止 感電 保護 家電 チャージ コード 配線 プロテクター カバー 螺旋 スパイラル チューブ ラップ 被覆 破れ 防止 破損防止 補強 収納 収束 結束 まとめる TPU 約93cm (ブラック 黒)B0B658JC22",
+        "imgbase64": "/9j/4AAQSkZJRgABAQAAAQABAAD/..."
+    },
+    "unique_words": [
+        {
+            "word": "コードカバー"
+        },
+        {
+            "word": "猫"
+        },
+        {
+            "word": "ケーブルカバー"
+        },
+        // ...
+    ]
+}
+```
+
+我想根据上述 json 文件内容写入到一个表格,用 pandas 模块写入。
+一个文件对应 3 列表格。
+第一列:标题是来自传参 asin ,例如 "B0CQ1SHD8V" 。每行的内容是来自 result_table 的 "traffic_keyword" 字段。如果可以的话,将每行的内容用超链接链接到 result_table 的 "amazon_search_link" 字段。
+第二列:标题是搜索量 ,每行的内容是来自 result_table 的 "monthly_searches" 字段。
+
+并且在第一列标题附近把 Base64 图片插入到表格中
+
+因为 result_table 是一个表格总和,它们都有相同的行数,它们可能要筛选或者排列,我希望能够按搜索量进行排序从大到小,由于各个列的情况都不同,你来决定是用 python 原生代码排列再写入,还是写入到 excel 中再用 excel 接口进行排序。而且超过 1万的数值需要标红。
+
+但是 product_info 和 unique_words 是一个单独的字段,如果嵌入到同一个表格中,会影响 table 的排序,你觉得如何构造 excel 表格,放在最后一行可以吗?比如在 result_table 生成的 excel 结果下方新增 product_info 和 unique_words 的行。
+
+请移除掉模板文件,直接生成 excel 即可。
+文件中只获取了单个 json 文件,未来需要读取多个 json 文件,写入到表格中,用空列隔开。例如上述案例中,存在2列,然后隔开一列,从第三列新增新的 json 文件,新增各自的 result_table 、 product_info 、 unique_words,因此需要保持代码的兼容性,可扩展性。
+
+请重构代码。
+必须符合最佳编码规范,高内聚,低耦合,用类来管理模块。
+
+
+必须读取 "G:\code\amazone\copywriting_production\output\3月新品-文案制作.xlsx" 这个模板文件,将生成的内容另存为同路径下的另一个文件。模板文件是只读的。
+
+
+
+
+
+我想根据上述文件内容写入到数据库,现在 submit_extract_task_and_wait 函数已经将 json 文件上传到 S3 并且保存到数据库路径,你只需从数据库中读取,获取 S3 的文件路径并读取 json 内容。然后在一个新表中写入。
+新表的字段如下:
+asin: str
+traffic_keyword: str
+monthly_searches: str
+keyword_link: str   
+amazon_search_link: str
+tail_keyword: dict    
+
+

+ 174 - 0
src/excel_tools/excel_writer.py

@@ -0,0 +1,174 @@
+from dataclasses import dataclass
+import json
+from openpyxl import Workbook
+from openpyxl.drawing.image import Image
+from openpyxl.formatting.rule import CellIsRule
+from openpyxl.styles import PatternFill, Font, Alignment
+from openpyxl.utils import get_column_letter
+from io import BytesIO
+import base64
+from typing import Dict, List, Any
+from pathlib import Path
+from utils.file import read_file
+from utils.logu import logger
+
+# Red fill flags monthly search volumes above 10,000 (see _write_table_data).
+RED_FILL = PatternFill(start_color='FFFF0000', end_color='FFFF0000', fill_type='solid')
+# Shared styling for the header row: bold white text on a blue background.
+HEADER_FONT = Font(bold=True, color='FFFFFF')
+HEADER_FILL = PatternFill(start_color='4F81BD', patternType='solid')
+
+@dataclass
+class DataProcessor:
+    """Normalizes one extraction JSON payload before it is written to Excel."""
+    json_data: Dict[str, Any]
+    asin: str
+    
+    def process_table_data(self) -> List[Dict[str, Any]]:
+        """Return result_table rows sorted by search volume, descending.
+
+        Placeholder rows without a traffic_keyword are dropped and thousands
+        separators are stripped from monthly_searches.
+        """
+        rows = [
+            {
+                'traffic_keyword': item.get('traffic_keyword', ''),
+                'amazon_search_link': item.get('amazon_search_link', ''),
+                'monthly_searches': item.get('monthly_searches', '0').replace(',', '')
+            }
+            for item in self.json_data.get('result_table', [])
+            if item.get('traffic_keyword')
+        ]
+        # Sort in Python so Excel needs no post-processing; docs/gpt/to_excel.md
+        # requires largest search volume first. Non-numeric volumes sort as 0.
+        rows.sort(
+            key=lambda r: int(r['monthly_searches']) if r['monthly_searches'].isdigit() else 0,
+            reverse=True,
+        )
+        return rows
+    
+    @property
+    def product_info(self) -> Dict[str, str]:
+        """Raw product_info mapping; empty dict when the key is missing."""
+        return self.json_data.get('product_info', {})
+    
+    @property
+    def unique_words(self) -> List[str]:
+        """Flat list of words taken from the unique_words entries."""
+        return [word.get('word', '') for word in self.json_data.get('unique_words', [])]
+
+class ExcelWriter:
+    """Writes one or more JSON datasets side by side into a single workbook."""
+    
+    def __init__(self, output_path: str):
+        self.wb = Workbook()
+        self.ws = self.wb.active
+        self.output_path = Path(output_path)
+        self.current_col = 1  # column where the next dataset starts
+        
+    def add_json_data(self, json_path: str, asin: str):
+        """Load one JSON file (read_file handles local/S3 paths) and append its columns."""
+        str_data = read_file(json_path)
+        data = json.loads(str_data)
+        processor = DataProcessor(data, asin)
+        
+        self._write_header(processor)
+        self._write_table_data(processor)
+        self._write_additional_info(processor)
+        
+        # Advance by two data columns plus one empty spacer column.
+        self.current_col += 3
+    
+    def _write_header(self, processor: DataProcessor):
+        """Write the ASIN and search-volume headers plus the product image."""
+        # ASIN title cell
+        header_cell = self.ws.cell(
+            row=1,
+            column=self.current_col,
+            value=processor.asin
+        )
+        header_cell.font = HEADER_FONT
+        header_cell.fill = HEADER_FILL
+        
+        # Embed the Base64 thumbnail. '/9j/' is the JPEG magic prefix —
+        # presumably the only format the extractor emits (TODO confirm).
+        img_data = processor.product_info.get('imgbase64', '')
+        if img_data and img_data.startswith('/9j/'):
+            try:
+                img = Image(BytesIO(base64.b64decode(img_data)))
+                img.anchor = f'{get_column_letter(self.current_col)}2'
+                self.ws.add_image(img)
+            except Exception as e:
+                logger.error(f'图片插入失败: {e}')
+                
+        # Search-volume column header
+        self.ws.cell(
+            row=1,
+            column=self.current_col + 1,
+            value='搜索量'
+        ).font = HEADER_FONT
+    
+    def _write_table_data(self, processor: DataProcessor):
+        """Write keyword rows; volumes above 10,000 are highlighted in red."""
+        for row_idx, item in enumerate(processor.process_table_data(), start=3):
+            keyword_cell = self.ws.cell(
+                row=row_idx,
+                column=self.current_col,
+                value=item['traffic_keyword']
+            )
+            # Only attach a hyperlink when the source row provides one; an
+            # empty-string hyperlink would produce a dead link in Excel.
+            if item['amazon_search_link']:
+                keyword_cell.hyperlink = item['amazon_search_link']
+            
+            # Non-numeric search volumes (e.g. empty strings) fall back to 0.
+            search_volume = item['monthly_searches']
+            cell = self.ws.cell(
+                row=row_idx,
+                column=self.current_col + 1,
+                value=int(search_volume) if search_volume.isdigit() else 0
+            )
+            
+            if cell.value > 10000:
+                cell.fill = RED_FILL
+    
+    def _write_additional_info(self, processor: DataProcessor):
+        """Append product info and unique words below the keyword table."""
+        base_row = self.ws.max_row + 2
+        
+        self.ws.cell(base_row, self.current_col, '产品信息:').font = Font(bold=True)
+        self.ws.cell(base_row + 1, self.current_col, processor.product_info.get('main_text', ''))
+        
+        self.ws.cell(base_row + 3, self.current_col, '唯一词:').font = Font(bold=True)
+        for idx, word in enumerate(processor.unique_words, start=1):
+            self.ws.cell(base_row + 3 + idx, self.current_col, word)
+        
+    def apply_styles(self):
+        """Auto-fit column widths and center-align the header row."""
+        for col in self.ws.columns:
+            # default=0 keeps spacer columns (all-None cells) from raising
+            # ValueError when the generator is empty.
+            max_length = max(
+                (len(str(cell.value)) for cell in col if cell.value is not None),
+                default=0
+            )
+            self.ws.column_dimensions[get_column_letter(col[0].column)].width = max_length + 2
+            
+        for row in self.ws.iter_rows(min_row=1, max_row=1):
+            for cell in row:
+                cell.alignment = Alignment(horizontal='center')
+    
+    def save(self):
+        """Create parent directories as needed and save the workbook."""
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+        self.wb.save(self.output_path)
+        logger.info(f'Excel文件已保存至: {self.output_path}')
+
+def main():
+    """Demo driver: write one extraction JSON into a fresh workbook."""
+    output = r"G:\code\amazone\copywriting_production\output\multi_data.xlsx"
+    json_files = [
+        (r"G:\code\amazone\copywriting_production\output\page\debug\B0B658JC22_extract.json", "B0B658JC22"),
+        # add more (json_path, asin) pairs here, e.g.:
+        # (r"path\to\other.json", "ASIN123")
+    ]
+    logger.info(f"{json_files}")
+    writer = ExcelWriter(output)
+    for json_path, asin in json_files:
+        writer.add_json_data(json_path, asin)
+    writer.apply_styles()
+    writer.save()
+
+if __name__ == "__main__":
+    main()

+ 3 - 26
src/manager/core/db.py

@@ -3,15 +3,7 @@ from typing import Optional
 from sqlmodel import SQLModel, create_engine, Session, select, Field
 from config.settings import DB_URL
 from utils.sql_engine import create_db_and_tables,drop_table,engine
-
-class AsinSeed(SQLModel, table=True):
-    id: Optional[int] = Field(default=None, primary_key=True)
-    asin: str
-    asin_area: str = 'JP'
-    extra_result_path: Optional[str] = None
-    mhtml_path: Optional[str] = None
-    error: Optional[str] = None
-    created_at: Optional[datetime] = Field(default_factory=datetime.now)
+from src.models.asin_model import AsinSeed
 
 class DbManager:
     def __init__(self, engine: str=None):
@@ -25,7 +17,7 @@ class DbManager:
             session.refresh(asin_model)
             return asin_model
 
-    def get_asin_seed(self, asin: str):
+    def get_asin_seed(self, asin: str)->AsinSeed:
         with Session(self.engine) as session:
             statement = select(AsinSeed).where(AsinSeed.asin == asin)
             results = session.exec(statement)
@@ -37,22 +29,7 @@ class DbManager:
             return exist
         else:
             return self.save_asin_seed(asin_model)
-    
-    def update_asin_seed(self, asin_model: AsinSeed):
-        with Session(self.engine) as session:
-            statement = select(AsinSeed).where(AsinSeed.asin == asin_model.asin)
-            results = session.exec(statement)
-            exist = results.first()
-            if exist:
-                for key, value in asin_model.model_dump().items():
-                    setattr(exist, key, value)
-                session.add(exist)
-                session.commit()
-                session.refresh(exist)
-                return exist
-            else:
-                return None
-            
+               
 def main():
     asinseed_list = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
     db_manager = DbManager()

+ 24 - 27
src/manager/manager_task.py

@@ -5,6 +5,8 @@ from utils.file import save_to_file, read_file
 from src.tasks.crawl_asin_save_task import get_asin_and_save_page
 from src.tasks.crawl_asin_exract_task import extra_result
 from celery.result import AsyncResult
+from utils.logu import get_logger
+logger = get_logger('main')
 
 class ManagerTask:
     s3_prefix = CFG.s3_prefix + '/output/page'
@@ -12,6 +14,10 @@ class ManagerTask:
         self.db = DbManager()
 
     def submit_task_and_wait(self, asin: str, asin_area: str = 'JP',overwrite:bool=False, timeout: int = 300):
+        model = self.db.get_asin_seed(asin)
+        if model and model.mhtml_path:
+            logger.info(f"{asin}已经爬取过,跳过")
+            return model
         """提交任务并等待完成,保存结果路径到数据库"""
         # 提交celery任务
         task = get_asin_and_save_page.delay(asin, asin_area, overwrite)
@@ -23,17 +29,21 @@ class ManagerTask:
         # 处理任务结果
         if result.successful():
             task_result = result.result
-            self.save_task_asin_crawl_result(asin, asin_area, task_result)
+            model.mhtml_path = task_result['path']
+            self.db.save_asin_seed(model)
         return None
     
     def submit_extract_task_and_wait(self, asin: str, asin_area: str = 'JP', timeout: int = 300):
         """提交页面解析任务并等待完成,保存结果到数据库"""
         # 从数据库获取mhtml路径
         asin_seed = self.db.get_asin_seed(asin)
+        if asin_seed and asin_seed.extra_result_path:
+            logger.info(f"{asin}已经解析过,跳过")
+            return asin_seed
         if not asin_seed or not asin_seed.mhtml_path:
             print(f"未找到{asin}的mhtml路径")
             return None
-        
+        logger.info(f"{asin}页面解析开始: {asin_seed.mhtml_path}")
         # 提交celery任务
         task = extra_result.delay(asin_seed.mhtml_path)
         
@@ -45,17 +55,17 @@ class ManagerTask:
         if result.successful():
             task_result = result.result
             if task_result['status'] == 'success':
+                task_result_data = task_result['data']
                 # 保存提取结果到文件并上传S3
-                filename = f"{asin}_extract.json"
-                save_path = self.upload_file(
-                    file_path=task_result['data'],
-                    filename=filename
-                )
+                s3_dir = asin_seed.mhtml_path.rsplit('/', 1)[0]
+                save_json_uri = f"{s3_dir}/{asin}_extract.json"
+                save_to_file(task_result_data, save_json_uri)
+                task_result['path'] = save_json_uri
                 # 保存数据库记录
-                self.save_task_asin_page_extract_result(asin, asin_area, {
-                    'status': 'success',
-                    'path': save_path
-                })
+                asin_model = self.db.get_asin_seed(asin=asin)
+                asin_model.extra_result_path = save_json_uri
+                self.db.save_asin_seed(asin_model)
+                logger.info(f"{asin}页面解析成功: {task_result}")
         return task_result
     
     def save_task_asin_crawl_result(self, asin: str, asin_area:str=None, task_result: dict={}):
@@ -68,20 +78,6 @@ class ManagerTask:
             else:
                 self.db.add_or_ignore_asin_seed(AsinSeed(asin=asin, asin_area=asin_area, mhtml_path=task_result['path']))
             return asin_seed
-    def save_task_asin_page_extract_result(self, asin: str, asin_area:str=None, task_result: dict={}):
-        if task_result.get('status') == 'success':
-            asin_seed = self.db.get_asin_seed(asin)
-            if asin_seed:
-                asin_seed.extract_path = task_result['path']
-                self.db.update_asin_seed(asin_seed)
-            else:
-                new_seed = AsinSeed(
-                    asin=asin,
-                    asin_area=asin_area,
-                    extract_path=task_result['path']
-                )
-                self.db.add_or_ignore_asin_seed(new_seed)
-            return asin_seed
     def upload_file(self, file_path: str, filename: str):
         res = save_to_file(Path(file_path).read_text(), self.s3_prefix + '/' + filename)
         return res
@@ -93,8 +89,9 @@ class ManagerTask:
 def main():
     asinseed_list = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
     manager = ManagerTask()    
-    # manager.submit_task_and_wait('B0B658JC22', overwrite=False)
-    manager.submit_extract_task_and_wait('B0B658JC22')
+    for asin in asinseed_list:
+        manager.submit_task_and_wait(asin)
+        manager.submit_extract_task_and_wait(asin)
     # result = {'status': 'success', 'path': 's3://public/amazone/copywriting_production/output/B0B658JC22/B0B658JC22.mhtml'}
     # manager.save_task_asin_crawl_result('B0B658JC22', 'JP', result)
 if __name__ == "__main__":

+ 13 - 0
src/models/asin_model.py

@@ -0,0 +1,13 @@
+# SQLModel table definition for ASIN crawl bookkeeping.
+from datetime import datetime
+from typing import Optional
+from sqlmodel import SQLModel, create_engine, Session, select, Field
+from config.settings import DB_URL
+
+class AsinSeed(SQLModel, table=True):
+    """One ASIN crawl record: where the page snapshot and extraction result live."""
+    id: Optional[int] = Field(default=None, primary_key=True)
+    asin: str
+    # Amazon marketplace code; defaults to Japan.
+    asin_area: str = 'JP'
+    # S3 URI of the extracted JSON (set by submit_extract_task_and_wait).
+    extra_result_path: Optional[str] = None
+    # S3 URI of the saved .mhtml page snapshot (set by the crawl task).
+    mhtml_path: Optional[str] = None
+    # Last failure message, if any.
+    error: Optional[str] = None
+    created_at: Optional[datetime] = Field(default_factory=datetime.now)

+ 0 - 0
src/models/excel_product_model.py


+ 6 - 2
tests/mytest/t_boto3.py

@@ -1,8 +1,12 @@
 from utils.file import s3_uri_to_http_url
-
+from pathlib import Path
 def main():
     s3_uri = 's3://public/amazone/copywriting_production/output/B0B658JC22/B0B658JC22.mhtml'
-    print(s3_uri_to_http_url(s3_uri))
+    path = Path(r's3://public/amazone/copywriting_production/output/B0B658JC22/B0B658JC22.mhtml')
+    asin = 'B0B658JC22'
+    save_json_path = Path(s3_uri).parent / f"{asin}_extract.json"
+    print(save_json_path)
+    # print(s3_uri_to_http_url(s3_uri))
 
 if __name__ == "__main__":
     main()

+ 0 - 44
utils/config.py

@@ -1,44 +0,0 @@
-import os
-import yaml
-from pathlib import Path
-from pydantic import BaseModel, Field
-from typing import List, Dict, Union,Optional,Any
-from utils.pydantic_auto_field import AutoLoadModel
-from dotenv import load_dotenv
-load_dotenv()
-
-class Config(BaseModel):
-    storage: str = "local"
-    s3_access_key: Optional[str] = os.environ.get("S3_ACCESS_KEY", 'bh9LbfsPHRJgQ44wXIlv')
-    s3_secret_key: Optional[str] = os.environ.get("S3_SECRET_KEY", 'N744RZ60T1b4zlcWG2MROCzjEE2mPTdNQCc7Pk3M')
-    s3_endpoint: Optional[str] = os.environ.get("S3_ENDPOINT", 'http://vs1.lan:9002')
-    chrome_config_ini: Optional[str] = r'G:\code\amazone\copywriting_production\config\dp_conf\9321.ini'
-    redis_url: Optional[str] = os.environ.get("REDIS_URL", 'redis://localhost:6379/0')
-    def save(self, config_path: Path = None):
-        config_path = config_path or get_config_path()
-        with open(config_path, "w", encoding="utf-8") as file:
-            yaml.dump(self.model_dump(), file)
-        return self
-            
-def get_config_path():
-    return os.environ.get('CONFIG_PATH',CONFIG_DIR / "config.yaml") 
-
-def read_config(config_path: Path):
-    if isinstance(config_path, str):
-        config_path = Path(config_path)
-    if not config_path.exists():
-        config = Config()
-        config.save(config_path)
-        return config
-    with open(config_path, "r", encoding="utf-8") as file:
-        config_dict = yaml.safe_load(file)
-    return Config(**config_dict)
-
-CFG = read_config(get_config_path())
-
-def main():
-    print(CFG)
-    CFG.save()
-
-if __name__ == "__main__":
-    main()