# Source path: amazon/copywriting_production
import os
import sys

from utils.file import extract_excel_text_from_url, read_excel_from_url, get_all_cells_text, read_excel, extract_excel_text_from_file

def test_pandas_excel_reader(run_stepwise=False):
    """Read an Excel workbook from a fixed URL and dump its sheets as text.

    Calls ``extract_excel_text_from_url`` on a hard-coded test URL, prints
    every sheet, and writes one ``test_<sheet>.txt`` file per sheet plus a
    combined ``test_all_sheets.txt`` into the current working directory.

    Args:
        run_stepwise: When true, additionally exercise the two-step path
            (``read_excel_from_url`` followed by ``get_all_cells_text``).
            The default (False) keeps the previous behaviour, where that
            section was skipped by an unconditional early ``return``.
    """
    # Test URL
    test_url = "http://s3.vs1.lan/public/amazone/copywriting_production/product/202508/1P镊子压刀.xlsx"

    print(f"正在读取 URL: {test_url}")

    # Method 1: use the combined helper to extract the text directly.
    print("\n=== 使用 extract_excel_text_from_url 函数 ===")
    all_cells_text_dict = extract_excel_text_from_url(test_url)

    # Guard against a failed/empty read, mirroring test_local_excel_files,
    # instead of failing on ``.items()`` of a falsy result.
    if all_cells_text_dict:
        for sheet_name, sheet_content in all_cells_text_dict.items():
            print(f"\n=== 工作表: {sheet_name} ===")
            print(sheet_content)

            # Save each sheet to its own file.
            with open(f'test_{sheet_name}.txt', 'w', encoding='utf-8') as f:
                f.write(sheet_content)

        # Also save all sheets into one combined file.
        with open('test_all_sheets.txt', 'w', encoding='utf-8') as f:
            for sheet_name, sheet_content in all_cells_text_dict.items():
                f.write(f"=== 工作表: {sheet_name} ===\n")
                f.write(sheet_content)
                f.write("\n\n")

        print(f"\n共读取了 {len(all_cells_text_dict)} 个工作表")
    else:
        print("Excel 文件读取失败")

    if not run_stepwise:
        return

    # Method 2: call the two lower-level helpers separately.
    print("\n=== 分别调用 read_excel_from_url 和 get_all_cells_text ===")
    excel_data = read_excel_from_url(test_url)

    if excel_data:
        print("Excel 文件读取成功！")

        # Extract the content of all cells (including empty values).
        all_cells_text = get_all_cells_text(excel_data)
        print(all_cells_text)
    else:
        print("Excel 文件读取失败")

def test_local_excel_files(excel_files=None):
    """Read local Excel files, print their text and save it as Markdown.

    For every path that exists, the sheets returned by
    ``extract_excel_text_from_file`` are previewed on stdout and written to
    a ``.md`` file next to the workbook (same base name). Afterwards the
    two-step path (``read_excel`` followed by ``get_all_cells_text``) is
    exercised as well. Paths that do not exist are reported and skipped.

    Args:
        excel_files: Iterable of workbook paths to process. Defaults to the
            single hard-coded sample workbook used previously.
    """
    if excel_files is None:
        excel_files = [
            r"G:\code\amazone\copywriting_production\output\generated_excels\extra-data-大型犬牙刷手柄款.xlsx"
        ]

    for file_path in excel_files:
        print(f"\n=== 正在读取文件: {file_path} ===")

        # Skip paths that do not exist.
        if not os.path.exists(file_path):
            print(f"文件不存在: {file_path}")
            continue

        # Method 1: use the combined helper to extract the text directly.
        print("\n--- 使用 extract_excel_text_from_file 函数 ---")
        all_cells_text_dict = extract_excel_text_from_file(file_path)

        if all_cells_text_dict:
            # Collect one Markdown section per sheet and join them once.
            markdown_sections = []

            for sheet_name, sheet_content in all_cells_text_dict.items():
                print(f"\n--- 工作表: {sheet_name} ---")
                print(f"内容预览 (前200字符): {sheet_content[:200]}...")

                # Markdown layout is kept in sync with
                # extra_excel_product_flow.py (per the original note).
                markdown_sections.append(
                    f"## 工作表: {sheet_name}\n\n```\n{sheet_content}\n```\n\n"
                )
            combined_markdown = "".join(markdown_sections)

            # Write the Markdown next to the workbook. splitext swaps only
            # the real extension, so a workbook that is not literally
            # ".xlsx" (e.g. ".xls", ".XLSX") is never overwritten.
            file_dir = os.path.dirname(file_path)
            stem, _ = os.path.splitext(os.path.basename(file_path))
            output_file = os.path.join(file_dir, stem + '.md')

            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(combined_markdown)
            print(f"\n已保存到: {output_file}")
            print(f"共读取了 {len(all_cells_text_dict)} 个工作表")
        else:
            print("Excel 文件读取失败")

        # Method 2: call the two lower-level helpers separately.
        print("\n--- 分别调用 read_excel 和 get_all_cells_text ---")
        excel_data = read_excel(file_path)

        if excel_data:
            print("Excel 文件读取成功！")
            # Raw strings: in a normal literal "\a" and "\r" are real escape
            # sequences (BEL / carriage return) and would corrupt the paths.
            print(r"md 所在目录： G:\code\amazone\copywriting_production\output\generated_excels ")
            print(r"提示词所在目录： G:\code\amazone\copywriting_production\output\resource")

            # Extract the content of all cells (including empty values).
            all_cells_text = get_all_cells_text(excel_data)
            print(f"提取到 {len(all_cells_text)} 个工作表的文本内容")
        else:
            print("Excel 文件读取失败")

        print("=" * 80)

if __name__ == "__main__":
    # Script entry point: only the local-file check is run; the URL-based
    # check below is left commented out.
    # test_pandas_excel_reader()
    test_local_excel_files()