
Add S3 file listing with HTTP URLs

mrh 4 months ago
commit 7c605ddd1e
2 changed files with 119 additions and 11 deletions
  1. tests/mytest/t_boto3.py (+20 -7)
  2. utils/file.py (+99 -4)
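
A minimal usage sketch of the new entry point this commit adds, as exercised by the updated test (the S3 prefix is the one used in the test; adjust as needed):

    from utils.file import s3_client

    result = s3_client.list_s3_files_with_urls('/public/amazone/copywriting_production/product/202508/')
    for url in result['http_urls']:
        print(url)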

+ 20 - 7
tests/mytest/t_boto3.py

@@ -1,12 +1,25 @@
-from utils.file import s3_uri_to_http_url
+import sys
+# print(sys.path)
+from utils.file import s3_uri_to_http_url, s3_client
 from pathlib import Path
 def main():
-    s3_uri = 's3://public/amazone/copywriting_production/output/B0B658JC22/B0B658JC22.mhtml'
-    path = Path(r's3://public/amazone/copywriting_production/output/B0B658JC22/B0B658JC22.mhtml')
-    asin = 'B0B658JC22'
-    save_json_path = Path(s3_uri).parent / f"{asin}_extract.json"
-    print(save_json_path)
-    # print(s3_uri_to_http_url(s3_uri))
+    # Use the new combined method to list files and generate HTTP URLs
+    result = s3_client.list_s3_files_with_urls('/public/amazone/copywriting_production/product/202508/')
+    print(f"S3 path: {result['path']}")
+    print(f"Bucket: {result['bucket']}")
+    print(f"Prefix: {result['prefix']}")
+    print(f"Found {result['count']} files:")
+    
+    for i, file_info in enumerate(result['files'], 1):
+        file_name = file_info['key'].split('/')[-1]
+        http_url = result['http_urls'][i-1]  # Get the matching HTTP URL from the result
+        print(f"{i}. {file_name}")
+        print(f"   HTTP URL: {http_url}")
+        print(f"   Size: {file_info['size']} bytes, last modified: {file_info['last_modified']}")
+    
+    print("\n=== HTTP URL list ===")
+    for i, url in enumerate(result['http_urls'], 1):
+        print(f"{i}. {url}")
 
 if __name__ == "__main__":
     main()
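
Side note on the loop above: generate_http_urls (added in utils/file.py below) builds http_urls in the same order as files, so the index arithmetic (result['http_urls'][i-1]) could be replaced by pairing the two sequences with zip. A minimal, equivalent sketch:

    for i, (file_info, http_url) in enumerate(zip(result['files'], result['http_urls']), 1):
        file_name = file_info['key'].split('/')[-1]
        print(f"{i}. {file_name}")
        print(f"   HTTP URL: {http_url}")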

+ 99 - 4
utils/file.py

@@ -7,6 +7,7 @@ from smart_open import open
 from botocore.exceptions import NoCredentialsError,ClientError
 import boto3
 import logging
+from typing import List, Dict, Any, Optional
 
 from botocore.config import Config
 from config.settings import CFG
@@ -20,6 +21,99 @@ s3 = boto3.client(
 )
 # resource = boto3.resource('s3')
 
+class S3Client:
+    """S3客户端类,封装常用的S3操作"""
+    
+    def __init__(self, client=None):
+        """初始化S3客户端"""
+        self.client = client or s3
+        self.logger = logging.getLogger(__name__)
+    
+    def list_s3_files(self, s3_path: str) -> Dict[str, Any]:
+        """列出S3指定路径下的所有文件"""
+        self.logger.info(f"开始列出S3路径下的文件: {s3_path}")
+        
+        try:
+            # Parse the S3 path
+            if s3_path.startswith('s3://'):
+                # Parse the s3://bucket/path form
+                path_parts = s3_path[5:].split('/', 1)
+                bucket_name = path_parts[0]
+                prefix = path_parts[1] if len(path_parts) > 1 else ''
+            else:
+                # Parse the /path form, using the default bucket
+                bucket_name = 'public'
+                prefix = s3_path.lstrip('/')
+                # Drop a leading 'public/' (if present) to avoid duplicating the bucket name
+                if prefix.startswith('public/'):
+                    prefix = prefix[7:]
+            
+            # Make sure the prefix ends with '/' so the directory contents are listed correctly
+            if prefix and not prefix.endswith('/'):
+                prefix += '/'
+            
+            self.logger.info(f"Using bucket: {bucket_name}, prefix: {prefix}")
+            
+            # List objects with boto3
+            response = self.client.list_objects_v2(
+                Bucket=bucket_name,
+                Prefix=prefix
+            )
+            
+            files = []
+            if 'Contents' in response:
+                for obj in response['Contents']:
+                    # Skip directory placeholders (keys ending with '/')
+                    if not obj['Key'].endswith('/'):
+                        files.append({
+                            'key': obj['Key'],
+                            'size': obj['Size'],
+                            'last_modified': obj['LastModified'].isoformat(),
+                            'etag': obj['ETag'].strip('"')
+                        })
+            
+            self.logger.info(f"Found {len(files)} files")
+            return {
+                'path': s3_path,
+                'bucket': bucket_name,
+                'prefix': prefix,
+                'files': files,
+                'count': len(files)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"Failed to list S3 files: {e}")
+            raise Exception(f"Failed to list S3 files: {e}")
+    
+    def generate_http_urls(self, s3_files_result: Dict[str, Any], base_url: str = "http://s3.vs1.lan") -> List[str]:
+        """根据S3文件列表结果生成HTTP URL列表"""
+        http_urls = []
+        
+        for file_info in s3_files_result['files']:
+            # The object key already includes the prefix and any subdirectories,
+            # so build the URL from the full key instead of prefix + file name
+            # (prefix + file name would produce wrong URLs for nested objects)
+            http_url = f"{base_url}/{s3_files_result['bucket']}/{file_info['key']}"
+            http_urls.append(http_url)
+        
+        return http_urls
+    
+    def list_s3_files_with_urls(self, s3_path: str, base_url: str = "http://s3.vs1.lan") -> Dict[str, Any]:
+        """列出S3指定路径下的所有文件并生成HTTP URL列表"""
+        # 先获取文件列表
+        files_result = self.list_s3_files(s3_path)
+        # 生成HTTP URL列表
+        http_urls = self.generate_http_urls(files_result, base_url)
+        
+        # 将HTTP URL添加到结果中
+        result = files_result.copy()
+        result['http_urls'] = http_urls
+        
+        return result
+
+# Create a global S3 client instance
+s3_client = S3Client()
+
 def s3_uri_to_http_url(s3_uri):
     """
     Convert an s3://bucket/key URI into a Minio HTTP access URL.
@@ -128,12 +222,13 @@ def save_to_file(content, filename:Path, **extra_args):
     return filename
 
 def read_file(file_uri:str, mode='r'):
-    if not str(file_uri).startswith('http'):
+    if str(file_uri).startswith('s3://'):
+        with open(file_uri, mode or 'r', transport_params={'client': s3}) as f:
+            # The file exists; go ahead and read it
+            return f.read()
+    else:
         with open(file_uri, mode) as f:
             return f.read()
-    with open(file_uri, mode or 'r', transport_params={'client': s3}) as f:
-        # The file exists; go ahead and read it
-        return f.read()
 
 def check_exists(file_uri:str):
     if not file_uri.startswith('s3://'):
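
A limitation worth flagging for a follow-up: list_objects_v2 returns at most 1000 keys per call, so list_s3_files as committed will silently truncate listings for larger prefixes. A minimal sketch of how the listing step could use boto3's paginator instead (not part of this commit; variable names follow the committed code):

    paginator = self.client.get_paginator('list_objects_v2')
    files = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            # Skip directory placeholders, same as the committed code
            if not obj['Key'].endswith('/'):
                files.append({
                    'key': obj['Key'],
                    'size': obj['Size'],
                    'last_modified': obj['LastModified'].isoformat(),
                    'etag': obj['ETag'].strip('"'),
                })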