ソースを参照

上传S3时附带参数和格式

mrh 1 年前
コミット
57b68a12e5
2 ファイル変更、73 行追加、30 行削除
  1. +26 -11
      src/browser/crawl_asin.py
  2. +47 -19
      utils/file.py

+ 26 - 11
src/browser/crawl_asin.py

@@ -18,7 +18,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from utils.logu import get_logger
 from config.settings import OUTPUT_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
-from utils.file import save_to_file,check_exists
+from utils.file import save_to_file,check_exists,s3
 from utils.config import CFG
 
 logger = get_logger('browser')
@@ -73,15 +73,14 @@ class Crawler():
         url = self.get_asin_url(asin, asin_area)
         page.get(url)
         if mthml_type:
-            return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
+            return page.save()
         else:
             return page.html
     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
         if not overwrite and check_exists(save_path):
             logger.info(f"{save_path} exists")
             return save_path
-        # data = self.get_asin_page_data(asin, asin_area, mthml_type)
-        data = self.get_asin_url(asin, asin_area)
+        data = self.get_asin_page_data(asin, asin_area, mthml_type)
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         return save_to_file(data, save_path)
         
@@ -91,15 +90,31 @@ async def task():
     page = c.run_browser()
     logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
     tab = page.latest_tab
-    tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
+    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
+    # tab.get(file_path)
+    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
+    data = tab.save()
+    # logger.info(f"{type(data)} , {data[:50]}")
+    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
+    return
+    # 附带源信息上传 https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
+    with open(file_path, 'rb') as data:
+        s3.upload_fileobj(
+            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
+            ExtraArgs={
+                'Metadata': {'mykey': 'myvalue'},
+                'ContentType': 'text/html'
+                })
     # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # logger.info(f"{CFG.s3_secret_key}")
-    c.get_asin_and_save_page(
-        asin[0], 
-        'JP',
-        save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
-        overwrite=True
-    )
+    # c.get_asin_and_save_page(
+    #     asin[0], 
+    #     'JP',
+    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
+    #     overwrite=True
+    # )
     # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
     # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # await c.run('https://fr.florame.com/en/essential-oils')

+ 47 - 19
utils/file.py

@@ -1,36 +1,52 @@
 import json
 from pathlib import Path
+import smart_open
 from smart_open import open
 from botocore.exceptions import NoCredentialsError
 import boto3
 from botocore.config import Config
 from utils.config import CFG
+import mimetypes
 
 s3 = boto3.client(
     's3',
     aws_access_key_id=CFG.s3_access_key,
     aws_secret_access_key=CFG.s3_secret_key,
     endpoint_url=CFG.s3_endpoint,
-    config=Config(signature_version='s3v4'),
-    # aws_account_id='ACCOUNT_ID'
+    config=Config(signature_version='s3v4', retries={'mode': 'standard'}),
 )
-response = s3.list_buckets()
+resource = boto3.resource('s3')
 
-# Output the bucket names
-print('Existing buckets:')
-for bucket in response['Buckets']:
-    print(f'  {bucket["Name"]}')
-def save_to_file(content, filename:Path):
-    if not isinstance(content, str):
-    # 如果可以用 json 格式化,则格式化
-        try:
-            content = json.dumps(content, indent=4, ensure_ascii=False)
-        except:
-            # 如果不是 str ,则格式化
-            if not isinstance(content, str):
-                content = str(content)
-    
-    with open(filename, "w", encoding="utf-8", transport_params={'client': s3}) as file:
+def upload_to_s3(content, filename:str, **extra_args):
+    bucket_name = filename.split('/')[2]
+    object_name = '/'.join(filename.split('/')[3:])
+    content_type, _ = mimetypes.guess_type(object_name)
+    content_type = content_type or 'application/octet-stream'
+    upload_args = {
+        'ContentType': content_type,
+    }
+    upload_args.update(extra_args)
+    if isinstance(content, str):
+        content = content.encode('utf-8')
+    print(bucket_name, object_name)
+    s3.put_object(
+        Bucket=bucket_name,
+        Key=object_name,
+        Body=content,
+        **upload_args
+    )
+    return filename
+def save_to_file(content, filename:Path, **extra_args):
+    '''
+    save_to_file(
+        data, 
+        's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
+        Metadata={'mykey':'myvalue','mykey2':'myvalue2'}
+        )
+    '''
+    if str(filename).startswith('s3://'):
+        return upload_to_s3(content, str(filename), **extra_args)
+    with open(filename, "w", encoding="utf-8") as file:
         file.write(content)
     return filename
 
@@ -41,4 +57,16 @@ def check_exists(file_uri:str):
             return file_uri
     except (FileNotFoundError,OSError):
         # 文件不存在,执行相应的操作
-        return False
+        return False
+
+def main():
+    response = s3.list_buckets()
+
+    # Output the bucket names
+    print('Existing buckets:')
+    for bucket in response['Buckets']:
+        print(f'  {bucket["Name"]}')
+
+
+if __name__ == "__main__":
+    main()