Просмотр исходного кода

上传S3时附带参数和格式

mrh 1 год назад
Родитель
Commit
57b68a12e5
2 измененных файлов с 73 добавлено и 30 удалено
  1. 26 11
      src/browser/crawl_asin.py
  2. 47 19
      utils/file.py

+ 26 - 11
src/browser/crawl_asin.py

@@ -18,7 +18,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from utils.logu import get_logger
 from utils.logu import get_logger
 from config.settings import OUTPUT_DIR
 from config.settings import OUTPUT_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
-from utils.file import save_to_file,check_exists
+from utils.file import save_to_file,check_exists,s3
 from utils.config import CFG
 from utils.config import CFG
 
 
 logger = get_logger('browser')
 logger = get_logger('browser')
@@ -73,15 +73,14 @@ class Crawler():
         url = self.get_asin_url(asin, asin_area)
         url = self.get_asin_url(asin, asin_area)
         page.get(url)
         page.get(url)
         if mthml_type:
         if mthml_type:
-            return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
+            return page.save()
         else:
         else:
             return page.html
             return page.html
     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
         if not overwrite and check_exists(save_path):
         if not overwrite and check_exists(save_path):
             logger.info(f"{save_path} exists")
             logger.info(f"{save_path} exists")
             return save_path
             return save_path
-        # data = self.get_asin_page_data(asin, asin_area, mthml_type)
-        data = self.get_asin_url(asin, asin_area)
+        data = self.get_asin_page_data(asin, asin_area, mthml_type)
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         return save_to_file(data, save_path)
         return save_to_file(data, save_path)
         
         
@@ -91,15 +90,31 @@ async def task():
     page = c.run_browser()
     page = c.run_browser()
     logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
     logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
     tab = page.latest_tab
     tab = page.latest_tab
-    tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
+    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
+    # tab.get(file_path)
+    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
+    data = tab.save()
+    # logger.info(f"{type(data)} , {data[:50]}")
+    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
+    return
+    # 附带源信息上传 https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
+    with open(file_path, 'rb') as data:
+        s3.upload_fileobj(
+            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
+            ExtraArgs={
+                'Metadata': {'mykey': 'myvalue'},
+                'ContentType': 'text/html'
+                })
     # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # logger.info(f"{CFG.s3_secret_key}")
     # logger.info(f"{CFG.s3_secret_key}")
-    c.get_asin_and_save_page(
-        asin[0], 
-        'JP',
-        save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
-        overwrite=True
-    )
+    # c.get_asin_and_save_page(
+    #     asin[0], 
+    #     'JP',
+    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
+    #     overwrite=True
+    # )
     # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
     # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
     # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # await c.run('https://fr.florame.com/en/essential-oils')
     # await c.run('https://fr.florame.com/en/essential-oils')

+ 47 - 19
utils/file.py

@@ -1,36 +1,52 @@
 import json
 import json
 from pathlib import Path
 from pathlib import Path
+import smart_open
 from smart_open import open
 from smart_open import open
 from botocore.exceptions import NoCredentialsError
 from botocore.exceptions import NoCredentialsError
 import boto3
 import boto3
 from botocore.config import Config
 from botocore.config import Config
 from utils.config import CFG
 from utils.config import CFG
+import mimetypes
 
 
 s3 = boto3.client(
 s3 = boto3.client(
     's3',
     's3',
     aws_access_key_id=CFG.s3_access_key,
     aws_access_key_id=CFG.s3_access_key,
     aws_secret_access_key=CFG.s3_secret_key,
     aws_secret_access_key=CFG.s3_secret_key,
     endpoint_url=CFG.s3_endpoint,
     endpoint_url=CFG.s3_endpoint,
-    config=Config(signature_version='s3v4'),
-    # aws_account_id='ACCOUNT_ID'
+    config=Config(signature_version='s3v4', retries={'mode': 'standard'}),
 )
 )
-response = s3.list_buckets()
+resource = boto3.resource('s3')
 
 
-# Output the bucket names
-print('Existing buckets:')
-for bucket in response['Buckets']:
-    print(f'  {bucket["Name"]}')
-def save_to_file(content, filename:Path):
-    if not isinstance(content, str):
-    # 如果可以用 json 格式化,则格式化
-        try:
-            content = json.dumps(content, indent=4, ensure_ascii=False)
-        except:
-            # 如果不是 str ,则格式化
-            if not isinstance(content, str):
-                content = str(content)
-    
-    with open(filename, "w", encoding="utf-8", transport_params={'client': s3}) as file:
def upload_to_s3(content, filename:str, **extra_args):
    """Upload *content* to the object named by an ``s3://bucket/key`` URI.

    Args:
        content: ``str`` (UTF-8 encoded before upload) or ``bytes`` body.
        filename: full ``s3://bucket/key`` object URI.
        **extra_args: extra keyword arguments forwarded to
            ``put_object`` (e.g. ``Metadata={...}``); a caller-supplied
            ``ContentType`` overrides the guessed one.

    Returns:
        str: *filename*, unchanged, so callers can chain on the URI.
    """
    # 's3://bucket/key/with/slashes' -> bucket, key.
    # partition() keeps every '/' after the first inside the key.
    remainder = filename[len('s3://'):] if filename.startswith('s3://') else filename
    bucket_name, _, object_name = remainder.partition('/')
    # Derive a Content-Type from the key's extension; fall back to a safe binary type.
    content_type, _ = mimetypes.guess_type(object_name)
    upload_args = {
        'ContentType': content_type or 'application/octet-stream',
    }
    upload_args.update(extra_args)
    if isinstance(content, str):
        content = content.encode('utf-8')
    s3.put_object(
        Bucket=bucket_name,
        Key=object_name,
        Body=content,
        **upload_args
    )
    return filename
def save_to_file(content, filename:Path, **extra_args):
    '''
    Save *content* to a local path, or upload it when *filename* is an
    ``s3://`` URI.

    Example::

        save_to_file(
            data,
            's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
            Metadata={'mykey':'myvalue','mykey2':'myvalue2'}
            )

    Args:
        content: str or bytes payload; any other object is serialized as
            JSON (falling back to ``str()``) before a local text write.
        filename: local path (str/Path) or ``s3://bucket/key`` URI.
        **extra_args: forwarded to ``upload_to_s3`` for S3 targets only.

    Returns:
        The path/URI that was written.
    '''
    if str(filename).startswith('s3://'):
        return upload_to_s3(content, str(filename), **extra_args)
    if isinstance(content, bytes):
        # Binary payloads (e.g. MHTML snapshots) must bypass the text-mode handle.
        with open(filename, "wb") as file:
            file.write(content)
        return filename
    if not isinstance(content, str):
        # Keep the previous behavior of pretty-printing dicts/lists as JSON;
        # unserializable objects degrade to their str() form.
        try:
            content = json.dumps(content, indent=4, ensure_ascii=False)
        except (TypeError, ValueError):
            content = str(content)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    return filename
 
 
@@ -41,4 +57,16 @@ def check_exists(file_uri:str):
             return file_uri
             return file_uri
     except (FileNotFoundError,OSError):
     except (FileNotFoundError,OSError):
         # 文件不存在,执行相应的操作
         # 文件不存在,执行相应的操作
-        return False
+        return False
+
def main():
    """Smoke test: print the name of every bucket visible to the configured client."""
    listing = s3.list_buckets()
    print('Existing buckets:')
    for name in (bucket["Name"] for bucket in listing['Buckets']):
        print(f'  {name}')


if __name__ == "__main__":
    main()