Просмотр исходного кода

上传S3时附带参数和格式

mrh 1 год назад
Родитель
Commit
57b68a12e5
2 измененных файлов с 73 добавлено и 30 удалено
  1. 26 11
      src/browser/crawl_asin.py
  2. 47 19
      utils/file.py

+ 26 - 11
src/browser/crawl_asin.py

@@ -18,7 +18,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from utils.logu import get_logger
 from utils.logu import get_logger
 from config.settings import OUTPUT_DIR
 from config.settings import OUTPUT_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
-from utils.file import save_to_file,check_exists
+from utils.file import save_to_file,check_exists,s3
 from utils.config import CFG
 from utils.config import CFG
 
 
 logger = get_logger('browser')
 logger = get_logger('browser')
@@ -73,15 +73,14 @@ class Crawler():
         url = self.get_asin_url(asin, asin_area)
         url = self.get_asin_url(asin, asin_area)
         page.get(url)
         page.get(url)
         if mthml_type:
         if mthml_type:
-            return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
+            return page.save()
         else:
         else:
             return page.html
             return page.html
     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
         if not overwrite and check_exists(save_path):
         if not overwrite and check_exists(save_path):
             logger.info(f"{save_path} exists")
             logger.info(f"{save_path} exists")
             return save_path
             return save_path
-        # data = self.get_asin_page_data(asin, asin_area, mthml_type)
-        data = self.get_asin_url(asin, asin_area)
+        data = self.get_asin_page_data(asin, asin_area, mthml_type)
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         return save_to_file(data, save_path)
         return save_to_file(data, save_path)
         
         
@@ -91,15 +90,31 @@ async def task():
     page = c.run_browser()
     page = c.run_browser()
     logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
     logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
     tab = page.latest_tab
     tab = page.latest_tab
-    tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
+    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
+    # tab.get(file_path)
+    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
+    data = tab.save()
+    # logger.info(f"{type(data)} , {data[:50]}")
+    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
+    return
+    # 附带源信息上传 https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
+    with open(file_path, 'rb') as data:
+        s3.upload_fileobj(
+            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
+            ExtraArgs={
+                'Metadata': {'mykey': 'myvalue'},
+                'ContentType': 'text/html'
+                })
     # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # logger.info(f"{CFG.s3_secret_key}")
     # logger.info(f"{CFG.s3_secret_key}")
-    c.get_asin_and_save_page(
-        asin[0], 
-        'JP',
-        save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
-        overwrite=True
-    )
+    # c.get_asin_and_save_page(
+    #     asin[0], 
+    #     'JP',
+    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
+    #     overwrite=True
+    # )
     # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
     # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
     # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
     # await c.run('https://fr.florame.com/en/essential-oils')
     # await c.run('https://fr.florame.com/en/essential-oils')

+ 47 - 19
utils/file.py

@@ -1,36 +1,52 @@
 import json
 import json
 from pathlib import Path
 from pathlib import Path
+import smart_open
 from smart_open import open
 from smart_open import open
 from botocore.exceptions import NoCredentialsError
 from botocore.exceptions import NoCredentialsError
 import boto3
 import boto3
 from botocore.config import Config
 from botocore.config import Config
 from utils.config import CFG
 from utils.config import CFG
+import mimetypes
 
 
 s3 = boto3.client(
 s3 = boto3.client(
     's3',
     's3',
     aws_access_key_id=CFG.s3_access_key,
     aws_access_key_id=CFG.s3_access_key,
     aws_secret_access_key=CFG.s3_secret_key,
     aws_secret_access_key=CFG.s3_secret_key,
     endpoint_url=CFG.s3_endpoint,
     endpoint_url=CFG.s3_endpoint,
-    config=Config(signature_version='s3v4'),
-    # aws_account_id='ACCOUNT_ID'
+    config=Config(signature_version='s3v4', retries={'mode': 'standard'}),
 )
 )
-response = s3.list_buckets()
+resource = boto3.resource('s3')
 
 
-# Output the bucket names
-print('Existing buckets:')
-for bucket in response['Buckets']:
-    print(f'  {bucket["Name"]}')
-def save_to_file(content, filename:Path):
-    if not isinstance(content, str):
-    # 如果可以用 json 格式化,则格式化
-        try:
-            content = json.dumps(content, indent=4, ensure_ascii=False)
-        except:
-            # 如果不是 str ,则格式化
-            if not isinstance(content, str):
-                content = str(content)
-    
-    with open(filename, "w", encoding="utf-8", transport_params={'client': s3}) as file:
def upload_to_s3(content, filename:str, **extra_args):
    """Upload *content* to the object named by an ``s3://bucket/key`` URI.

    Args:
        content: ``str`` (UTF-8 encoded before upload) or ``bytes`` body.
        filename: full ``s3://bucket/key`` object URI.
        **extra_args: extra keyword arguments forwarded to
            ``put_object`` (e.g. ``Metadata={...}``); a caller-supplied
            ``ContentType`` overrides the guessed one.

    Returns:
        str: *filename*, unchanged, so callers can chain on the URI.
    """
    # 's3://bucket/key/with/slashes' -> bucket, key.
    # partition() keeps every '/' after the first inside the key.
    remainder = filename[len('s3://'):] if filename.startswith('s3://') else filename
    bucket_name, _, object_name = remainder.partition('/')
    # Derive a Content-Type from the key's extension; fall back to a safe binary type.
    content_type, _ = mimetypes.guess_type(object_name)
    upload_args = {
        'ContentType': content_type or 'application/octet-stream',
    }
    upload_args.update(extra_args)
    if isinstance(content, str):
        content = content.encode('utf-8')
    s3.put_object(
        Bucket=bucket_name,
        Key=object_name,
        Body=content,
        **upload_args
    )
    return filename
def save_to_file(content, filename:Path, **extra_args):
    '''
    Save *content* to a local path, or upload it when *filename* is an
    ``s3://`` URI.

    Example::

        save_to_file(
            data,
            's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
            Metadata={'mykey':'myvalue','mykey2':'myvalue2'}
            )

    Args:
        content: str or bytes payload; any other object is serialized as
            JSON (falling back to ``str()``) before a local text write.
        filename: local path (str/Path) or ``s3://bucket/key`` URI.
        **extra_args: forwarded to ``upload_to_s3`` for S3 targets only.

    Returns:
        The path/URI that was written.
    '''
    if str(filename).startswith('s3://'):
        return upload_to_s3(content, str(filename), **extra_args)
    if isinstance(content, bytes):
        # Binary payloads (e.g. MHTML snapshots) must bypass the text-mode handle.
        with open(filename, "wb") as file:
            file.write(content)
        return filename
    if not isinstance(content, str):
        # Keep the previous behavior of pretty-printing dicts/lists as JSON;
        # unserializable objects degrade to their str() form.
        try:
            content = json.dumps(content, indent=4, ensure_ascii=False)
        except (TypeError, ValueError):
            content = str(content)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    return filename
 
 
@@ -41,4 +57,16 @@ def check_exists(file_uri:str):
             return file_uri
             return file_uri
     except (FileNotFoundError,OSError):
     except (FileNotFoundError,OSError):
         # 文件不存在,执行相应的操作
         # 文件不存在,执行相应的操作
-        return False
+        return False
+
def main():
    """Smoke test: print the name of every bucket visible to the configured client."""
    listing = s3.list_buckets()
    print('Existing buckets:')
    for name in (bucket["Name"] for bucket in listing['Buckets']):
        print(f'  {name}')


if __name__ == "__main__":
    main()