1 年間前 · 57b68a12e5
--- a/src/browser/crawl_asin.py
+++ b/src/browser/crawl_asin.py
@@ -18,7 +18,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 
				 from utils.logu import get_logger
			
 
				 from config.settings import OUTPUT_DIR
			
 
				 from utils.drission_page import load_chrome_from_ini,ChromeOptions
			
 
				-from utils.file import save_to_file,check_exists
			
 
				+from utils.file import save_to_file,check_exists,s3
			
 
				 from utils.config import CFG
			
 
				 
			
 
				 logger = get_logger('browser')
			
@@ -73,15 +73,14 @@ class Crawler():
 
				         url = self.get_asin_url(asin, asin_area)
			
 
				         page.get(url)
			
 
				         if mthml_type:
			
 
				-            return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
			
 
				+            return page.save()
			
 
				         else:
			
 
				             return page.html
			
 
				     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
			
 
				+        """Fetch the page for one ASIN and store it, reusing an existing copy when allowed.
			
 
				+
			
 
				+        save_path defaults to ASIN_HTML_DIR / '<asin>.html'. When overwrite is
			
 
				+        false and check_exists() reports the target as present, the path is
			
 
				+        returned without fetching. Otherwise the data from get_asin_page_data()
			
 
				+        is passed to save_to_file() and its return value is returned.
			
 
				+        """
			
 
				+        # Resolve the default target first so the existence check sees a real path, not None
			
 
				+        save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
			
 
				         if not overwrite and check_exists(save_path):
			
 
				             logger.info(f"{save_path} exists")
			
 
				             return save_path
			
 
				-        # data = self.get_asin_page_data(asin, asin_area, mthml_type)
			
 
				-        data = self.get_asin_url(asin, asin_area)
			
 
				+        data = self.get_asin_page_data(asin, asin_area, mthml_type)
			
 
				-        save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
			
 
				         return save_to_file(data, save_path)
			
 
				         
			
@@ -91,15 +90,31 @@ async def task():
 
				     page = c.run_browser()
			
 
				     logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
			
 
				     tab = page.latest_tab
			
 
				-    tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
			
 
				+    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
			
 
				+    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
			
 
				+    # tab.get(file_path)
			
 
				+    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
			
 
				+    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
			
 
				+    data = tab.save()
			
 
				+    # logger.info(f"{type(data)} , {data[:50]}")
			
 
				+    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
			
 
				+    return
			
 
				+    # Upload with attached metadata; see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
			
 
				+    with open(file_path, 'rb') as data:
			
 
				+        s3.upload_fileobj(
			
 
				+            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
			
 
				+            ExtraArgs={
			
 
				+                'Metadata': {'mykey': 'myvalue'},
			
 
				+                'ContentType': 'text/html'
			
 
				+                })
			
 
				     # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
			
 
				     # logger.info(f"{CFG.s3_secret_key}")
			
 
				-    c.get_asin_and_save_page(
			
 
				-        asin[0], 
			
 
				-        'JP',
			
 
				-        save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
			
 
				-        overwrite=True
			
 
				-    )
			
 
				+    # c.get_asin_and_save_page(
			
 
				+    #     asin[0], 
			
 
				+    #     'JP',
			
 
				+    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
			
 
				+    #     overwrite=True
			
 
				+    # )
			
 
				     # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
			
 
				     # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
			
 
				     # await c.run('https://fr.florame.com/en/essential-oils')
			
--- a/utils/file.py
+++ b/utils/file.py
@@ -1,36 +1,52 @@
 
				 import json
			
 
				 from pathlib import Path
			
 
				+import smart_open
			
 
				 from smart_open import open
			
 
				 from botocore.exceptions import NoCredentialsError
			
 
				 import boto3
			
 
				 from botocore.config import Config
			
 
				 from utils.config import CFG
			
 
				+import mimetypes
			
 
				 
			
 
				+# Shared S3 client: credentials and endpoint come from CFG, requests are signed with SigV4
			
 
				 s3 = boto3.client(
			
 
				     's3',
			
 
				     aws_access_key_id=CFG.s3_access_key,
			
 
				     aws_secret_access_key=CFG.s3_secret_key,
			
 
				     endpoint_url=CFG.s3_endpoint,
			
 
				-    config=Config(signature_version='s3v4'),
			
 
				-    # aws_account_id='ACCOUNT_ID'
			
 
				+    config=Config(signature_version='s3v4', retries={'mode': 'standard'}),
			
 
				 )
			
 
				-response = s3.list_buckets()
			
 
				+# NOTE(review): built without the CFG credentials/endpoint given to the client above,
			
 
				+# and not referenced in the visible code - confirm it is still needed
			
 
				+resource = boto3.resource('s3')
			
 
				 
			
 
				-# Output the bucket names
			
 
				-print('Existing buckets:')
			
 
				-for bucket in response['Buckets']:
			
 
				-    print(f'  {bucket["Name"]}')
			
 
				-def save_to_file(content, filename:Path):
			
 
				-    if not isinstance(content, str):
			
 
				-    # 如果可以用 json 格式化，则格式化
			
 
				-        try:
			
 
				-            content = json.dumps(content, indent=4, ensure_ascii=False)
			
 
				-        except:
			
 
				-            # 如果不是 str ，则格式化
			
 
				-            if not isinstance(content, str):
			
 
				-                content = str(content)
			
 
				-    
			
 
				-    with open(filename, "w", encoding="utf-8", transport_params={'client': s3}) as file:
			
 
				+def upload_to_s3(content, filename:str, **extra_args):
			
 
				+    '''
			
 
				+    Upload content to the object addressed by an s3://<bucket>/<key> URI.
			
 
				+
			
 
				+    content: str (encoded as UTF-8 before upload) or any value put_object accepts as Body.
			
 
				+    filename: destination URI; the bucket is the first path segment after s3://.
			
 
				+    extra_args: extra put_object keyword arguments (e.g. Metadata); they take
			
 
				+        precedence over the ContentType guessed from the key's extension.
			
 
				+    Returns filename unchanged.
			
 
				+    Raises ValueError when filename is not an s3:// URI with both a bucket and a key.
			
 
				+    '''
			
 
				+    scheme, _, location = filename.partition('://')
			
 
				+    bucket_name, _, object_name = location.partition('/')
			
 
				+    # Fail early with a clear message instead of sending an empty Bucket/Key to S3
			
 
				+    if scheme.lower() != 's3' or not bucket_name or not object_name:
			
 
				+        raise ValueError(f"expected s3://<bucket>/<key>, got {filename!r}")
			
 
				+    content_type, _ = mimetypes.guess_type(object_name)
			
 
				+    upload_args = {
			
 
				+        'ContentType': content_type or 'application/octet-stream',
			
 
				+    }
			
 
				+    # Caller-supplied arguments override the guessed defaults
			
 
				+    upload_args.update(extra_args)
			
 
				+    if isinstance(content, str):
			
 
				+        content = content.encode('utf-8')
			
 
				+    s3.put_object(
			
 
				+        Bucket=bucket_name,
			
 
				+        Key=object_name,
			
 
				+        Body=content,
			
 
				+        **upload_args
			
 
				+    )
			
 
				+    return filename
			
 
				+def save_to_file(content, filename:Path, **extra_args):
			
 
				+    '''
			
 
				+    Write content to filename and return the destination.
			
 
				+
			
 
				+    A destination starting with s3:// is handed to upload_to_s3 together with
			
 
				+    extra_args; any other destination is written through open() and extra_args
			
 
				+    is not used. dict/list/tuple content is JSON-formatted first (str() as a
			
 
				+    fallback); bytes are written in binary mode, text as UTF-8.
			
 
				+
			
 
				+    save_to_file(
			
 
				+        data, 
			
 
				+        's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
			
 
				+        Metadata={'mykey':'myvalue','mykey2':'myvalue2'}
			
 
				+        )
			
 
				+    '''
			
 
				+    if isinstance(content, (dict, list, tuple)):
			
 
				+        # Structured data is stored as readable JSON rather than failing in write()
			
 
				+        try:
			
 
				+            content = json.dumps(content, indent=4, ensure_ascii=False)
			
 
				+        except (TypeError, ValueError):
			
 
				+            content = str(content)
			
 
				+    if str(filename).startswith('s3://'):
			
 
				+        return upload_to_s3(content, str(filename), **extra_args)
			
 
				+    # Text mode cannot take bytes, so pick the mode from the content type
			
 
				+    binary = isinstance(content, (bytes, bytearray))
			
 
				+    with open(filename, "wb" if binary else "w", encoding=None if binary else "utf-8") as file:
			
 
				         file.write(content)
			
 
				     return filename
			
 
				 
			
@@ -41,4 +57,16 @@ def check_exists(file_uri:str):
 
				             return file_uri
			
 
				     except (FileNotFoundError,OSError):
			
 
				         # 文件不存在，执行相应的操作
			
 
				-        return False
			
 
				+        return False
			
 
				+
			
 
				+def main():
			
 
				+    '''Print a heading followed by the name of every bucket s3.list_buckets() reports.'''
			
 
				+    listing = s3.list_buckets()
			
 
				+
			
 
				+    # Heading first, then one indented line per bucket
			
 
				+    print('Existing buckets:')
			
 
				+    for name in (entry["Name"] for entry in listing['Buckets']):
			
 
				+        print(f'  {name}')
			
 
				+
			
 
				+
			
 
				+# Run the bucket listing only when this file is executed as a script
			
 
				+if __name__ == "__main__":
			
 
				+    main()