|
@@ -18,7 +18,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
|
from utils.logu import get_logger
|
|
from utils.logu import get_logger
|
|
|
from config.settings import OUTPUT_DIR
|
|
from config.settings import OUTPUT_DIR
|
|
|
from utils.drission_page import load_chrome_from_ini,ChromeOptions
|
|
from utils.drission_page import load_chrome_from_ini,ChromeOptions
|
|
|
-from utils.file import save_to_file,check_exists
|
|
|
|
|
|
|
+from utils.file import save_to_file,check_exists,s3
|
|
|
from utils.config import CFG
|
|
from utils.config import CFG
|
|
|
|
|
|
|
|
logger = get_logger('browser')
|
|
logger = get_logger('browser')
|
|
@@ -73,15 +73,14 @@ class Crawler():
|
|
|
url = self.get_asin_url(asin, asin_area)
|
|
url = self.get_asin_url(asin, asin_area)
|
|
|
page.get(url)
|
|
page.get(url)
|
|
|
if mthml_type:
|
|
if mthml_type:
|
|
|
- return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
|
|
|
|
|
|
|
+ return page.save()
|
|
|
else:
|
|
else:
|
|
|
return page.html
|
|
return page.html
|
|
|
def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
    """Fetch the product page for ``asin`` and persist it via ``save_to_file``.

    Args:
        asin: Product identifier passed through to ``get_asin_page_data``.
        asin_area: Marketplace/area code forwarded to ``get_asin_page_data``.
        mthml_type: Forwarded to ``get_asin_page_data`` to select the page
            representation it returns.
        save_path: Destination handed to ``check_exists`` / ``save_to_file``.
            When falsy, defaults to ``ASIN_HTML_DIR / f'{asin}.html'``.
        overwrite: When False and the destination already exists, the page
            is not fetched again and the existing path is returned.

    Returns:
        The existing ``save_path`` when the fetch is skipped, otherwise
        whatever ``save_to_file`` returns.
    """
    # Resolve the default destination *before* the existence check; the
    # original checked first, so an omitted save_path was tested as None and
    # an already-saved default file was never detected.
    save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')

    if not overwrite and check_exists(save_path):
        logger.info(f"{save_path} exists")
        return save_path

    data = self.get_asin_page_data(asin, asin_area, mthml_type)
    return save_to_file(data, save_path)
|
|
|
|
|
|
|
@@ -91,15 +90,31 @@ async def task():
|
|
|
page = c.run_browser()
|
|
page = c.run_browser()
|
|
|
logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
|
|
logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
|
|
|
tab = page.latest_tab
|
|
tab = page.latest_tab
|
|
|
- tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
|
|
|
|
|
|
|
+ file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
|
|
|
|
|
+ # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
|
|
|
|
|
+ # tab.get(file_path)
|
|
|
|
|
+ # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
|
|
|
|
|
+ # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
|
|
|
|
|
+ data = tab.save()
|
|
|
|
|
+ # logger.info(f"{type(data)} , {data[:50]}")
|
|
|
|
|
+ save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
|
|
|
|
|
+ return
|
|
|
|
|
+ # Upload with metadata attached (boto3 ExtraArgs): https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
|
|
|
|
|
+ with open(file_path, 'rb') as data:
|
|
|
|
|
+ s3.upload_fileobj(
|
|
|
|
|
+ data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
|
|
|
|
|
+ ExtraArgs={
|
|
|
|
|
+ 'Metadata': {'mykey': 'myvalue'},
|
|
|
|
|
+ 'ContentType': 'text/html'
|
|
|
|
|
+ })
|
|
|
# c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
# c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
|
# logger.info(f"{CFG.s3_secret_key}")
|
|
# logger.info(f"{CFG.s3_secret_key}")
|
|
|
- c.get_asin_and_save_page(
|
|
|
|
|
- asin[0],
|
|
|
|
|
- 'JP',
|
|
|
|
|
- save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
|
|
|
|
|
- overwrite=True
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # c.get_asin_and_save_page(
|
|
|
|
|
+ # asin[0],
|
|
|
|
|
+ # 'JP',
|
|
|
|
|
+ # save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
|
|
|
|
|
+ # overwrite=True
|
|
|
|
|
+ # )
|
|
|
# page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
|
|
# page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
|
|
|
# save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
# save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
|
# await c.run('https://fr.florame.com/en/essential-oils')
|
|
# await c.run('https://fr.florame.com/en/essential-oils')
|