|
|
@@ -18,13 +18,15 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
|
from utils.logu import get_logger
|
|
|
from config.settings import OUTPUT_DIR
|
|
|
from utils.drission_page import load_chrome_from_ini,ChromeOptions
|
|
|
-from utils.file import save_to_file
|
|
|
+from utils.file import save_to_file,check_exists
|
|
|
+from utils.config import CFG
|
|
|
+
|
|
|
logger = get_logger('browser')
|
|
|
ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
|
|
|
ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
class Crawler():
|
|
|
def __init__(self, chrome_options:ChromeOptions, storage_config:dict=None):
    """Create a crawler bound to the given browser options.

    Args:
        chrome_options: Browser options kept on the instance as
            ``self.chrome_options``.
        storage_config: Optional storage settings, kept on the instance as
            ``self.storage_config`` (``None`` when not supplied).
    """
    self.chrome_options = chrome_options
    # The parameter used to be accepted and then dropped; keep it so a
    # caller-supplied configuration is not silently lost.
    self.storage_config = storage_config
|
|
|
|
|
|
async def run(self, url:str):
|
|
|
@@ -74,11 +76,15 @@ class Crawler():
|
|
|
return page.save(str(ASIN_HTML_DIR), name=f'{asin}')
|
|
|
else:
|
|
|
return page.html
|
|
|
def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
    """Fetch the data for ``asin`` and store it at ``save_path``.

    Args:
        asin: Product identifier; also used to build the default file name.
        asin_area: Area code forwarded to ``get_asin_url``.
        mthml_type: Kept for backward compatibility. It is not used while
            the ``get_asin_page_data`` call is disabled (see note below).
        save_path: Destination handed to ``check_exists`` and
            ``save_to_file``. Defaults to ``ASIN_HTML_DIR / f'{asin}.html'``.
        overwrite: When False and the destination already exists, nothing
            is fetched or written.

    Returns:
        The destination path when an existing file is reused, otherwise
        whatever ``save_to_file`` returns.
    """
    # Resolve the default destination BEFORE the existence check. Checking
    # first meant ``check_exists(None)`` was called whenever the caller
    # relied on the default path, so the skip-if-exists logic never saw it.
    save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')

    if not overwrite and check_exists(save_path):
        logger.info(f"{save_path} exists")
        return save_path

    # NOTE(review): this stores the result of get_asin_url() instead of
    # get_asin_page_data(asin, asin_area, mthml_type), which the previous
    # revision used. Confirm this is intended and not leftover debugging.
    data = self.get_asin_url(asin, asin_area)
    return save_to_file(data, save_path)
|
|
|
+
|
|
|
async def task():
|
|
|
asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
|
|
|
c = Crawler(ChromeOptions())
|
|
|
@@ -86,7 +92,14 @@ async def task():
|
|
|
logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
|
|
|
tab = page.latest_tab
|
|
|
tab.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
|
|
|
- save_to_file(page.html, str(OUTPUT_DIR / 'page/debug' / f'{asin[0]}-from-mthml.html'))
|
|
|
+ # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
|
+ # logger.info(f"{CFG.s3_secret_key}")
|
|
|
+ c.get_asin_and_save_page(
|
|
|
+ asin[0],
|
|
|
+ 'JP',
|
|
|
+ save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
|
|
|
+ overwrite=True
|
|
|
+ )
|
|
|
# page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
|
|
|
# save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
|
|
|
# await c.run('https://fr.florame.com/en/essential-oils')
|