@@ -21,7 +21,8 @@ from crawl4ai.content_filter_strategy import BM25ContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 import base64
 from utils.logu import get_logger
-from config.settings import OUTPUT_DIR
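+# TEMP_PAGE_DIR now comes from config.settings instead of being derived from OUTPUT_DIR below.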
+from config.settings import OUTPUT_DIR, TEMP_PAGE_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
 from utils.file import save_to_file,check_exists,s3,read_file
 from config.settings import CFG
@@ -29,8 +30,6 @@ from config.settings import CFG
 logger = get_logger('browser')
 ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
 ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
-TEMP_PAGE_DIR = OUTPUT_DIR / 'page' / 'temp'
-TEMP_PAGE_DIR.mkdir(parents=True, exist_ok=True)
 
 class Crawler():
     def __init__(self, chrome_options:ChromeOptions):
@@ -49,7 +48,7 @@ class Crawler():
         self.browser_config.update({
             "cdp_url": self.page.browser._driver._websocket_url
         })
-
+        logger.info(f"get {url}, browser_config: {self.browser_config}")
     async def run(self, url:str):
         page = load_chrome_from_ini(
             self.chrome_options
@@ -94,16 +93,23 @@ class Crawler():
         return page.html
     def get_asin_and_save_page(self, asin:str, asin_area:str='JP', mthml_type:bool=True, save_path:str=None, overwrite:bool=False):
         if not overwrite and check_exists(save_path):
-            logger.info(f"{save_path} exists")
+            logger.info(f"exists: {save_path}")
             return save_path
         data = self.get_asin_page_data(asin, asin_area, mthml_type)
-        save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
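+        # Match the default extension to the capture format: .mhtml for MHTML snapshots, .html otherwise.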
+        save_path = save_path or str(ASIN_HTML_DIR / f'{asin}{".mhtml" if mthml_type else ".html"}')
         return save_to_file(data, save_path)
 
     async def excra_strategy_raw_html(self, raw_html:str, schema:dict, strategy:ExtractionStrategy=JsonXPathExtractionStrategy):
         browser_config = BrowserConfig(
-            **self.browser_config,
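+            # Attach to the already-running DrissionPage Chrome over its CDP endpoint instead of unpacking self.browser_config.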
+            headless=self.chrome_options.headless,
+            use_managed_browser=True,
+            cdp_url=self.page.browser._driver._websocket_url
         )
+        logger.info(f"browser_config: {self.browser_config}")
+        logger.info(f"raw_html: len={len(raw_html)}, type={type(raw_html)}, head={raw_html[:150]}")
         async with AsyncWebCrawler(config=browser_config) as crawler:
             result:CrawlResult = await crawler.arun(
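+                # The raw:// scheme has crawl4ai parse the given HTML string directly, with no network fetch.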
                 url=f"raw://{raw_html}",
@@ -162,6 +168,7 @@ class Crawler():
         data = json.loads(result.extracted_content)
         logger.info(f"Extracted {len(data)} coin rows")
         logger.debug(f"First item: {result.extracted_content}")
+        # Extracted shape: [{"traffic_keyword": "...", "keyword_link": "..."}, ...]
         return data
 
     async def excra_product_info(self, html:str, input_schema:dict={}, strategy:ExtractionStrategy=JsonXPathExtractionStrategy) -> CrawlResult:
@@ -205,7 +212,16 @@ class Crawler():
                 }
             ]
         }
-        return await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
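+        # Parse and sanity-check the CrawlResult here so callers get plain row data (or None on failure).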
+        result:CrawlResult = await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
+        if not result.success:
+            logger.error(f"Crawl failed: {result.error_message}")
+            return
+        data = json.loads(result.extracted_content)
+        logger.info(f"Extracted {len(data)} product rows")
+        logger.debug(f"result.extracted_content: {result.extracted_content}")
+        return data
+
 
     def download_img(self,url:str,save_dir:str=TEMP_PAGE_DIR, page:str=None,as_img_base64:bool=True, upload_s3_dir:str=''):
         # ('success', '{abs_current_path}\\notice.svg')
@@ -237,10 +253,10 @@ async def task():
     # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
     # tab.get(file_path)
     c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
-    res = c.download_img(
-        'https://www.asinseed.com/assets/svg/flat-icons/notice.svg?v=20181122',
-        upload_s3_dir='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V/')
-    logger.info(f"{res}")
+    # res = c.download_img(
+    #     'https://www.asinseed.com/assets/svg/flat-icons/notice.svg?v=20181122',
+    #     upload_s3_dir='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V/')
+    # logger.info(f"{res}")
     # logger.info(f"{res.extracted_content}")
 
     # res = await c.cralw4ai_run(file_path)
|