|
|
@@ -19,31 +19,27 @@ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
|
|
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
|
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
|
-
|
|
|
+import base64
|
|
|
from utils.logu import get_logger
|
|
|
from config.settings import OUTPUT_DIR
|
|
|
from utils.drission_page import load_chrome_from_ini,ChromeOptions
|
|
|
from utils.file import save_to_file,check_exists,s3,read_file
|
|
|
-from utils.config import CFG
|
|
|
+from config.settings import CFG
|
|
|
|
|
|
# Module-level logger for all browser/crawler activity in this file.
logger = get_logger('browser')

# Cache directory for fetched ASIN product pages (HTML snapshots).
ASIN_HTML_DIR = OUTPUT_DIR / 'page' / 'asin'
ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)

# Scratch directory for transient downloads (e.g. images pulled by
# Crawler.download_img before they are base64-encoded or pushed to s3).
TEMP_PAGE_DIR = OUTPUT_DIR / 'page' / 'temp'
TEMP_PAGE_DIR.mkdir(parents=True, exist_ok=True)
|
|
class Crawler():

    def __init__(self, chrome_options: ChromeOptions):
        """Hold browser launch options; the actual Chrome page is opened lazily.

        Args:
            chrome_options: launch settings for the DrissionPage-managed
                Chrome instance (only ``headless`` is read here).
        """
        self.chrome_options = chrome_options
        # Populated on first get() with a DrissionPage chrome tab.
        self.page = None
        # Settings later handed to crawl4ai's browser configuration.
        self.browser_config = {
            "headless": self.chrome_options.headless,
            "use_managed_browser": True,
        }
|
def get(self, url:str):
|
|
|
if not self.page:
|
|
|
self.page = load_chrome_from_ini(
|
|
|
@@ -211,6 +207,29 @@ class Crawler():
|
|
|
}
|
|
|
return await self.excra_strategy_raw_html(html, schema, JsonCssExtractionStrategy)
|
|
|
|
|
|
+ def download_img(self,url:str,save_dir:str=TEMP_PAGE_DIR, page:str=None,as_img_base64:bool=True, upload_s3_dir:str=''):
|
|
|
+ # ('success', '{abs_current_path}\\notice.svg')
|
|
|
+ p = page or self.page
|
|
|
+ status,path = p.download(url, save_path=save_dir)
|
|
|
+ path_name = Path(path).name
|
|
|
+ ext = Path(path).suffix
|
|
|
+ if status == 'success':
|
|
|
+ if as_img_base64:
|
|
|
+ with open(path, 'rb') as f:
|
|
|
+ encoded_string = base64.b64encode(f.read()).decode('utf-8')
|
|
|
+ Path(path).unlink()
|
|
|
+ # dataUrl = f"data:image/svg+xml;base64,{encoded_string}"
|
|
|
+ return status,encoded_string
|
|
|
+ if upload_s3_dir:
|
|
|
+ # upload_s3_dir 如果是 / 结尾则去掉
|
|
|
+ if upload_s3_dir.endswith('/'):
|
|
|
+ upload_s3_dir = upload_s3_dir[:-1]
|
|
|
+ save_img_path = upload_s3_dir + f"/{path_name}"
|
|
|
+ with open(path, 'rb') as f:
|
|
|
+ save_to_file(f.read(), save_img_path)
|
|
|
+ Path(path).unlink()
|
|
|
+ return status,save_img_path
|
|
|
+ return status,path
|
|
|
async def task():
|
|
|
asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
|
|
|
c = Crawler(ChromeOptions())
|
|
|
@@ -218,8 +237,11 @@ async def task():
|
|
|
# file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
|
|
|
# tab.get(file_path)
|
|
|
c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
|
|
|
- res = await c.excra_product_info(c.page.html)
|
|
|
- logger.info(f"{json.loads(res.extracted_content)}")
|
|
|
+ res = c.download_img(
|
|
|
+ 'https://www.asinseed.com/assets/svg/flat-icons/notice.svg?v=20181122',
|
|
|
+ upload_s3_dir='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V/')
|
|
|
+ logger.info(f"{res}")
|
|
|
+ # logger.info(f"{res.extracted_content}")
|
|
|
|
|
|
# res = await c.cralw4ai_run(file_path)
|
|
|
# logger.info(f"{res.model_dump()}")
|