Просмотр исходного кода

完成基于 schema 的快速提取 json

mrh 1 год назад
Родитель
Сommit
d670490b0d
4 измененных файлов с 293 добавлено и 67 удалено
  1. 115 0
      docs/gpt/crawler_Schema.md
  2. 97 67
      src/browser/crawl_asin.py
  3. 76 0
      tests/mytest/t_crawler.py
  4. 5 0
      utils/file.py

+ 115 - 0
docs/gpt/crawler_Schema.md

@@ -0,0 +1,115 @@
+3. Advanced Schema & Nested Structures
+Real sites often have nested or repeated data—like categories containing products, which themselves have a list of reviews or features. For that, we can define nested or list (and even nested_list) fields.
+
+Sample E-Commerce HTML
+We have a sample e-commerce HTML file on GitHub (example):
+
+https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure without LLM.
+schema = {
+    "name": "E-commerce Product Catalog",
+    "baseSelector": "div.category",
+    # (1) We can define optional baseFields if we want to extract attributes 
+    # from the category container
+    "baseFields": [
+        {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"}, 
+    ],
+    "fields": [
+        {
+            "name": "category_name",
+            "selector": "h2.category-name",
+            "type": "text"
+        },
+        {
+            "name": "products",
+            "selector": "div.product",
+            "type": "nested_list",    # repeated sub-objects
+            "fields": [
+                {
+                    "name": "name",
+                    "selector": "h3.product-name",
+                    "type": "text"
+                },
+                {
+                    "name": "price",
+                    "selector": "p.product-price",
+                    "type": "text"
+                },
+                {
+                    "name": "details",
+                    "selector": "div.product-details",
+                    "type": "nested",  # single sub-object
+                    "fields": [
+                        {
+                            "name": "brand",
+                            "selector": "span.brand",
+                            "type": "text"
+                        },
+                        {
+                            "name": "model",
+                            "selector": "span.model",
+                            "type": "text"
+                        }
+                    ]
+                },
+                {
+                    "name": "features",
+                    "selector": "ul.product-features li",
+                    "type": "list",
+                    "fields": [
+                        {"name": "feature", "type": "text"} 
+                    ]
+                },
+                {
+                    "name": "reviews",
+                    "selector": "div.review",
+                    "type": "nested_list",
+                    "fields": [
+                        {
+                            "name": "reviewer", 
+                            "selector": "span.reviewer", 
+                            "type": "text"
+                        },
+                        {
+                            "name": "rating", 
+                            "selector": "span.rating", 
+                            "type": "text"
+                        },
+                        {
+                            "name": "comment", 
+                            "selector": "p.review-text", 
+                            "type": "text"
+                        }
+                    ]
+                },
+                {
+                    "name": "related_products",
+                    "selector": "ul.related-products li",
+                    "type": "list",
+                    "fields": [
+                        {
+                            "name": "name", 
+                            "selector": "span.related-name", 
+                            "type": "text"
+                        },
+                        {
+                            "name": "price", 
+                            "selector": "span.related-price", 
+                            "type": "text"
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
+Key Takeaways:
+
+Nested vs. List:
+type: "nested" means a single sub-object (like `details`).
+type: "list" means multiple items that are simple dictionaries or single text fields.
+type: "nested_list" means repeated complex objects (like `products` or `reviews`).
+Base Fields: We can extract attributes from the container element via `baseFields`. For instance, `data_cat_id` might be `data-cat-id="elect123"`.
+Transforms: We can also define a `transform` if we want to lower/upper case, strip whitespace, or even run a custom function.
+
+参考上述说明,帮我定义Schema 查找该html表格中每个字段的值,包含超链接。

+ 97 - 67
src/browser/crawl_asin.py

@@ -15,10 +15,15 @@ import httpx
 import ssl
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
 from utils.logu import get_logger
 from config.settings import OUTPUT_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
-from utils.file import save_to_file,check_exists,s3
+from utils.file import save_to_file,check_exists,s3,read_file
 from utils.config import CFG
 
 logger = get_logger('browser')
@@ -28,19 +33,31 @@ ASIN_HTML_DIR.mkdir(parents=True, exist_ok=True)
 class Crawler():
     def __init__(self, chrome_options:ChromeOptions, storage_config:dict=None):
         self.chrome_options = chrome_options
+        self.page = None
+        self.browser_config = {
+            "headless": self.chrome_options.headless,
+            "use_managed_browser": True,
+        }
+        # BrowserConfig(
+            # headless=chrome_options.headless,
+            # verbose=False,
+            # use_managed_browser=True,
+            # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
+        # )   
+    def get(self, url:str):
+        if not self.page:
+            self.page = load_chrome_from_ini(
+                self.chrome_options 
+            )
+        self.page.get(url)
     
     async def run(self, url:str):
         page = load_chrome_from_ini(
             self.chrome_options
         )
         craw_ai_browser_config = BrowserConfig(
-            headless=self.chrome_options.headless,
-            # verbose=False,
-            # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
-            # debugging_port=int(port),
-            use_managed_browser=True,
+            **self.browser_config,
             cdp_url=page.browser._driver._websocket_url
-            # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
         )
         try:
             async with AsyncWebCrawler(config=craw_ai_browser_config) as crawler:
@@ -83,75 +100,88 @@ class Crawler():
         data = self.get_asin_page_data(asin, asin_area, mthml_type)
         save_path = save_path or str(ASIN_HTML_DIR / f'{asin}.html')
         return save_to_file(data, save_path)
+    
+    async def cralw4ai_run(self, uri:str) -> CrawlResult:
+        browser_config = BrowserConfig(
+            **self.browser_config,
+            cdp_url = self.page.browser._driver._websocket_url
+        )
+        schema = {
+            "baseSelector": "table.table tbody tr",  # 每行数据对应一个tr
+            "fields": [
+                {
+                    "name": "traffic_keyword",
+                    "selector": "td:first-child a",  # 关键词文本
+                    "type": "text"
+                },
+                {
+                    "name": "keyword_link",
+                    "selector": "td:first-child a",  # 关键词超链接
+                    "type": "attribute",
+                    "attribute": "href"
+                },
+                {
+                    "name": "monthly_searches",
+                    "selector": "td:nth-child(2) span",  # 搜索量数值
+                    "type": "text",
+                    "transform": lambda x: x.replace(",", "") if x else None  # 移除逗号转数字
+                },
+                {
+                    "name": "search_trend_link",
+                    "selector": "td:nth-child(2) a",  # 搜索量趋势链接(带图表)
+                    "type": "attribute",
+                    "attribute": "href"
+                },
+                {
+                    "name": "weight",
+                    "selector": "td:nth-child(3) i.leaf",  # 统计叶子图标数量
+                    "type": "count"  # 通过计数获取权重值
+                },
+                {
+                    "name": "amazon_search_link",
+                    "selector": "td:last-child a",  # Amazon搜索链接
+                    "type": "attribute",
+                    "attribute": "href"
+                }
+            ]
+        }
         
+        dummy_html = self.page.html
+        raw_url = f"raw://{dummy_html}"
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result:CrawlResult = await crawler.arun(
+                url=raw_url,
+                config=CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS,
+                    extraction_strategy=JsonCssExtractionStrategy(schema,verbose=True)
+                )
+            )
+
+            if not result.success:
+                logger.error(f"Crawl failed: {result.error_message}")
+                return
+            data = json.loads(result.extracted_content)
+            logger.info(f"Extracted {len(data)} coin rows")
+            logger.debug(f"First item: {result.extracted_content}")
+            return data
 async def task():
     asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
     c = Crawler(ChromeOptions())
-    page = c.run_browser()
-    logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
-    tab = page.latest_tab
     file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
     # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
     # tab.get(file_path)
-    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
-    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
+    c.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    res = await c.cralw4ai_run(file_path)
+    # logger.info(f"{res.model_dump()}")
+    # logger.info(f"{json.loads(res.extracted_content)}")
+    # save_to_file(res.model_dump(), OUTPUT_DIR/'page\debug\B0CQ1SHD8V.json')
+    return
+    page = c.run_browser()
+    tab = page.latest_tab
     data = tab.save()
-    # logger.info(f"{type(data)} , {data[:50]}")
+    logger.info(f"{type(data)} , {data[:50]}")
     save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
-    return
-    # 附带源信息上传 https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
-    with open(file_path, 'rb') as data:
-        s3.upload_fileobj(
-            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
-            ExtraArgs={
-                'Metadata': {'mykey': 'myvalue'},
-                'ContentType': 'text/html'
-                })
-    # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
-    # logger.info(f"{CFG.s3_secret_key}")
-    # c.get_asin_and_save_page(
-    #     asin[0], 
-    #     'JP',
-    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
-    #     overwrite=True
-    # )
-    # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
-    # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
-    # await c.run('https://fr.florame.com/en/essential-oils')
-    return
-    port = page.browser._chromium_options._address.split(':')[-1]
-    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
-    logger.info(f"{page.browser._driver._websocket_url}")
-    item_id = 1
-    # url = 'https://greg.app/acalypha-marissima-overview/'
-    url = 'https://fr.florame.com/en/essential-oils'
-    # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
-    # url = 'https://baidu.com'
-    browser_config = BrowserConfig(
-        headless=False,
-        # verbose=False,
-        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
-        # debugging_port=int(port),
-        use_managed_browser=True,
-        cdp_url=page.browser._driver._websocket_url
-        # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
-    )
-    # async with AsyncWebCrawler(config=browser_config) as crawler:
-    #     crawler_config = CrawlerRunConfig(
-    #         cache_mode=CacheMode.BYPASS                
-    #     )
-    #     result = await crawler.arun(url=url, config=crawler_config)
-    #     print(result.markdown)
-
-    crawler = AsyncWebCrawler(config=browser_config)
-    await crawler.start()
-    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
-    result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
-    logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)} ")
-    print(result.markdown)
-    input('press enter to continue')
-    await crawler.close()
-    # page.quit()
+ 
 
 def main():
     asyncio.run(task())

+ 76 - 0
tests/mytest/t_crawler.py

@@ -0,0 +1,76 @@
+
+async def task():
+    asin = ['B0CQ1SHD8V', 'B0B658JC22', 'B0DQ84H883', 'B0D44RT8R8']
+    c = Crawler(ChromeOptions())
+    page = c.run_browser()
+    logger.info(f"{str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html')}")
+    tab = page.latest_tab
+    file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml'
+    # file_path = r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html'
+    # tab.get(file_path)
+    # page.get(r'G:\code\amazone\copywriting_production\output\page\debug\B0CQ1SHD8V.html.mhtml')
+    # save_to_file(tab.save(), str(ASIN_HTML_DIR / 'B0CQ1SHD8V.png'))
+    data = tab.save()
+    # logger.info(f"{type(data)} , {data[:50]}")
+    save_to_file(data, 's3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',Metadata={'mykey':'myvalue','mykey2':'myvalue2'})
+    return
+    # 附带源信息上传 https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter
+    with open(file_path, 'rb') as data:
+        s3.upload_fileobj(
+            data, "public", 'amazone/copywriting_production/output/B0CQ1SHD8V.mhtml',
+            ExtraArgs={
+                'Metadata': {'mykey': 'myvalue'},
+                'ContentType': 'text/html'
+                })
+    # c.get_asin_and_save_page(asin[0], 'JP', save_path=str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
+    # logger.info(f"{CFG.s3_secret_key}")
+    # c.get_asin_and_save_page(
+    #     asin[0], 
+    #     'JP',
+    #     save_path='s3://public/amazone/copywriting_production/output/B0CQ1SHD8V.html',
+    #     overwrite=True
+    # )
+    # page.save(str(ASIN_HTML_DIR), name=f'{asin[0]}.html')
+    # save_to_file(page.html, str(ASIN_HTML_DIR / 'B0CQ1SHD8V.html'))
+    # await c.run('https://fr.florame.com/en/essential-oils')
+    return
+    port = page.browser._chromium_options._address.split(':')[-1]
+    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
+    logger.info(f"{page.browser._driver._websocket_url}")
+    item_id = 1
+    # url = 'https://greg.app/acalypha-marissima-overview/'
+    url = 'https://fr.florame.com/en/essential-oils'
+    # url = 'https://repository.arizona.edu/bitstream/10150/550946/1/dp_04_01-04.pdf'
+    # url = 'https://baidu.com'
+    browser_config = BrowserConfig(
+        headless=False,
+        # verbose=False,
+        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+        # debugging_port=int(port),
+        use_managed_browser=True,
+        cdp_url=page.browser._driver._websocket_url
+        # cdp_url='ws://127.0.0.1:9321/devtools/browser/dc75fc3b-352a-4d26-910b-adf5c245e0ce'
+    )
+    # async with AsyncWebCrawler(config=browser_config) as crawler:
+    #     crawler_config = CrawlerRunConfig(
+    #         cache_mode=CacheMode.BYPASS                
+    #     )
+    #     result = await crawler.arun(url=url, config=crawler_config)
+    #     print(result.markdown)
+
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+    result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
+    logger.info(f"{item_id} crawler.arun result.success: {result.model_dump_json(indent=2)} ")
+    print(result.markdown)
+    input('press enter to continue')
+    await crawler.close()
+    # page.quit()
+
+def main():
+    asyncio.run(task())
+    # test()
+
+if __name__ == "__main__":
+    main()

+ 5 - 0
utils/file.py

@@ -50,6 +50,11 @@ def save_to_file(content, filename:Path, **extra_args):
         file.write(content)
     return filename
 
+def read_file(file_uri:str):
+    with open(file_uri, 'r', transport_params={'client': s3}) as f:
+        # 文件存在,继续操作
+        return f.read()
+
 def check_exists(file_uri:str):
     try:
         with open(file_uri, 'r', transport_params={'client': s3}) as f: